def pdf_compress(data):
    """Try to shrink a PDF with ghostscript's ``ps2pdf14``.

    data: the raw PDF content; it is written to a temporary ``.pdf`` file
        and handed to the external ``ps2pdf14`` command.

    Returns the compressed content when the command succeeds, the output
    passes ``check_file_type(..., 'PDF document')``, is at least
    ``ukconfig.FILE_SIZE_MINIMUM`` long and is strictly smaller than the
    input.  In every other case the original ``data`` is returned unchanged.
    """
    temp_paths = []
    try:
        src = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_paths.append(src.name)
        try:
            src.write(data)
        finally:
            src.close()

        # Only the name of the output file is needed; ps2pdf14 writes to it.
        dst = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_paths.append(dst.name)
        dst.close()

        ret = os.system('ps2pdf14 "{0}" "{1}"'.format(src.name, dst.name))
        if ret != 0:
            log_err("Compress: ps2pdf14 failed!")
            newdata = None
        else:
            # PDFs are binary: read in 'rb' mode so no newline translation or
            # text decoding is applied, and close the handle deterministically.
            with open(dst.name, 'rb') as compressed:
                newdata = compressed.read()

        file_succ = (newdata is not None
                     and check_file_type(dst.name, 'PDF document')
                     and len(newdata) >= ukconfig.FILE_SIZE_MINIMUM)
    finally:
        # Remove each temp file independently so that a failure on one does
        # not leave the other behind; this also runs when a step above raises.
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                pass

    if file_succ and len(newdata) < len(data):
        log_info("Compress succeed: {0}->{1}".format(
            parse_file_size(len(data)), parse_file_size(len(newdata))))
        return newdata
    return data
def requests_download(url, progress_updater, headers=None):
    """Download ``url`` with ``requests`` and return the response body.

    url: address to fetch.
    progress_updater: object receiving ``set_total(size)``, ``update(done)``
        and ``finish(data)`` callbacks while the download proceeds.
    headers: optional HTTP headers forwarded to ``requests.get``.

    When the server sends no ``content-length`` the body is fetched in one
    go.  Otherwise the announced length is validated against
    ``ukconfig.FILE_SIZE_MINIMUM`` / ``ukconfig.FILE_SIZE_MAXIMUM`` and the
    body is streamed in chunks with progress reporting.

    Raises FileCorrupted when the announced length is outside those limits.
    """
    resp = requests.get(url, stream=True, headers=headers)
    total_length = resp.headers.get('content-length')
    if total_length is None:
        data = resp.content
        progress_updater.finish(data)
        return data

    total_length = int(total_length)
    if total_length < ukconfig.FILE_SIZE_MINIMUM:
        raise FileCorrupted("File too small: " + parse_file_size(total_length))
    if total_length > ukconfig.FILE_SIZE_MAXIMUM:
        raise FileCorrupted("File too large: " + parse_file_size(total_length))
    progress_updater.set_total(total_length)

    received = 0
    chunks = []
    # iter_content() defaults to 1-byte chunks; ask for a sensible block size
    # so the loop does not run once per byte.
    for chunk in resp.iter_content(chunk_size=8192):
        received += len(chunk)
        chunks.append(chunk)
        progress_updater.update(received)
    # Join once instead of repeated concatenation (linear, and bytes-safe).
    ret = b"".join(chunks)
    # Report the complete payload, as the no-content-length branch does,
    # rather than only the last chunk.
    progress_updater.finish(ret)
    return ret
def do_compress(data, pid):
    """Store a PDF on the paper record ``pid``, compressed when possible.

    This *must* succeed in adding the pdf: compression is best effort, and
    if ``pdf_compress`` raises, the original ``data`` is stored instead.

    data: raw PDF content.
    pid: value matched against ``_id`` in the 'paper' store.

    Returns the content that was actually stored.
    """
    try:
        data = pdf_compress(data)
    except Exception as err:
        # Deliberately non-fatal, but no longer silent; KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        log_info("Compress failed for pdf {0}, storing original: {1}".format(
            pid, err))

    db = get_mongo('paper')
    db.update({'_id': pid}, {'$set': {'pdf': Binary(data)}})
    log_info("Updated pdf {0}: size={1}".format(
        pid, parse_file_size(len(data))))
    return data
def do_compress(data, pid):
    """Store a PDF on the paper record ``pid``, compressed when possible.

    This *must* succeed in adding the pdf: compression is best effort, and
    if ``pdf_compress`` raises, the original ``data`` is stored instead.

    data: raw PDF content.
    pid: value matched against ``_id`` in the 'paper' store.

    Returns the content that was actually stored.
    """
    try:
        data = pdf_compress(data)
    except Exception as err:
        # Deliberately non-fatal, but no longer silent; KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        log_info("Compress failed for pdf {0}, storing original: {1}".format(
            pid, err))

    db = get_mongo('paper')
    db.update({'_id': pid}, {'$set': {'pdf': Binary(data)}})
    log_info("Updated pdf {0}: size={1}".format(
        pid, parse_file_size(len(data))))
    return data
def direct_download(url, progress_updater, headers=None):
    """Fetch ``url`` using the backend named by ``ukconfig.download_method``.

    url: address to download.
    progress_updater: passed through to the selected download function.
    headers: optional HTTP headers; when omitted, a default set (Host,
        User-Agent from ``ukconfig.USER_AGENT``, Keep-Alive) is built.

    Returns the downloaded data.
    Raises FileCorrupted when the result is shorter than
    ``ukconfig.FILE_SIZE_MINIMUM``.
    """
    log_info("Directly Download with URL {0} ...".format(url))

    if headers is None:
        headers = {
            'Host': urlparse(url).netloc,
            'User-Agent': ukconfig.USER_AGENT,
            'Connection': 'Keep-Alive',
        }

    # the wget backend is for test and cmd tools only
    use_wget = ukconfig.download_method == 'wget'
    fetch = wget_download if use_wget else requests_download
    payload = fetch(url, progress_updater, headers)

    size = len(payload)
    if size < ukconfig.FILE_SIZE_MINIMUM:
        raise FileCorrupted("File too small: " + parse_file_size(size))
    return payload
def set_total(self, size):
    """Log the expected size and remember it on ``self.total``.

    size: number of bytes; formatted with ``parse_file_size`` for the log.
    """
    formatted_size = parse_file_size(size)
    message = "File size is {0}".format(formatted_size)
    log_info(message)
    self.total = size