def merge_pdf(inputs):
    """Merge several PDFs into one temporary output file.

    Every input is first normalised with ``to_stream``.  The merge is tried
    with ``simple_merge``; if that raises, each input is rewritten on its own
    through ``pdftk_merge`` ("shaved") and ``simple_merge`` is retried on the
    rewritten files, with a single ``pdftk_merge`` over all of them as the
    last resort.  The per-input temporary files are removed afterwards, also
    when the last resort fails.

    :param inputs: iterable of items accepted by ``to_stream``.
    :returns: ``None`` when ``inputs`` is empty.

    NOTE(review): no value is returned after a merge in the code visible
    here, although the ``__main__`` block calls ``.getvalue()`` on the
    result -- confirm whether a trailing ``return`` was lost.
    """
    inputs = [to_stream(inp) for inp in inputs]
    assert all(inputs), 'No inputs? %r' % inputs
    if not inputs:
        return None
    with mktempfn('-M.pdf', 'm-') as out_fn:
        try:
            simple_merge(out_fn, inputs)
        except Exception as err:
            LOG.error('simple merge cannot merge %s: %s', inputs, err)
            shaved = []
            try:
                # Rewrite every input separately; delete_asap=False keeps the
                # rewritten file around after its ``with`` block ends.
                for inp in inputs:
                    with mktempfn('-shaved.pdf', delete_asap=False) as shvd:
                        shaved.append(pdftk_merge(shvd, [inp]))
                try:
                    simple_merge(out_fn, [to_stream(fn) for fn in shaved])
                except Exception as shaved_err:
                    LOG.error('simple merge cannot merge shaved %s: %s',
                              shaved, shaved_err)
                    pdftk_merge(out_fn, shaved)
            finally:
                # Explicit loop: map() is lazy on Python 3 and would never
                # call unlink when used only for its side effects.
                for shaved_fn in shaved:
                    unlink(shaved_fn)
def to_filename(stream, opened_ok=True):
    """Return a filesystem path whose file holds the content of ``stream``.

    :param stream: one of
        * a string naming an existing file -- returned unchanged;
        * any other string -- treated as raw content and written to a new
          temporary file;
        * a file-like object with ``read`` -- copied to a new temporary file
          in 64 KiB chunks (rewound first when it supports ``seek``).
    :param opened_ok: when true, a file-like object whose ``name`` attribute
        points at an existing file is answered with that name instead of
        being copied.
    :returns: the path as a string.  Temporary files are created with
        ``delete_asap=False``.
    """
    if isinstance(stream, basestring):
        if file_exists(stream):
            return stream
        with mktempfn('-TF.pdf', 'tof-', delete_asap=False) as fn:
            # Close the handle explicitly so the content is flushed to disk
            # before the path is handed to the caller.
            with open(fn, 'wb') as fh:
                fh.write(stream)
            return fn
    if opened_ok and hasattr(stream, 'name') and file_exists(stream.name):
        return stream.name
    if hasattr(stream, 'seek'):
        stream.seek(0, 0)
    else:
        LOG.warning('no seek on %r!', stream)
    with mktempfn('-TF.pdf', 'tof-', delete_asap=False) as fn:
        with open(fn, 'wb') as fh:
            while True:
                chunk = stream.read(65536)
                if not chunk:
                    break
                fh.write(chunk)
        return fn
def to_stream(stream):
    """Return a seekable stream for ``stream``, positioned at its start.

    :param stream: an object that already has ``seek`` (used as is), the
        name of an existing file (opened through ``MMFile``), or raw content
        (written to a temporary file that is then opened through ``MMFile``
        with ``delete_on_close=True``).
    :returns: the seekable object, rewound to offset 0.  A warning is logged
        when it turns out to be empty.
    """
    if not hasattr(stream, 'seek'):
        if file_exists(stream):
            stream = MMFile(stream, 'rb', delete_on_close=False)
        else:
            with mktempfn('-TS.pdf', 'tos-') as fn:
                # Close the writer before MMFile reopens the file, so every
                # byte is on disk when it is read back.
                with open(fn, 'wb') as fh:
                    fh.write(stream)
                stream = MMFile(fn, 'rb', delete_on_close=True)
    # Seek to the end to learn the size, then rewind for the caller.
    stream.seek(0, 2)
    if stream.tell() == 0:
        LOG.warning('zero pdf !? (%r)', repr(stream)[:100])
    stream.seek(0, 0)
    return stream
def simple_split(stream):
    """Yield one single-page PDF per page of ``stream``.

    All pages are collected from ``get_pages`` first.  Each page is then
    written to its own temporary file, with the write wrapped by
    ``timeouter(5, threaded=False)``, and handed out as an ``MMFile`` opened
    with ``delete_on_close=True``.
    """
    label = repr(stream)[:100]

    def _single_page_write(page):
        # Build a one-page document and return its guarded write callable.
        doc = PdfFileWriter()
        doc.addPage(page)
        return timeouter(5, threaded=False)(doc.write)

    for current in list(get_pages(stream)):
        guarded_write = _single_page_write(current)
        with mktempfn('-SS.pdf', 'ss-') as tmp_path:
            with open(tmp_path, 'wb') as sink:
                guarded_write(sink)
            LOG.debug('split_pdf(%s) yields %s', label, tmp_path)
            yield MMFile(tmp_path, 'rb', delete_on_close=True)
def get_pages(stream):
    """Yield the pages of a PDF one at a time.

    :param stream: a seekable file-like object, or raw PDF content (anything
        without ``seek``), which is first written to a temporary file and
        reopened through ``MMFile``.
    :returns: generator over ``PdfFileReader(stream).pages``.  Nothing is
        yielded, and a warning is logged, when the stream is empty.
    """
    stream_s = repr(stream)[:100]
    from pyPdf import PdfFileReader
    if not hasattr(stream, 'seek'):
        with mktempfn('-GP.pdf', 'gp-') as fn:
            with open(fn, 'wb') as fh:
                fh.write(stream)
            stream = MMFile(fn, 'rb', delete_on_close=True)
    # Seek to the end to learn the size; an empty stream has no pages.
    stream.seek(0, 2)
    if stream.tell() == 0:
        LOG.warning('zero pdf !? (%r)', stream_s)
        return
    stream.seek(0, 0)
    # Track the count separately: the loop variable is unbound when the
    # reader has no pages, and its last value is an index, not a count.
    count = 0
    for i, page in enumerate(PdfFileReader(stream).pages):
        LOG.debug('get_pages(%s) yields %d (%s)', stream_s, i,
                  repr(page)[:100])
        count = i + 1
        yield page
    LOG.info('%r is %d pages', stream_s, count)
def clean_pdf(pdffn):
    """Round-trip a PDF through PostScript and return the resulting name.

    ``pdftops`` is called with a temporary ``-pp.ps`` name and the PDF; its
    return value is then passed to ``pstopdf`` together with the original
    PDF name.  Whatever ``pstopdf`` returns is the result.
    """
    LOG.info('cleaning %r', pdffn)
    result = pdffn
    with mktempfn('-pp.ps') as scratch_ps:
        postscript = pdftops(scratch_ps, [pdffn])
        result = pstopdf(pdffn, [postscript])
    return result
# NOTE(review): a stray copy of the tail of to_filename() (an ``else:`` branch
# followed by the chunked temp-file copy loop ending in ``return fn``) stood
# in front of this block.  It had no enclosing ``if``/``def`` and is not valid
# at module level, so it is not reproduced here.
if '__main__' == __name__:
    # Command-line entry point:
    #   merge FILE [FILE ...]  -> merged PDF on stdout
    #   split FILE             -> zip archive of single-page PDFs on stdout
    logging.basicConfig(level=logging.DEBUG)
    if len(sys.argv) < 3 or sys.argv[1] not in ('merge', 'split'):
        sys.exit('usage: %s merge|split FILE [FILE ...]' % sys.argv[0])
    todo, args = sys.argv[1], sys.argv[2:]
    # The payload is binary; use the byte-level stdout where one exists
    # (Python 3) and plain sys.stdout otherwise (Python 2).
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if 'merge' == todo:
        # NOTE(review): relies on merge_pdf() returning an object with
        # getvalue() -- confirm against merge_pdf.
        stdout.write(merge_pdf(args).getvalue())
    elif 'split' == todo:
        import shutil
        import zipfile
        with mktempfn('.zip') as zfn:
            archive = zipfile.ZipFile(zfn, 'w', zipfile.ZIP_DEFLATED)
            try:
                # NOTE(review): split_pdf is not defined in the part of the
                # file reviewed here -- confirm it exists.
                for fh in split_pdf(args[0]):
                    archive.writestr(os.path.basename(fh.name),
                                     fh.getvalue())
            finally:
                # Always finalise the archive, also when splitting fails.
                archive.close()
            with open(zfn, 'rb') as zip_in:
                shutil.copyfileobj(zip_in, stdout, 65536)