def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file found in a PDF into ``extractdir``.

    Every object id listed by the document's cross-reference tables is
    resolved; each dict object whose ``Type`` is ``LITERAL_FILESPEC`` has
    its embedded stream written to ``extractdir``.

    Args:
        outfp, objids, pagenos, dumpall, codec: Accepted but not used by
            this function (the signature is the same as ``dumppdf``'s).
        fname: Path of the PDF file to read.
        password: Passed to ``PDFDocument``.
        extractdir: Directory receiving the extracted files. Required in
            practice: the value is handed straight to ``os.path.join``.

    Raises:
        PDFValueError: The file specification does not reference a
            ``PDFStream`` whose ``Type`` is ``LITERAL_EMBEDDEDFILE``.
        IOError: The destination file already exists.
    """
    def extract1(obj):
        # Use .get() for 'UF' so a file specification that only carries 'F'
        # falls back to it instead of raising KeyError.  basename() drops
        # any directory part of the stored name.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream'
                % (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite something that is already on disk.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the handle even when get_data() or write() raises.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # Keep the input open for the whole walk (extract1 resolves objects
    # through ``doc``) and close it on every exit path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF file to ``outfp``.

    Args:
        outfp: Writable text stream that receives the dump.
        fname: Path of the PDF file to read.
        objids: Object ids to dump with ``dumpxml``; may be empty.
        pagenos: Container of zero-based page indexes to dump; may be empty.
        password: Passed to ``PDFDocument``.
        dumpall: When true, ``dumpallobjs`` is called for the document.
        codec: Forwarded to ``dumpxml`` / ``dumpallobjs``. For selected
            pages a truthy codec dumps the page's content streams, a falsy
            one dumps the page attributes instead.
        extractdir: Accepted but not used by this function.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is set, the
    document trailers are dumped via ``dumptrailers``.  A trailing newline
    is written unless ``codec`` is ``'raw'`` or ``'binary'``.
    """
    # The context manager closes the input even when parsing or dumping
    # raises; previously the handle was closed only on the success path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        # Nothing was selected explicitly: fall back to the trailers.
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def extractembedded(fname, password='', extractdir=None, emailsDir=None):
    """Extract the embedded files of ``emailsDir/fname`` into ``extractdir``.

    Each extracted file is named ``"<fname without extension> <embedded
    name>"``.  When that name is already taken, a numeric suffix is
    inserted between the two parts (``"<stem> <n> <embedded name>"``).

    Args:
        fname: File name of the PDF, relative to ``emailsDir``.
        password: Passed to ``PDFDocument``.
        extractdir: Directory receiving the extracted files. Required in
            practice: the value is handed straight to ``os.path.join``.
        emailsDir: Directory containing ``fname``. Required in practice for
            the same reason.

    Raises:
        PDFValueError: A file specification does not reference a
            ``PDFStream`` whose ``Type`` is ``LITERAL_EMBEDDEDFILE``.
    """
    def extract1(obj):
        # Only 'F' is used for the stored name; 'UF' is not consulted.
        filename = os.path.basename(obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream'
                % (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        file_name = os.path.splitext(fname)[0]
        path = os.path.join(extractdir, file_name + " " + filename)
        # Count upwards until a free name is found.  Drawing the suffix from
        # randint(1, 100) could loop forever once all 100 candidates exist.
        suffix = 0
        while os.path.exists(path):
            suffix += 1
            path = os.path.join(
                extractdir, file_name + " " + str(suffix) + " " + filename)
            # Message text is left unchanged so existing log consumers
            # still match it; sys.stderr.write works on Python 2 and 3,
            # unlike the ``print >>`` statement.
            sys.stderr.write("file exists, create random name %s\n" % path)
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the handle even when get_data() or write() raises.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # Keep the input open for the whole walk (extract1 resolves objects
    # through ``doc``) and close it on every exit path.
    with open(os.path.join(emailsDir, fname), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            # Exact type match on purpose: isinstance() would also accept
            # subclasses of PDFXRef, and the original note here says the
            # fallback xref type must be ignored.
            if type(xref) is not PDFXRef:
                continue
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
# Dump every stream object of the PDF named on the command line into
# "<file_name>.pdfminer_out/", one "pdf_<object id>_0.dat" file per stream.
cmd_args = virtual_environment(parser)

# These imports stay after virtual_environment(), as in the original.
# NOTE(review): presumably that call is what makes pdfminer3 importable;
# confirm before moving them to the top of the file.
from pdfminer3.pdfdocument import PDFDocument
from pdfminer3.pdftypes import PDFObjectNotFound
from pdfminer3.pdfparser import PDFParser, PDFStream

print(cmd_args.file_name)
out_dir = '%s.pdfminer_out' % cmd_args.file_name

# The context manager closes the input PDF even if parsing or writing fails.
with open(cmd_args.file_name, "rb") as input_file:
    parsed = PDFDocument(PDFParser(input_file))

    # Start from an empty output directory; a missing one is not an error.
    try:
        shutil.rmtree(out_dir)
    except FileNotFoundError:
        pass
    os.mkdir(out_dir)

    # The same object id can be listed by several xref tables, so collect
    # the ids into a set and visit each one once.
    obj_ids = {
        obj_id
        for xref in parsed.xrefs
        for obj_id in xref.get_objids()
    }
    for obj_id in obj_ids:
        try:
            obj = parsed.getobj(obj_id)
        except PDFObjectNotFound:
            continue
        # Only stream objects are written out.
        if not isinstance(obj, PDFStream):
            continue
        print('%s' % obj)
        # decode() is called before obj.data is read, as in the original.
        obj.decode()
        with open('%s/pdf_%07d_0.dat' % (out_dir, obj_id), 'wb') as output_file:
            output_file.write(obj.data)

# ``tic`` is set earlier in the script, outside this section.
toc = time()
print('\nExecution time: %s sec.' % (toc - tic))