def extractembedded(outfp, fp, objids, pagenos, password='', dumpall=False, codec=None):
    """Write every embedded file of a PDF below the current directory.

    Every object listed in the document's xref tables is inspected; for each
    dictionary whose ``Type`` is the literal ``Filespec`` the referenced
    ``EmbeddedFile`` stream is written to the file name stored in the spec.

    Args:
        outfp: Not used by this function.
        fp: Binary file object of the PDF to read.
        objids: Not used by this function.
        pagenos: Not used by this function.
        password: Password passed to ``doc.initialize``.
        dumpall: Not used by this function.
        codec: Not used by this function.

    Raises:
        Exception: If a file spec points at something that is not a
            ``PDFStream`` of type ``EmbeddedFile``, or if the stored file
            name resolves to a location outside the current directory.
        OSError: If the target file already exists (``O_EXCL`` is used so
            nothing is ever overwritten) or cannot be created.
    """
    doc = PDFDocument()
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)

    # Joining with '' appends exactly one trailing separator in the
    # platform's own style, so the prefix test below also works on Windows
    # and when the working directory is the filesystem root.
    cwd = os.path.join(os.path.normpath(os.getcwd()), '')

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            obj = doc.getobj(objid)
            if not isinstance(obj, dict):
                continue
            objtype = obj.get('Type', '')
            if not (isinstance(objtype, PSLiteral) and objtype.name == 'Filespec'):
                continue

            # 'UF' may be absent; fall back to 'F' instead of raising
            # KeyError on the first lookup.
            filename = obj.get('UF') or obj['F']
            fileref = obj['EF']['F']
            fileobj = doc.getobj(fileref.objid)
            if not isinstance(fileobj, PDFStream):
                raise Exception(
                    "unable to process PDF: reference for %s is not a PDFStream"
                    % filename)
            if not isinstance(fileobj['Type'], PSLiteral) \
                    or not fileobj['Type'].name == 'EmbeddedFile':
                raise Exception(
                    "unable to process PDF: reference for %s is not an EmbeddedFile"
                    % filename)

            print("extracting", filename)
            absfilename = os.path.normpath(os.path.abspath(filename))
            # The name comes from the PDF itself, so refuse anything that
            # would land outside the current directory.
            if not absfilename.startswith(cwd):
                raise Exception(
                    "filename %s is trying to escape to parent directories."
                    % filename)
            dirname = os.path.dirname(absfilename)
            if not os.path.isdir(dirname):
                os.makedirs(dirname)

            # don't overwrite anything
            fd = os.open(absfilename, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
            # The context manager closes the descriptor even if the write fails.
            with os.fdopen(fd, 'wb') as f:
                f.write(fileobj.get_data())
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids; each one is dumped with ``dumpxml``.
        pagenos: Container of zero-based page indices to dump.  With a
            *codec* the page's content streams are dumped, otherwise the
            page attributes.
        password: Password passed to ``doc.initialize``.
        dumpall: If true, dump every object with ``dumpallobjs``.
        codec: Passed through to the dump helpers; also decides whether a
            trailing newline is written.

    When *objids*, *pagenos* and *dumpall* are all empty/false, the
    document trailers are dumped instead.
    """
    doc = PDFDocument()
    # The context manager closes the file even when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)

        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)

        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)

        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)

        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)

    # No trailing newline for the 'raw' and 'binary' codecs.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids; each one is dumped with ``dumpxml``.
        pagenos: Container of zero-based page indices to dump.  With a
            *codec* the page's content streams are dumped, otherwise the
            page attributes.
        password: Password passed to ``doc.initialize``.
        dumpall: If true, dump every object with ``dumpallobjs``.
        codec: Passed through to the dump helpers; also decides whether a
            trailing newline is written.

    When none of *objids*, *pagenos* or *dumpall* selects anything, the
    document trailers are dumped instead.
    """
    doc = PDFDocument()
    # open() replaces the Python-2-only file() builtin, and the context
    # manager guarantees the handle is released if anything below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)

        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)

        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)

        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)

        nothing_selected = (not objids) and (not pagenos) and (not dumpall)
        if nothing_selected:
            dumptrailers(outfp, doc)

    # No trailing newline for the 'raw' and 'binary' codecs.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump.  A ``PDFStream`` is written
            via ``get_rawdata()`` for codec ``'raw'`` and via ``get_data()``
            for codec ``'binary'``; everything else goes through ``dumpxml``.
        pagenos: Container of zero-based page indices whose page attributes
            are dumped.
        password: Password passed to ``doc.initialize``.
        dumpall: If true, dump every object with ``dumpallobjs``.
        codec: Output codec; see *objids*.  Also decides whether a trailing
            newline is written.

    When *objids*, *pagenos* and *dumpall* are all empty/false, the
    document trailers are dumped instead.
    """
    doc = PDFDocument()
    # open() replaces the Python-2-only file() builtin; the context manager
    # closes the handle even when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(doc, fp)
        doc.initialize(password)

        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                is_stream = isinstance(obj, PDFStream)
                if is_stream and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif is_stream and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)

        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)

        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)

        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)

    # No trailing newline for the 'raw' and 'binary' codecs.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump.  For a ``PDFStream`` the
            result of ``get_rawdata()`` (codec ``'raw'``) or ``get_data()``
            (codec ``'binary'``) is written directly; any other object or
            codec is rendered by ``dumpxml``.
        pagenos: Container of zero-based page indices whose page attributes
            are dumped.
        password: Password passed to ``doc.initialize``.
        dumpall: If true, dump every object with ``dumpallobjs``.
        codec: Output codec; see *objids*.  Also decides whether a trailing
            newline is written.

    If nothing is selected by *objids*, *pagenos* or *dumpall*, the
    document trailers are dumped instead.
    """
    doc = PDFDocument()
    # Use open() rather than the Python-2-only file() builtin, inside a
    # context manager so the handle cannot leak when an exception escapes.
    with open(fname, 'rb') as fp:
        parser = PDFParser(doc, fp)
        doc.initialize(password)

        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                if isinstance(obj, PDFStream) and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif isinstance(obj, PDFStream) and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)

        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)

        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)

        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)

    # No trailing newline for the 'raw' and 'binary' codecs.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def MapFactory(map_path):
    """Build the map object matching the image stored in a PDF.

    The stream at object id ``_PDF_OBJ_INDEX_`` is read and its ``Width`` and
    ``Height`` are compared with the ``WIDTH``/``HEIGHT`` constants of the
    known map classes (A4 to A1, portrait and landscape).

    Args:
        map_path: Path of the PDF file to open.

    Returns:
        An instance of the first map class whose dimensions match, built
        from ``_MakePPMImage(width, height, data)`` and *map_path*; ``None``
        if the file cannot be opened or parsed, the object is not a
        ``PDFStream`` carrying ``Width``, ``Height`` and ``ColorSpace``, or
        no class matches.
    """
    # Factory contract is "None on any failure", so stay broad here, but do
    # not swallow KeyboardInterrupt/SystemExit the way a bare except does.
    try:
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # The file stays open until the image data has been read and the map
    # object built, and is closed on every return path.
    with map_file:
        document = PDFDocument()
        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            return None

        obj = document.getobj(_PDF_OBJ_INDEX_)
        if not obj or not isinstance(obj, PDFStream):
            return None
        for key in ("Width", "Height", "ColorSpace"):
            if key not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        # A declared height of 1 is the "weird" layout: the pixel data and
        # the real height then come from _ProcessWeirdPDF.
        if height == 1:
            data, height = _ProcessWeirdPDF(document)
        else:
            data = obj.get_data()

        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for map_class in candidates:
            if width == map_class.WIDTH and height == map_class.HEIGHT:
                return map_class(_MakePPMImage(width, height, data), map_path)

        return None
def MapFactory(map_path):
    """Build the map object matching the image stored in a PDF.

    Args:
        map_path: Path of the PDF file to open.

    Returns:
        An instance of the first map class (A4 to A1, portrait or landscape)
        whose ``WIDTH``/``HEIGHT`` equal the image's ``Width``/``Height``,
        built from ``_MakePPMImage(width, height, data)`` and *map_path*;
        ``None`` if the file cannot be opened or parsed, the image object is
        missing or incomplete, or no class matches.
    """
    # The factory reports every failure as None; catching Exception keeps
    # that contract without also trapping KeyboardInterrupt/SystemExit.
    try:
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # Keep the file open while the stream is read, close it on every exit.
    with map_file:
        document = PDFDocument()
        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            return None

        # The image object on all IBGE PDFs is indexed
        # at ID 6. We also probe for a few properties.
        obj = document.getobj(6)
        if not obj or not isinstance(obj, PDFStream):
            return None
        if any(key not in obj for key in ("Width", "Height", "ColorSpace")):
            return None

        width = obj["Width"]
        height = obj["Height"]

        known_maps = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for map_class in known_maps:
            if width == map_class.WIDTH and height == map_class.HEIGHT:
                image = _MakePPMImage(width, height, obj.get_data())
                return map_class(image, map_path)

        return None
def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None): doc = PDFDocument() fp = file(fname, 'rb') parser = PDFParser(fp) parser.set_document(doc) doc.set_parser(parser) doc.initialize(password) cwd = os.path.normpath(os.getcwd()) + '/' for xref in doc.xrefs: for objid in xref.get_objids(): obj = doc.getobj(objid) if isinstance(obj, dict): objtype = obj.get('Type', '') if isinstance(objtype, PSLiteral) and objtype.name == 'Filespec': filename = obj['UF'] or obj['F'] fileref = obj['EF']['F'] fileobj = doc.getobj(fileref.objid) if not isinstance(fileobj, PDFStream): raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename)) if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile': raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename)) print "extracting", filename absfilename = os.path.normpath(os.path.abspath(filename)) if not absfilename.startswith(cwd): raise Exception("filename %s is trying to escape to parent directories.." % (filename)) dirname = os.path.dirname(absfilename) if not os.path.isdir(dirname): os.makedirs(dirname) # don't overwrite anything fd = os.open(absfilename, os.O_WRONLY | os.O_CREAT | os.O_EXCL) f = os.fdopen(fd, 'wb') f.write(fileobj.get_data()) f.close()
class PDFMine:
    """Pull links, ``[[comment]]`` text boxes, rich-media videos and
    bookmarks out of a PDF using pdfminer.

    Rectangles are reported as ``{"x", "y", "width", "height"}`` dictionaries
    measured from the top-left corner of the page (see ``_rect``).
    """

    def __init__(self, filename):
        """Open and parse *filename* and count its pages.

        Raises:
            ValueError: If the document reports that it is not extractable.
                (The previous ``raise ()`` only produced an accidental
                ``TypeError`` without a message.)
        """
        self.result = {}
        self.filename = filename
        self.fp = open(filename, "rb")
        try:
            self.parser = PDFParser(self.fp)
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize()
            self.pagecount = self.pgcount()
            print("Page count %i" % self.pagecount)
            if not self.doc.is_extractable:
                print("Oops, error extracting %s" % self.filename)
                raise ValueError(
                    "PDF %s does not allow extraction" % self.filename)
            print("Starting extraction of %s" % self.filename)
        except Exception:
            # Do not leak the file handle when construction fails.
            self.fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.fp.close()

    def pgcount(self):
        """Return the number of pages in the document."""
        return sum(1 for _ in self.doc.get_pages())

    def save_video(self, targetdir):
        """Save every RichMedia asset of the document into *targetdir*.

        Annotations that cannot be processed are skipped.
        """
        for page in self.doc.get_pages():
            if not page.annots:
                continue
            for ref in self.doc.getobj(page.annots.objid):
                annotobj = ref.resolve()
                try:
                    if annotobj["Subtype"].name != 'RichMedia':
                        continue
                    data = annotobj["RichMediaContent"].resolve()
                    dataobj = data["Assets"].resolve()
                    fstream = dataobj["Names"][1].resolve()
                    # The name is read from the PDF; keep only its last
                    # component so it cannot point outside targetdir.
                    filename = os.path.basename(fstream["F"])
                    fdata = fstream['EF']['F'].resolve().get_data()
                    # Binary mode: the payload is raw stream data.
                    with open(os.path.join(targetdir, filename), "wb") as f:
                        f.write(fdata)
                except Exception:
                    # Best effort: a malformed annotation must not stop
                    # the remaining ones from being saved.
                    continue

    def _rect(self, bbox):
        """Convert a bounding box into an HTML-style rectangle.

        Uses ``self.pgbox`` (set by ``parse_pages``) for the page height.
        Returns ``{"x", "y", "width", "height"}`` measured from the top
        left of the page, each value rounded.
        """
        pgbox = self.pgbox
        pgheight = round(abs(pgbox[1] - pgbox[3]))
        left = min(bbox[0], bbox[2])
        right = max(bbox[0], bbox[2])
        bottom = min(bbox[1], bbox[3])
        top = max(bbox[1], bbox[3])
        return {
            "x": round(left),
            # Flip the vertical axis: the input is measured from the bottom.
            "y": pgheight - round(top),
            "width": round(right - left),
            "height": round(top - bottom),
        }

    def _find_objid_pgnum(self, obj):
        """Return the 1-based number of the page whose object equals *obj*,
        or ``False`` if no page matches."""
        for pagenum, page in enumerate(self.doc.get_pages(), 1):
            if self.doc.getobj(page.pageid) == obj:
                return pagenum
        return False

    def parse_pages(self):
        """Return one list of findings (see ``_parse_page``) per page."""
        result = []
        for pagenum, page in enumerate(self.doc.get_pages(), 1):
            # _rect() reads the current page's media box from here.
            self.pgbox = page.mediabox
            print("==== Page %d ====" % pagenum)
            result.append(self._parse_page(page))
        return result

    def _parse_page(self, page):
        """Collect the videos, links and comments of *page*, in that order."""
        result = []
        # Each parser runs exactly once; the video parser used to run twice.
        result.extend(self._parse_video(page))
        result.extend(self._parse_links(page))
        result.extend(self._parse_comments(page))
        return result

    def _parse_comments(self, page):
        """Return ``{"rect", "comment"}`` entries for text boxes containing
        ``[[``; the ``[[`` and ``]]`` markers are stripped from the text."""
        result = []
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            if not isinstance(obj, LTTextBox):
                continue
            txt = obj.get_text()
            if "[[" not in txt:
                continue
            # We've found a comment. If it's on top of a rect, return the
            # rect as the bounding box. Else return just the textbox rect.
            rect = self._rect(self._intersects(layout, obj))
            result.append({
                "rect": rect,
                "comment": txt.replace("]]", "").replace("[[", ""),
            })
        return result

    def _parse_links(self, page):
        """Return ``{"rect", "type", "dest"}`` entries for Link annotations
        that carry an ``A`` entry.

        ``type`` is ``"bookmark"`` when the action has a ``D`` key and
        ``"link"`` otherwise; a ``URI`` key, if present, becomes ``dest``.
        """
        result = []
        if not page.annots:
            return result
        for ref in self.doc.getobj(page.annots.objid):
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'Link' or "A" not in annotobj:
                    continue
                linktype = "link"
                print("Found link")
                action = annotobj["A"].resolve()
                dest = ""
                if 'D' in action:
                    linktype = "bookmark"
                    namesobj = self.doc.catalog["Names"].resolve()
                    destsobj = namesobj["Dests"].resolve()
                    # NOTE(review): this keeps the page number of the LAST
                    # entry that has an objid and never compares against
                    # action['D'] - confirm that is intended.
                    for name in destsobj["Names"]:
                        if hasattr(name[0], "objid"):
                            pg = name[0].resolve()
                            dest = self._find_objid_pgnum(pg)
                if 'URI' in action:
                    dest = action['URI']
                rect = self._rect(annotobj['Rect'])
                result.append({"rect": rect, "type": linktype, "dest": dest})
            except Exception:
                # Skip only the broken annotation; returning here used to
                # drop every remaining link on the page.
                continue
        return result

    def _parse_video(self, page):
        """Return ``{"rect", "type", "filename"}`` entries for RichMedia
        annotations; annotations that cannot be read are skipped."""
        result = []
        if not page.annots:
            return result
        for ref in self.doc.getobj(page.annots.objid):
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'RichMedia':
                    continue
                rect = self._rect(annotobj['Rect'])
                print("Found video")
                data = annotobj["RichMediaContent"].resolve()
                dataobj = data["Assets"].resolve()
                fstream = dataobj["Names"][1].resolve()
                result.append({
                    "rect": rect,
                    "type": "media",
                    "filename": fstream["F"],
                })
            except Exception:
                continue
        return result

    def _intersects(self, layout, obj):
        """Finds if the obj is contained within another object on the page.

        Returns the bounding box of the first other layout object that
        passes the test below, or *obj*'s own bounding box if none does.
        """
        origbbox = obj.bbox
        for otherobj in layout:
            if obj == otherobj:
                continue
            otherbbox = otherobj.bbox
            # NOTE(review): the last comparison is ">=" although strict
            # containment would need "<="; left unchanged - confirm intent.
            if (origbbox[0] >= otherbbox[0]
                    and origbbox[1] >= otherbbox[1]
                    and origbbox[2] <= otherbbox[2]
                    and origbbox[3] >= otherbbox[3]):
                return otherbbox
        return origbbox

    def get_sections(self):
        """We search for 'bookmarks' set in Adobe Acrobat.

        Returns a list of ``{"name": title, "page": n}`` with 1-based page
        numbers; whatever was collected before an error is still returned.
        """
        toc = []
        try:
            for (level, title, dest, a, se) in self.doc.get_outlines():
                if dest:
                    objid = dest[0].objid
                else:
                    objid = a.resolve()["D"][0].objid
                for pagenum, page in enumerate(self.doc.get_pages(), 1):
                    if page.pageid == objid:
                        toc.append({"name": title, "page": pagenum})
        except Exception:
            # Best effort: outlines may be missing or malformed.
            pass
        return toc

    def test(self):
        """Print the parse results, page count and sections for a quick check."""
        print("Starting test on %s" % self.filename)
        result = self.parse_pages()
        print(result)
        print("Found %d pages" % (self.pagecount))
        print(self.get_sections())
class PDFMine:
    """Pull links, ``[[comment]]`` text boxes, rich-media videos and
    bookmarks out of a PDF using pdfminer.

    Rectangles are reported as ``{"x", "y", "width", "height"}`` dictionaries
    measured from the top-left corner of the page (see ``_rect``).
    """

    def __init__(self, filename):
        """Open and parse *filename* and count its pages.

        Raises:
            ValueError: If the document reports that it is not extractable.
                (The previous ``raise()`` only produced an accidental
                ``TypeError`` without a message.)
        """
        self.result = {}
        self.filename = filename
        self.fp = open(filename, "rb")
        try:
            self.parser = PDFParser(self.fp)
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize()
            self.pagecount = self.pgcount()
            print("Page count %i" % self.pagecount)
            if not self.doc.is_extractable:
                print("Oops, error extracting %s" % self.filename)
                raise ValueError(
                    "PDF %s does not allow extraction" % self.filename)
            print("Starting extraction of %s" % self.filename)
        except Exception:
            # Do not leak the file handle when construction fails.
            self.fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.fp.close()

    def pgcount(self):
        """Return the number of pages in the document."""
        return sum(1 for _ in self.doc.get_pages())

    def save_video(self, targetdir):
        """Save every RichMedia asset of the document into *targetdir*.

        Annotations that cannot be processed are skipped.
        """
        for page in self.doc.get_pages():
            if not page.annots:
                continue
            for ref in self.doc.getobj(page.annots.objid):
                annotobj = ref.resolve()
                try:
                    if annotobj["Subtype"].name != 'RichMedia':
                        continue
                    data = annotobj["RichMediaContent"].resolve()
                    dataobj = data["Assets"].resolve()
                    fstream = dataobj["Names"][1].resolve()
                    # The name is read from the PDF; keep only its last
                    # component so it cannot point outside targetdir.
                    filename = os.path.basename(fstream["F"])
                    fdata = fstream['EF']['F'].resolve().get_data()
                    # Binary mode: the payload is raw stream data.
                    with open(os.path.join(targetdir, filename), "wb") as f:
                        f.write(fdata)
                except Exception:
                    # Best effort: a malformed annotation must not stop
                    # the remaining ones from being saved.
                    continue

    def _rect(self, bbox):
        """Convert a bounding box into an HTML-style rectangle.

        Uses ``self.pgbox`` (set by ``parse_pages``) for the page height.
        Returns ``{"x", "y", "width", "height"}`` measured from the top
        left of the page, each value rounded.
        """
        pgbox = self.pgbox
        pgheight = round(abs(pgbox[1] - pgbox[3]))
        left = min(bbox[0], bbox[2])
        right = max(bbox[0], bbox[2])
        bottom = min(bbox[1], bbox[3])
        top = max(bbox[1], bbox[3])
        return {
            "x": round(left),
            # Flip the vertical axis: the input is measured from the bottom.
            "y": pgheight - round(top),
            "width": round(right - left),
            "height": round(top - bottom),
        }

    def _find_objid_pgnum(self, obj):
        """Return the 1-based number of the page whose object equals *obj*,
        or ``False`` if no page matches."""
        for pagenum, page in enumerate(self.doc.get_pages(), 1):
            if self.doc.getobj(page.pageid) == obj:
                return pagenum
        return False

    def parse_pages(self):
        """Return one list of findings (see ``_parse_page``) per page."""
        result = []
        for pagenum, page in enumerate(self.doc.get_pages(), 1):
            # _rect() reads the current page's media box from here.
            self.pgbox = page.mediabox
            print("==== Page %d ====" % pagenum)
            result.append(self._parse_page(page))
        return result

    def _parse_page(self, page):
        """Collect the videos, links and comments of *page*, in that order."""
        result = []
        # Each parser runs exactly once; the video parser used to run twice.
        result.extend(self._parse_video(page))
        result.extend(self._parse_links(page))
        result.extend(self._parse_comments(page))
        return result

    def _parse_comments(self, page):
        """Return ``{"rect", "comment"}`` entries for text boxes containing
        ``[[``; the ``[[`` and ``]]`` markers are stripped from the text."""
        result = []
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            if not isinstance(obj, LTTextBox):
                continue
            txt = obj.get_text()
            if "[[" not in txt:
                continue
            # We've found a comment. If it's on top of a rect, return the
            # rect as the bounding box. Else return just the textbox rect.
            rect = self._rect(self._intersects(layout, obj))
            result.append({
                "rect": rect,
                "comment": txt.replace("]]", "").replace("[[", ""),
            })
        return result

    def _parse_links(self, page):
        """Return ``{"rect", "type", "dest"}`` entries for Link annotations
        that carry an ``A`` entry.

        ``type`` is ``"bookmark"`` when the action has a ``D`` key and
        ``"link"`` otherwise; a ``URI`` key, if present, becomes ``dest``.
        """
        result = []
        if not page.annots:
            return result
        for ref in self.doc.getobj(page.annots.objid):
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'Link' or "A" not in annotobj:
                    continue
                linktype = "link"
                print("Found link")
                action = annotobj["A"].resolve()
                dest = ""
                if 'D' in action:
                    linktype = "bookmark"
                    namesobj = self.doc.catalog["Names"].resolve()
                    destsobj = namesobj["Dests"].resolve()
                    # NOTE(review): this keeps the page number of the LAST
                    # entry that has an objid and never compares against
                    # action['D'] - confirm that is intended.
                    for name in destsobj["Names"]:
                        if hasattr(name[0], "objid"):
                            pg = name[0].resolve()
                            dest = self._find_objid_pgnum(pg)
                if 'URI' in action:
                    dest = action['URI']
                rect = self._rect(annotobj['Rect'])
                result.append({"rect": rect, "type": linktype, "dest": dest})
            except Exception:
                # Skip only the broken annotation; returning here used to
                # drop every remaining link on the page.
                continue
        return result

    def _parse_video(self, page):
        """Return ``{"rect", "type", "filename"}`` entries for RichMedia
        annotations; annotations that cannot be read are skipped."""
        result = []
        if not page.annots:
            return result
        for ref in self.doc.getobj(page.annots.objid):
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'RichMedia':
                    continue
                rect = self._rect(annotobj['Rect'])
                print("Found video")
                data = annotobj["RichMediaContent"].resolve()
                dataobj = data["Assets"].resolve()
                fstream = dataobj["Names"][1].resolve()
                result.append({
                    "rect": rect,
                    "type": "media",
                    "filename": fstream["F"],
                })
            except Exception:
                continue
        return result

    def _intersects(self, layout, obj):
        """Finds if the obj is contained within another object on the page.

        Returns the bounding box of the first other layout object that
        passes the test below, or *obj*'s own bounding box if none does.
        """
        origbbox = obj.bbox
        for otherobj in layout:
            if obj == otherobj:
                continue
            otherbbox = otherobj.bbox
            # NOTE(review): the last comparison is ">=" although strict
            # containment would need "<="; left unchanged - confirm intent.
            if (origbbox[0] >= otherbbox[0]
                    and origbbox[1] >= otherbbox[1]
                    and origbbox[2] <= otherbbox[2]
                    and origbbox[3] >= otherbbox[3]):
                return otherbbox
        return origbbox

    def get_sections(self):
        """We search for 'bookmarks' set in Adobe Acrobat.

        Returns a list of ``{"name": title, "page": n}`` with 1-based page
        numbers; whatever was collected before an error is still returned.
        """
        toc = []
        try:
            for (level, title, dest, a, se) in self.doc.get_outlines():
                if dest:
                    objid = dest[0].objid
                else:
                    objid = a.resolve()["D"][0].objid
                for pagenum, page in enumerate(self.doc.get_pages(), 1):
                    if page.pageid == objid:
                        toc.append({"name": title, "page": pagenum})
        except Exception:
            # Best effort: outlines may be missing or malformed.
            pass
        return toc

    def test(self):
        """Print the parse results, page count and sections for a quick check."""
        print("Starting test on %s" % self.filename)
        result = self.parse_pages()
        print(result)
        print("Found %d pages" % (self.pagecount))
        print(self.get_sections())
pr = obj["P"] elif obj.get("Type") and obj["Type"].name == "Annot": pages.append(objid) try: pi = pages.index(pr.objid) + 1 except: pi = -1 print(objid, pi, obj["Subj"], obj["T"], obj["Contents"]) fp = open("simple1.pdf", "rb") parser = PDFParser(fp) doc = PDFDocument() parser.set_document(doc) doc.set_parser(parser) doc.initialize(pdf_pwd) visited = set() for xref in doc.xrefs: for objid in xref.get_objids(): if objid in visited: continue visited.add(objid) try: obj = doc.getobj(objid) if obj is None: continue extract(objid, obj) print("oldu.") except: print(sys.stderr, "not found: %r")
def cargoDoc(manifestPath=r"C:\Users\ssleep\Documents\Programming\Cargo Docker\Thursday\LCBO\601331975 PARS MANIFESTS.pdf"):
    """Read a PARS manifest PDF and the form fields of ``specificPath``.

    First every object of the manifest PDF is passed to ``extract``; then the
    form fields of the PDF at the module-level ``specificPath`` are read, one
    record per index in ``range(len(fields) - 15)``.

    Args:
        manifestPath: Path of the manifest PDF.  Defaults to the location
            that used to be hard-coded, so existing calls are unaffected.

    NOTE(review): the values gathered per record (container number, weight,
    consignee, ...) are not used anywhere in the code visible here; the
    function looks unfinished or continues beyond this excerpt.
    """
    # Context managers close both PDFs; neither handle was closed before.
    with open(manifestPath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        visited = set()
        pars = []
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                # NOTE(review): overwritten on every object, so only the
                # result for the last one survives - confirm intent.
                pars = extract(objid, obj)

    with open(specificPath, 'rb') as pdfFileObj:
        pdfReader = PdfFileReader(pdfFileObj)
        fields = pdfReader.getFields()
        # NOTE(review): the meaning of the constant 15 is not visible here;
        # presumably the number of non-record fields - confirm.
        for i in range(len(fields) - 15):
            containerNumber = ""
            weight = ""
            consignee = ""
            shipper = ""
            eta = ""
            portOfLoading = ""
            portOfDischarge = ""
            description = ""
            if i == 0:
                # The first record is read from top-level fields.
                containerNumber = fields["Container Row1"].value
                weight = float(fields["Weight KGRow1"].value)
                consignee = fields["Consignee"].value
                shipper = fields["Shipper"].value
                eta = fields["ETA DATE"].value
                portOfLoading = fields["undefined"].value
                portOfDischarge = fields["Port of Discharge"].value
                description = fields["Description of goods"].value
                continue

            # Later records are read from the "/Kids" of the field named
            # after the index; a direct lookup replaces the scan over
            # every key.
            key = str(i)
            if key not in fields:
                continue
            for k in fields[key]["/Kids"]:
                try:
                    # Resolve the kid once instead of once per comparison.
                    kid = k.getObject()
                    title = kid['/T']
                    if title == "WO":
                        wo = kid['/V']
                    elif title == "Container Row1":
                        containerNumber = kid['/V']
                    elif title == "SizeRow1":
                        size = kid['/V']
                    elif title == "Weight KGRow1":
                        weight = float(kid['/V'])
                    elif title == "Consignee":
                        consignee = kid['/V']
                    elif title == "Shipper":
                        shipper = kid['/V']
                    elif title == "ETA DATE":
                        eta = kid['/V']
                    elif title == "undefined":
                        portOfLoading = kid['/V']
                    elif title == "Port of Discharge":
                        portOfDischarge = kid['/V']
                    elif title == "Description of goods":
                        description = kid['/V']
                except KeyError:
                    # Kids without a '/T' or '/V' entry are skipped.
                    continue