def parse(self):
    """Parse the PDF at ``self.pdf`` and store an XML-like dump on ``self``.

    Walks every object id listed in ``doc.xrefs`` (each id only once) and
    appends one ``<object>`` element per id to the output string:

    * a plain ``<object id="N">`` wrapping ``self.dump(obj)`` on success,
    * ``type="malformed"`` with data read via ``parser.read_n_from`` when
      ``PDFObjectNotFound`` is raised (also recorded with ``self.takenote``),
    * ``type="exception"`` with the error text for any other exception.

    Reads ``self.pdf`` and ``self.debug``.  Sets ``self.xml``,
    ``self.errors`` and ``self.bytes_read``; sets ``self.bin_blob`` only
    when the document reports an EOF marker with more than 3 trailing bytes.

    Returns None.
    """
    # ``open`` replaces the Python 2-only ``file`` builtin, and the ``with``
    # block closes the handle even if the parser or the document raises.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp, dbg=self.debug)
        doc = PDFDocument(parser, dbg=self.debug)

        # extract blob of data after EOF (if it exists)
        if doc.found_eof and doc.eof_distance > 3:
            self.bin_blob = parser.read_from_end(doc.eof_distance)

        res = '<pdf>'
        visited = set()  # keep track of the objects already visited
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)
                try:
                    obj = doc.getobj(objid)
                    # Dump before emitting anything so that a failure here
                    # cannot leave a dangling '<object ...>' open tag in res.
                    dumped = self.dump(obj)
                    res += ('<object id="' + str(objid) + '">\n' +
                            dumped + '\n</object>\n\n')
                except PDFObjectNotFound:
                    # Presumably the raw bytes at the xref-recorded position
                    # of the object -- confirm against read_n_from/get_pos.
                    mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    # Replace '<' with the literal text '0x3C' (presumably so
                    # the raw data cannot open a tag in the output).
                    mal_obj = mal_obj.replace('<', '0x3C')
                    res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (
                        objid, mal_obj)
                    self.takenote(self.malformed, 'objects', objid)
                except Exception as e:
                    # ``e.message`` is deprecated and empty for exceptions
                    # built with zero or several args; fall back to str(e).
                    # NOTE(review): unlike the malformed branch, '<' is not
                    # escaped in this text -- confirm whether it should be.
                    message = getattr(e, 'message', '') or str(e)
                    res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (
                        objid, message)

    # The file is closed at this point, as it was in the original ordering.
    res += self.dumptrailers(doc)
    res += '</pdf>'
    self.xml = res
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
    return
def parse(self):
    """Parse the PDF at ``self.pdf`` and store an XML-like dump on ``self``.

    Each object id reported by the cross-reference tables in ``doc.xrefs``
    is visited once and rendered as one ``<object>`` element:

    * ``<object id="N">`` containing ``self.dump(obj)`` when the object
      loads and dumps without error,
    * ``<object id="N" type="malformed">`` containing data read through
      ``parser.read_n_from`` when ``PDFObjectNotFound`` is raised; the id
      is also recorded via ``self.takenote(self.malformed, 'objects', N)``,
    * ``<object id="N" type="exception">`` containing the error text for
      any other exception.

    Reads ``self.pdf`` and ``self.debug``.  Sets ``self.xml``,
    ``self.errors`` and ``self.bytes_read``; ``self.bin_blob`` is set only
    if the document has an EOF marker followed by more than 3 bytes.

    Returns None.
    """
    # Context manager guarantees the file is closed on every exit path;
    # ``open`` is the portable equivalent of the Python 2 ``file`` builtin.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp, dbg=self.debug)
        doc = PDFDocument(parser, dbg=self.debug)

        # extract blob of data after EOF (if it exists)
        if doc.found_eof and doc.eof_distance > 3:
            self.bin_blob = parser.read_from_end(doc.eof_distance)

        res = '<pdf>'
        visited = set()  # keep track of the objects already visited
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)
                try:
                    obj = doc.getobj(objid)
                    # Produce the dump first: if it raises, no half-written
                    # '<object ...>' opening tag has been appended yet.
                    dumped = self.dump(obj)
                    res += ('<object id="' + str(objid) + '">\n' +
                            dumped + '\n</object>\n\n')
                except PDFObjectNotFound:
                    # Presumably up to 4096 raw bytes from the position the
                    # xref records for this object -- confirm with the parser.
                    mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    # '<' is rewritten as the literal text '0x3C'.
                    mal_obj = mal_obj.replace('<', '0x3C')
                    res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (
                        objid, mal_obj)
                    self.takenote(self.malformed, 'objects', objid)
                except Exception as e:
                    # Prefer the legacy ``message`` attribute when it is
                    # non-empty; otherwise use str(e) so the text is not lost.
                    message = getattr(e, 'message', '') or str(e)
                    res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (
                        objid, message)

    # Trailers are dumped after the file has been closed, as before.
    res += self.dumptrailers(doc)
    res += '</pdf>'
    self.xml = res
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
    return