def parse(self):
     """Parse the PDF at ``self.pdf`` and store an XML-like dump on self.

     Opens ``self.pdf`` in binary mode, builds a ``PDFParser`` and a
     ``PDFDocument`` over it, and walks every object id listed by the
     document's xrefs (each id is dumped at most once).

     Results are stored on the instance rather than returned:

     * ``self.bin_blob`` -- data read from the end of the file; only set
       when ``doc.found_eof`` is true and ``doc.eof_distance > 3``.
     * ``self.xml`` -- ``<pdf>...</pdf>`` text containing one ``<object>``
       entry per object id followed by the output of
       ``self.dumptrailers(doc)``.
     * ``self.errors`` -- copied from ``doc.errors``.
     * ``self.bytes_read`` -- copied from ``parser.BYTES``.

     Per-object failures do not abort the parse: ``PDFObjectNotFound``
     yields a ``type="malformed"`` entry (and is recorded through
     ``self.takenote``); any other exception yields a
     ``type="exception"`` entry carrying the exception message.
     """
     # The context manager guarantees the file is closed even when the
     # parser or document constructor raises part-way through.
     with open(self.pdf, 'rb') as fp:
         parser = PDFParser(fp, dbg=self.debug)
         doc = PDFDocument(parser, dbg=self.debug)
         # extract blob of data after EOF (if it exists)
         if doc.found_eof and doc.eof_distance > 3:
             self.bin_blob = parser.read_from_end(doc.eof_distance)
         res = '<pdf>'
         visited = set()  # keep track of the objects already visited
         for xref in doc.xrefs:
             for objid in xref.get_objids():
                 if objid in visited:
                     continue
                 visited.add(objid)
                 try:
                     obj = doc.getobj(objid)
                     # Dump before touching ``res`` so a failure inside
                     # dump() cannot leave an unclosed <object> tag behind.
                     dumped = self.dump(obj)
                     res += ('<object id="' + str(objid) + '">\n' +
                             dumped + '\n</object>\n\n')
                 except PDFObjectNotFound:
                     # Keep the raw data found at the xref position
                     # (read_n_from is asked for 4096 units -- presumably
                     # bytes) so the broken object can still be inspected.
                     mal_obj = parser.read_n_from(
                         xref.get_pos(objid)[1], 4096)
                     # '<' is replaced so the raw data cannot open a tag
                     # inside the generated markup.
                     mal_obj = mal_obj.replace('<', '0x3C')
                     res += ('<object id="%d" type="malformed">\n%s\n'
                             '</object>\n\n' % (objid, mal_obj))
                     self.takenote(self.malformed, 'objects', objid)
                 except Exception as e:
                     # ``message`` is deprecated and empty for exceptions
                     # built with zero or several args; fall back to str().
                     msg = getattr(e, 'message', '') or str(e)
                     res += ('<object id="%d" type="exception">\n%s\n'
                             '</object>\n\n' % (objid, msg))
     # As before, trailers are dumped only after the file has been closed.
     res += self.dumptrailers(doc)
     res += '</pdf>'
     self.xml = res
     self.errors = doc.errors
     self.bytes_read = parser.BYTES
 def parse(self):
     """Parse the PDF at ``self.pdf`` and store an XML-like dump on self.

     Opens ``self.pdf`` in binary mode, builds a ``PDFParser`` and a
     ``PDFDocument`` over it, and walks every object id listed by the
     document's xrefs (each id is dumped at most once).

     Results are stored on the instance rather than returned:

     * ``self.bin_blob`` -- data read from the end of the file; only set
       when ``doc.found_eof`` is true and ``doc.eof_distance > 3``.
     * ``self.xml`` -- ``<pdf>...</pdf>`` text containing one ``<object>``
       entry per object id followed by the output of
       ``self.dumptrailers(doc)``.
     * ``self.errors`` -- copied from ``doc.errors``.
     * ``self.bytes_read`` -- copied from ``parser.BYTES``.

     Per-object failures do not abort the parse: ``PDFObjectNotFound``
     yields a ``type="malformed"`` entry (and is recorded through
     ``self.takenote``); any other exception yields a
     ``type="exception"`` entry carrying the exception message.
     """
     # The context manager guarantees the file is closed even when the
     # parser or document constructor raises part-way through.
     with open(self.pdf, 'rb') as fp:
         parser = PDFParser(fp, dbg=self.debug)
         doc = PDFDocument(parser, dbg=self.debug)
         # extract blob of data after EOF (if it exists)
         if doc.found_eof and doc.eof_distance > 3:
             self.bin_blob = parser.read_from_end(doc.eof_distance)
         res = '<pdf>'
         visited = set()  # keep track of the objects already visited
         for xref in doc.xrefs:
             for objid in xref.get_objids():
                 if objid in visited:
                     continue
                 visited.add(objid)
                 try:
                     obj = doc.getobj(objid)
                     # Dump before touching ``res`` so a failure inside
                     # dump() cannot leave an unclosed <object> tag behind.
                     dumped = self.dump(obj)
                     res += ('<object id="' + str(objid) + '">\n' +
                             dumped + '\n</object>\n\n')
                 except PDFObjectNotFound:
                     # Keep the raw data found at the xref position
                     # (read_n_from is asked for 4096 units -- presumably
                     # bytes) so the broken object can still be inspected.
                     mal_obj = parser.read_n_from(
                         xref.get_pos(objid)[1], 4096)
                     # '<' is replaced so the raw data cannot open a tag
                     # inside the generated markup.
                     mal_obj = mal_obj.replace('<', '0x3C')
                     res += ('<object id="%d" type="malformed">\n%s\n'
                             '</object>\n\n' % (objid, mal_obj))
                     self.takenote(self.malformed, 'objects', objid)
                 except Exception as e:
                     # ``message`` is deprecated and empty for exceptions
                     # built with zero or several args; fall back to str().
                     msg = getattr(e, 'message', '') or str(e)
                     res += ('<object id="%d" type="exception">\n%s\n'
                             '</object>\n\n' % (objid, msg))
     # As before, trailers are dumped only after the file has been closed.
     res += self.dumptrailers(doc)
     res += '</pdf>'
     self.xml = res
     self.errors = doc.errors
     self.bytes_read = parser.BYTES