def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    What is written depends on the selectors:

    * each id in *objids* is resolved and passed to ``dumpxml``;
    * each page whose zero-based index is in *pagenos* is dumped - its
      content streams when *codec* is truthy, its attributes otherwise;
    * *dumpall* dumps every object via ``dumpallobjs``;
    * when none of the three selectors is set, only the trailers are dumped.

    A trailing newline is written unless *codec* is ``'raw'`` or
    ``'binary'``.  *extractdir* is accepted for signature compatibility and
    is not used here.
    """
    # The context manager closes the input file even when parsing or
    # dumping raises; the original leaked the handle on any exception.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def parse(self):
    """Parse ``self.pdf`` and store an XML-like dump of its objects.

    Results are stored on the instance rather than returned:

    * ``self.bin_blob`` - data read from the end of the file, set only when
      the document reports an EOF marker followed by more than 3 bytes;
    * ``self.xml`` - ``<pdf>...</pdf>`` text with one ``<object>`` element
      per object id, followed by the trailers dump;
    * ``self.errors`` - copied from the parsed document;
    * ``self.bytes_read`` - copied from ``parser.BYTES``.

    Objects that cannot be found are emitted with ``type="malformed"``
    (raw bytes read from their xref position, with ``<`` replaced) and
    recorded through ``self.takenote``.  Any other failure while resolving
    or dumping an object is emitted with ``type="exception"``.
    """
    parts = ['<pdf>']
    # `with` releases the file handle even if the constructor or the dump
    # raises; `open` is the portable spelling of the Python-2-only `file`.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp, dbg=self.debug)
        doc = PDFDocument(parser, dbg=self.debug)
        # extract blob of data after EOF (if it exists)
        if doc.found_eof and doc.eof_distance > 3:
            self.bin_blob = parser.read_from_end(doc.eof_distance)
        visited = set()  # keep track of the objects already visited
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)
                try:
                    obj = doc.getobj(objid)
                    # Build the whole element before appending it, so a
                    # failure inside self.dump() cannot leave a dangling
                    # '<object ...>' open tag in the output.
                    chunk = ('<object id="' + str(objid) + '">\n'
                             + self.dump(obj)
                             + '\n</object>\n\n')
                except PDFObjectNotFound:
                    mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    mal_obj = mal_obj.replace('<', '0x3C')
                    chunk = ('<object id="%d" type="malformed">\n%s\n</object>\n\n'
                             % (objid, mal_obj))
                    self.takenote(self.malformed, 'objects', objid)
                except Exception as e:
                    # `.message` only exists on Python 2 exceptions.
                    message = e.message if hasattr(e, 'message') else str(e)
                    chunk = ('<object id="%d" type="exception">\n%s\n</object>\n\n'
                             % (objid, message))
                parts.append(chunk)
    parts.append(self.dumptrailers(doc))
    parts.append('</pdf>')
    self.xml = ''.join(parts)
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
    return
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF *fname* into *extractdir*.

    Every object reachable through the document's xref tables is inspected;
    each dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is extracted.
    Only *fname*, *password* and *extractdir* are used; the remaining
    parameters are accepted for signature compatibility.

    Raises:
        PDFValueError: the referenced object is not a ``PDFStream`` or is
            not typed as an embedded file.
        IOError: the destination path already exists.
    """
    def extract1(obj):
        # Extract the single file described by the filespec dictionary.
        # obj['UF'] raised KeyError when the key was absent, which made the
        # fallback to 'F' unreachable; .get() restores the intended fallback.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        # Same output as the old `print >>sys.stderr`, valid on 2 and 3.
        sys.stderr.write('extracting: %r\n' % path)
        # Closes the output file even if decoding or writing fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # Closes the input file even if parsing or extraction raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF *fname* into *extractdir*.

    Every object reachable through the document's xref tables is inspected;
    each dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is extracted.
    Only *fname*, *password* and *extractdir* are used; the remaining
    parameters are accepted for signature compatibility.

    Raises:
        PDFValueError: the referenced object is not a ``PDFStream`` or is
            not typed as an embedded file.
        IOError: the destination path already exists.
    """
    def extract1(obj):
        # Extract the single file described by the filespec dictionary.
        # obj['UF'] raised KeyError when the key was absent, which made the
        # fallback to 'F' unreachable; .get() restores the intended fallback.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        # Same output as the old `print >>sys.stderr`, valid on 2 and 3.
        sys.stderr.write('extracting: %r\n' % path)
        # Closes the output file even if decoding or writing fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The original never closed the input file at all; the context manager
    # closes it on both the normal and the error path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    The document is created from the parser and then unlocked with
    ``doc.initialize(password)``.  What is written depends on the selectors:

    * each id in *objids* is resolved and passed to ``dumpxml``;
    * each page whose zero-based index is in *pagenos* is dumped - its
      content streams when *codec* is truthy, its attributes otherwise;
    * *dumpall* dumps every object via ``dumpallobjs``;
    * when none of the three selectors is set, only the trailers are dumped.

    A trailing newline is written unless *codec* is ``'raw'`` or
    ``'binary'``.  *extractdir* is accepted for signature compatibility and
    is not used here.
    """
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle even when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return a text dump of selected parts of the PDF file *fname*.

    Unlike the stream-writing variants, this one concatenates the values
    returned by ``dumpxml`` / ``dumpallobjs`` / ``dumptrailers`` and returns
    the resulting string:

    * each id in *objids* is resolved and dumped;
    * each page whose zero-based index is in *pagenos* is dumped - its
      content streams when *codec* is truthy, its attributes otherwise;
    * *dumpall* dumps every object;
    * when none of the three selectors is set, only the trailers are dumped.

    A trailing newline is appended unless *codec* is ``'raw'`` or
    ``'binary'``.  *extractdir* is accepted for signature compatibility and
    is not used here.
    """
    # Pieces are collected and joined once instead of growing a string
    # with repeated `+=`.
    parts = []
    # The context manager closes the handle even when parsing or dumping
    # raises; `open` is the portable spelling of the Python-2-only `file`.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                parts.append(dumpxml(obj, codec=codec))
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        parts.append(dumpxml(obj, codec=codec))
                else:
                    parts.append(dumpxml(page.attrs))
        if dumpall:
            parts.append(dumpallobjs(doc, codec=codec))
        if (not objids) and (not pagenos) and (not dumpall):
            parts.append(dumptrailers(doc))
    if codec not in ('raw', 'binary'):
        parts.append('\n')
    return ''.join(parts)
def print_all_obj(filename): with file(filename, 'rb') as f: parser = PDFParser(f) doc = PDFDocument(parser, None) visited_objids = set() for xref in doc.xrefs: for objid in xref.get_objids(): if objid in visited_objids: continue visited_objids.add(objid) print objid, get_obj_type(doc.getobj(objid))
def extractembedded(
    outfp,
    fname,
    objids,
    pagenos,
    password="",
    dumpall=False,
    codec=None,
    extractdir=None,
):
    """Write every embedded file of the PDF *fname* into *extractdir*.

    Every object reachable through the document's xref tables is inspected;
    each dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is extracted once
    and stored as ``<objid zero-padded to 6 digits>-<file name>``.
    Only *fname*, *password* and *extractdir* are used; the remaining
    parameters are accepted for signature compatibility.

    Raises:
        PDFValueError: the referenced object is not a ``PDFStream`` or is
            not typed as an embedded file.
        IOError: the destination path already exists.
    """

    def extract1(objid, obj):
        # Extract the single file described by the filespec dictionary.
        filename = os.path.basename(obj.get("UF") or obj.get("F").decode())
        fileref = obj["EF"].get("UF") or obj["EF"].get("F")
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = (
                "unable to process PDF: reference for %r is not a "
                "PDFStream" % filename
            )
            raise PDFValueError(error_msg)
        if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                "unable to process PDF: reference for %r "
                "is not an EmbeddedFile" % (filename)
            )
        path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
        if os.path.exists(path):
            raise IOError("file exists: %r" % path)
        print("extracting: %r" % path)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Decode the stream before creating the file so a decoding error
        # does not leave an empty file behind, and let the context manager
        # close the handle even if the write fails.
        data = fileobj.get_data()
        with open(path, "wb") as out:
            out.write(data)
        return

    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in extracted_objids:
                    # Already written; no need to resolve the object again.
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get("Type") is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
def _run(self, scanObject, result, depth, args):
    """Collect PDF metadata, flags and embedded-file streams.

    The scan object's buffer is read twice: once with ``PdfFileReader`` to
    copy every document-info entry into the scan object's metadata (each key
    with its first character dropped), and once with the PDF parser to walk
    all objects listed in the xref tables.

    While walking, a flag is added when a dictionary contains an
    ``AA``/``OpenAction`` key or a ``JS``/``Javascript`` key, when an object
    is missing, and when the parser hits an unexpected end of file.  Every
    stream typed ``EmbeddedFile`` is wrapped in a ``ModuleObject`` and
    returned to the caller.  ``ScanError`` is always propagated.
    """
    extracted = []
    stream = cStringIO.StringIO(scanObject.buffer)

    try:
        reader = PdfFileReader(stream)
        info = reader.getDocumentInfo()
        for field in info:
            scanObject.addMetadata(self.module_name, field[1:],
                                   str(info[field]))

        document = PDFDocument(PDFParser(stream))

        for table in document.xrefs:
            for objid in table.get_objids():
                try:
                    obj = document.getobj(objid)

                    if isinstance(obj, dict):
                        for (name, _value) in obj.iteritems():
                            if name in ('AA', 'OpenAction'):
                                scanObject.addFlag('pdf:nfo:auto_action')
                            elif name in ('JS', 'Javascript'):
                                scanObject.addFlag('pdf:nfo:js_embedded')

                    if (isinstance(obj, PDFStream)
                            and 'Type' in obj.attrs
                            and obj.attrs['Type'] == LIT('EmbeddedFile')):
                        child = ModuleObject(
                            buffer=obj.get_data(),
                            externalVars=ExternalVars(
                                filename='e_pdf_stream_%s' % objid))
                        extracted.append(child)

                except PDFObjectNotFound:
                    scanObject.addFlag('pdf:err:missing_object_%s' % objid)
                except ScanError:
                    raise

    except PSEOF:
        scanObject.addFlag('pdf:err:unexpected_eof')
    except ScanError:
        raise

    return extracted
def extract(self):
    """Run ``self._extract`` on every dictionary object of the PDF.

    The file named by ``self._pdfname`` is parsed with an empty password.
    Each object id found in the xref tables is handled once; ids that
    resolve to something other than a ``dict`` are skipped, and ids that
    cannot be found are silently ignored.

    Returns ``self.comments``.
    """
    with open(self._pdfname, 'rb') as stream:
        document = PDFDocument(PDFParser(stream), '')
        seen = set()
        for table in document.xrefs:
            for objid in table.get_objids():
                if objid in seen:
                    continue
                seen.add(objid)
                try:
                    candidate = document.getobj(objid)
                    if isinstance(candidate, dict):
                        self._extract(objid, candidate)
                except PDFObjectNotFound:
                    # Missing objects are deliberately skipped.
                    pass
    return self.comments
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF *fname* into *extractdir*.

    Every object reachable through the document's xref tables is inspected;
    each dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is extracted once
    and stored as ``<objid zero-padded to 6 digits>-<file name>``.
    Only *fname*, *password* and *extractdir* are used; the remaining
    parameters are accepted for signature compatibility.

    Raises:
        PDFValueError: the referenced object is not a ``PDFStream`` or is
            not typed as an embedded file.
        IOError: the destination path already exists.
    """
    def extract1(objid, obj):
        # Extract the single file described by the filespec dictionary.
        filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
        fileref = obj['EF'].get('UF') or obj['EF'].get('F')
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = 'unable to process PDF: reference for %r is not a ' \
                        'PDFStream' % filename
            raise PDFValueError(error_msg)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError('unable to process PDF: reference for %r '
                                'is not an EmbeddedFile' % (filename))
        path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Decode the stream before creating the file so a decoding error
        # does not leave an empty file behind, and let the context manager
        # close the handle even if the write fails.
        data = fileobj.get_data()
        with open(path, 'wb') as out:
            out.write(data)
        return

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in extracted_objids:
                    # Already written; no need to resolve the object again.
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) \
                        and obj.get('Type') is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
def extractComments(fp):
    """Collect comment-like annotation entries from an open PDF file.

    *fp* must be a binary file object with a ``name`` attribute.  Every
    object listed in the document's xref tables is visited once:

    * a dictionary whose ``Type`` is named ``Page`` has its object id
      remembered, in visiting order;
    * any other dictionary containing a ``C`` key yields the tuple
      ``(fp.name, objid, page_index, Subtype name, Subj, T, Contents)``,
      where ``page_index`` is the 1-based position of the object referenced
      by ``P`` among the pages seen so far, or ``-1`` when it cannot be
      determined.  Entries lacking any of the listed keys are skipped.

    Returns:
        list: the collected tuples.  The original built this list but never
        returned it, so callers always received ``None``.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")
    visited = set()
    pages = []
    resultList = []

    def extract(objid, obj):
        # Return the tuple for one object, or a falsy value if it has none.
        result = None
        if isinstance(obj, dict):
            # 'Type' is PDFObjRef type
            if 'Type' in obj and obj['Type'].name == 'Page':
                pages.append(objid)
            elif 'C' in obj:
                # `except Exception` keeps the best-effort behaviour without
                # also swallowing KeyboardInterrupt/SystemExit as the bare
                # `except:` did.
                try:
                    pr = obj['P']
                    pi = pages.index(pr.objid) + 1
                except Exception:
                    pi = -1
                try:
                    result = (fp.name, objid, pi, obj['Subtype'].name,
                              obj['Subj'], obj['T'], obj['Contents'])
                except Exception:
                    # if any of the listed entries do not exist, ignore
                    result = ()
        return result

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                r = extract(objid, obj)
                if r:
                    resultList.append(r)
            except PDFObjectNotFound as e:
                # Same text as the old `print >>sys.stderr`, valid on 2 and 3.
                sys.stderr.write('not found: %r\n' % e)
    return resultList
def xmlFromPdf(pdfpath, xmlpath=None):
    """Return the parsed XML of the first stream mentioning ``xfa-template``.

    All object ids from the PDF's xref tables are gathered into a set and
    resolved one by one; the data of the first ``PDFStream`` containing the
    bytes ``xfa-template`` is parsed with ``etree.fromstring``.

    When *xmlpath* is given, the pretty-printed tree is also written there.

    Raises:
        CrypticXml: no stream in the document contains ``xfa-template``.
    """
    with open(pdfpath, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        object_ids = {objid
                      for xref in doc.xrefs
                      for objid in xref.get_objids()}
        form_xml = None
        for objid in object_ids:
            candidate = doc.getobj(objid)
            if isinstance(candidate, PDFStream):
                payload = candidate.get_data()
                if b'xfa-template' in payload:
                    form_xml = payload
                    break
        if form_xml is None:
            raise CrypticXml('Cannot find form data in %s' % pdfpath)

    # form_xml == <form>-text.xml
    tree = etree.fromstring(form_xml)
    if xmlpath is None:
        return tree
    with open(xmlpath, 'wb') as out:
        out.write(etree.tostring(tree, pretty_print=True))
    return tree
def dumpallobjs(out: TextIO, doc: PDFDocument, codec: Optional[str] = None,
                show_fallback_xref: bool = False) -> None:
    """Write every object of *doc*, then its trailers, inside ``<pdf>`` tags.

    Object ids are taken from each xref table in turn and dumped once, as
    ``<object id="N">`` elements produced by ``dumpxml``.  Ids that resolve
    to ``None`` are skipped; ids that cannot be found are reported with
    ``print`` and otherwise ignored.  *show_fallback_xref* is forwarded to
    ``dumptrailers``.
    """
    seen = set()
    out.write('<pdf>')
    every_id = (objid for xref in doc.xrefs for objid in xref.get_objids())
    for objid in every_id:
        if objid in seen:
            continue
        seen.add(objid)
        try:
            resolved = doc.getobj(objid)
            if resolved is not None:
                out.write('<object id="%d">\n' % objid)
                dumpxml(out, resolved, codec=codec)
                out.write('\n</object>\n\n')
        except PDFObjectNotFound as e:
            print('not found: %r' % e)
    dumptrailers(out, doc, show_fallback_xref)
    out.write('</pdf>')
    return
def parse_pdf(fname, target_width=None, with_links=False,
              collection_name=None, score_name=None):
    """Extract notehead positions from a LilyPond-generated PDF file and
    build the corresponding CropObjects.

    :param fname: Name of the input PDF file. Assumes it was generated
        from LilyPond with the option
        ``-e"(set-option 'point-and-click '(note-event))"``.

    :param target_width: The width of an image against which the returned
        coordinates should be valid. For each page, the coordinates are
        multiplied by ``target_width / page_width`` so that they match the
        page image resized to this width with its aspect ratio kept.
        If not given, no rescaling takes place. Pages may have different
        sizes; the scale is recomputed for every page.

    :param with_links: Accepted for compatibility; this function does not
        read it. The link text is always stored (see below).

    :param collection_name: First component of the CropObject UIDs.
        Defaults to ``CropObject.UID_DEFAULT_DATASET_NAMESPACE``.

    :param score_name: Document component of the CropObject UIDs; the
        suffix ``-P<page number, two digits>`` is appended per page.
        Defaults to ``CropObject.UID_DEFAULT_DOCUMENT_NAMESPACE``.

    :returns: A triplet ``(centroids, bboxes, cropobjects_per_page)`` of
        dictionaries keyed by page number (``int``, starting from 0).

        * ``centroids`` values are numpy arrays of ``[row, column]`` pairs,
          one per notehead. A page only gets an entry once it has at least
          one notehead.
        * ``bboxes`` values are lists of ``(top, left, bottom, right)``
          tuples in float coordinates.
        * ``cropobjects_per_page`` values are lists of ``CropObject``
          instances, all created with ``clsname="notehead-full"``, an
          all-ones mask, and ``data={'ly_link': <link text>}``.

        The CropObjects' ``objid`` keeps increasing across pages, so they
        do not collide. Their UIDs carry the page number in the document
        component; keep this in mind if you want to merge pages later.
    """
    centroids = dict()
    bboxes = dict()
    cropobjects_per_page = dict()
    _current_objid = 0  # Shared across pages so objids never collide

    if collection_name is None:
        collection_name = CropObject.UID_DEFAULT_DATASET_NAMESPACE
    if score_name is None:
        score_name = CropObject.UID_DEFAULT_DOCUMENT_NAMESPACE

    page_no = -1

    # The original opened the file and never closed it; the context manager
    # releases the handle on both the normal and the error path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        pages = list(PDFPage.get_pages(fp))

        page_height, page_width = -1, -1
        target_height = None
        scale = 1.0

        visited = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)

                try:
                    obj = doc.getobj(objid)
                    if obj is None:
                        continue
                    # Exact-class check kept on purpose: dict subclasses
                    # were skipped by the original as well.
                    if obj.__class__ is not dict:
                        continue

                    # Next page: an object with 'Annots' starts a new page.
                    if 'Annots' in obj:
                        page_no += 1
                        bboxes[page_no] = []
                        cropobjects_per_page[page_no] = []

                        page = pages[page_no]
                        page_height = int(numpy.round(page.mediabox[3]))
                        page_width = int(numpy.round(page.mediabox[2]))
                        if target_width is not None:
                            scale = float(target_width) / page_width
                            target_height = page_height * scale
                        else:
                            target_height = page_height

                    if 'Rect' not in obj:
                        continue

                    bb_coords = obj['Rect']
                    # Rescaling to target size
                    if target_width is not None:
                        bb_coords = [c * scale for c in bb_coords]

                    link_txt = obj['A']['URI'].decode()
                    # Not a link to a note event!
                    if link_txt.count(':') != 4:
                        continue

                    # Initializing the CropObject. The vertical coordinates
                    # are subtracted from the page height to flip the axis.
                    t, l, b, r = target_height - bb_coords[3], bb_coords[0], \
                                 target_height - bb_coords[1], bb_coords[2]
                    t_i, l_i, b_i, r_i = \
                        CropObject.bbox_to_integer_bounds(t, l, b, r)
                    h_i, w_i = b_i - t_i, r_i - l_i
                    mask = numpy.ones((h_i, w_i), dtype='uint8')

                    uid = CropObject.build_uid(
                        collection_name,
                        score_name + '-P{0:02d}'.format(page_no),
                        _current_objid)
                    logging.debug(
                        'Creating CropObject with uid {0}'.format(uid))
                    data = {'ly_link': link_txt}
                    cropobject = CropObject(objid=_current_objid,
                                            clsname='notehead-full',
                                            top=t_i, left=l_i,
                                            height=h_i, width=w_i,
                                            mask=mask,
                                            uid=uid,
                                            data=data)
                    cropobjects_per_page[page_no].append(cropobject)
                    _current_objid += 1

                    bboxes[page_no].append((t, l, b, r))

                    # Recompute bboxes to centroids
                    page_bboxes = numpy.asarray(bboxes[page_no])
                    # For centroids, use the original float coordinates
                    page_centroids = numpy.array([
                        [(b + t) / 2. - 0.5, (l + r) / 2. - 0.5]
                        for t, l, b, r in page_bboxes])
                    centroids[page_no] = page_centroids

                except PDFObjectNotFound as e:
                    sys.stderr.write('PDF object not found: %r' % e)

    return centroids, bboxes, cropobjects_per_page
def makepdf(self, pdfdata1, udct, zeros, sig_attributes):
    """Build the bytes of a signature section to append to ``pdfdata1``.

    The existing PDF is parsed to find the previous xref position, the
    Info/Root object ids, the trailer ``Size`` and the target page.  New
    objects are produced by one of four ``make_*_sig_objs`` helpers, chosen
    by whether the signature is visible and whether the document already
    has signatures, and are followed by an xref table and a trailer whose
    ``/Prev`` points at the previous xref.

    :param pdfdata1: bytes of the existing PDF; all offsets written to the
        xref table are computed relative to ``len(pdfdata1)``.
    :param udct: dictionary with bytes keys; ``b'name'`` is read here for
        visible signatures, and the dictionary is passed on to the helpers.
    :param zeros: passed through to the object helpers
        (presumably the placeholder for the signature contents - confirm).
    :param sig_attributes: mapping with ``'visibility'`` and ``'position'``;
        when falsy, visibility comes from ``MyConfigLoader``.
    :returns: the new objects, xref table and trailer as a single ``bytes``.
    """
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)
    log.info('get datas from pdf')
    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = document.xrefs[0].trailer['Size']
    page_objid = document.catalog['Pages'].objid
    page = None

    log.info('check sig attributes...')
    # Default position from configuration; overridden below for a visible
    # signature.
    position = MyConfigLoader().get_pdf_config()['position']
    if not sig_attributes:
        visibility = MyConfigLoader().get_pdf_config()['visibility']
    else:
        visibility = sig_attributes['visibility']
    log.info(f'the sign is {visibility}')
    # NOTE(review): statement nesting was reconstructed from a flattened
    # source. As laid out here, a falsy sig_attributes combined with a
    # configured visibility of 'visible' would fail on this subscript -
    # confirm against the real file.
    if visibility == 'visible':
        position = sig_attributes['position']
    log.info(f'position: {position}')

    # position['page'] is either 'n' (use the Count-th entry of Kids) or a
    # 1-based page number.
    page_pos = position['page']
    if page_pos == 'n':
        try:
            pages_count = document.getobj(page_objid)['Count']
            page = document.getobj(page_objid)['Kids'][pages_count - 1].objid
        except Exception:
            # NOTE(review): falls back to the literal object id 1, not to
            # the first entry of Kids as the other branch does - confirm.
            page = int(1)
    else:
        try:
            page = document.getobj(page_objid)['Kids'][int(page_pos) - 1].objid
        except Exception:
            log.error('page not found...take the first')
            page = document.getobj(page_objid)['Kids'][0].objid

    infodata = self.getdata(pdfdata1, info, prev, document).strip()
    rootdata = self.getdata(pdfdata1, root, prev, document).strip()
    pagedata = self.getdata(pdfdata1, page, prev, document).strip()

    # New objects are numbered starting at the old trailer Size.
    no = size
    multiple_signs = False
    signatures = self.get_signature_names(document)
    if len(signatures) > 0:
        multiple_signs = True

    # new_size is the number of consecutive object ids (no .. no+new_size-1)
    # whose offsets are looked up for the xref template below.
    if visibility == 'visible':
        rect_array = self.get_rect_array(pagedata, position)
        stream_name = compress(STREAM_WITH_NAME % udct[b'name'])
        if multiple_signs:
            objs = self.make_multi_visible_sig_objs(document, udct, no, page, pagedata, infodata, rootdata,
                                                    stream_name, rect_array, zeros)
            xref = self.make_multi_visible_xref()
            new_size = 11
        else:
            objs = self.make_visible_sig_objs(udct, no, page, pagedata, infodata, rootdata, stream_name,
                                              rect_array, zeros)
            xref = self.make_visible_xref()
            new_size = 13
    else:
        if multiple_signs:
            objs = self.make_multi_inv_sig_objs(document, udct, no, page, pagedata, infodata, rootdata, zeros,
                                                len(signatures) + 1)
            xref = self.make_multi_inv_xref()
            new_size = 5
        else:
            objs = self.make_invisible_sig_objs(udct, no, page, pagedata, infodata, rootdata, zeros)
            # NOTE(review): the single-signature invisible case uses the
            # *multi* xref template - confirm this is intended.
            xref = self.make_multi_inv_xref()
            new_size = 5

    pdfdata2 = b''.join(objs)
    startxref = len(pdfdata1)
    # find() locates the newline that precedes "<id> 0 obj"; the +1 skips
    # it so the offset points at the object number itself.
    dct = {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': no + new_size,
        b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }
    # Keys n0..n<new_size-1> hold the offsets of the new objects.
    for i in range(new_size):
        dct.update(({b'n%d' % i: startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + i)) + 1, }))

    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

    xref = xref % dct
    trailer = trailer % dct

    pdfdata2 = pdfdata2 + xref + trailer

    return pdfdata2
def makepdf(self, pdfdata1, udct, zeros):
    """Build the bytes of a signature section with a FreeText annotation.

    The existing PDF is parsed to find the previous xref position, the
    Info/Root object ids, the highest object id in use and the first page.
    Seven objects are then produced (the rewritten page plus six new
    objects numbered ``no`` .. ``no + 5``), followed by an xref table and a
    trailer whose ``/Prev`` points at the previous xref.

    :param pdfdata1: bytes of the existing PDF; all offsets written to the
        xref table are computed relative to ``len(pdfdata1)``.
    :param udct: dictionary with bytes keys.  Read here: ``b'signature'``
        (UTF-8 text, default empty), ``b'signaturebox'`` (four numbers,
        default all zero), ``b'sigflags'``, ``b'contact'``, ``b'location'``,
        ``b'signingdate'`` and ``b'reason'``.
    :param zeros: bytes placed between ``/Contents <`` and ``>``
        (presumably the placeholder for the signature - confirm).
    :returns: the new objects, xref table and trailer as a single ``bytes``.
    """
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)

    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = 1
    # calculate last object id, size is only xref size but not count of object in xref
    for ref in document.xrefs:
        if isinstance(ref, PDFXRefStream):
            no = max(ref.ranges, key=operator.itemgetter(1))[1]
        else:
            if len(ref.offsets) == 0:
                no = 0
            else:
                no = max(ref.offsets.keys())
        size = max(size, no)
    # The signature is always attached to the first entry of Kids.
    page = document.getobj(
        document.catalog['Pages'].objid)['Kids'][0].objid

    nsig, fields = self.getfields(root, document)
    annots = self.getannots(page, document)

    infodata = self.getdata(pdfdata1, info, prev, document)
    rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
    pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

    # Build the annotation with the FreeText/Location/Appearance helpers and
    # reuse its rectangle, contents, font name and appearance stream below.
    annotation = udct.get(b'signature', b'').decode('utf8')
    x1, y1, x2, y2 = udct.get(b'signaturebox', (0, 0, 0, 0))
    annotation = FreeText(
        Location(x1=x1, y1=y1, x2=x2, y2=y2, page=0),
        Appearance(
            fill=[0, 0, 0],
            stroke_width=1,
            wrap_text=True,
            font_size=12,
            content=annotation,
        ),
    )
    pdfa = annotation.as_pdf_object(identity(), page=None)
    pdfar = b'[%d %d %d %d]' % tuple(pdfa.Rect)
    pdfas = pdfa.AP.N.stream.encode('latin1')

    # New objects are numbered from one past the highest id found above.
    no = size + 1
    # Object layout: page (rewritten), no+0 Info, no+1 Root, no+2 AcroForm,
    # no+3 annotation/signature field, no+4 appearance XObject,
    # no+5 signature dictionary.
    objs = [
        self.makeobj(page,
                     (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
        self.makeobj(no + 0, infodata),
        self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
        self.makeobj(
            no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
            (fields, no + 3, udct[b'sigflags'])),
        # NOTE(review): `.keys()[0]` requires keys() to return an indexable
        # sequence - confirm the type of Resources.Font.
        self.makeobj(
            no + 3, b'''
/Type
/Annot
/Subtype
/FreeText
/AP <</N %d 0 R>>
/BS <</S /S /Type /Border /W 0>>
/C []
/Contents (%s)
/DA (0 0 0 rg /%s 12 Tf)
/Rect %s
/F 704
/P %d 0 R
/FT
/Sig
/T(Signature%d)
/V %d 0 R
''' % (no + 4, pdfa.Contents.encode('latin1'),
            pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'), pdfar, page,
            nsig, no + 5)),
        self.makeobj(
            no + 4, b'''
/BBox %s
/FormType 1
/Length %d
/Matrix [1 0 0 1 0 0]
/Resources <</Font <<%s <</BaseFont /Helvetica /Encoding /WinAnsiEncoding /Subtype /Type1 /Type /Font>>>> /ProcSet /PDF>>
/Subtype
/Form
/Type
/XObject
''' % (
                pdfar,
                len(pdfas),
                pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'),
            ), b'stream\n' + pdfas + b'\nendstream\n'),
        self.makeobj(no + 5, (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'],
                udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
    ]

    pdfdata2 = b''.join(objs)
    # Two xref subsections: one entry for the rewritten page, then six
    # entries for the new objects.
    xref = b'''\
xref\n\
%(page)d 1\n\
%(p0)010d 00000 n \n\
%(no)d 6\n\
%(n0)010d 00000 n \n\
%(n1)010d 00000 n \n\
%(n2)010d 00000 n \n\
%(n3)010d 00000 n \n\
%(n4)010d 00000 n \n\
%(n5)010d 00000 n \n\
'''
    startxref = len(pdfdata1)
    # find() locates the newline that precedes "<id> 0 obj"; the +1 skips
    # it so the offset points at the object number itself.
    # NOTE(review): /Size is written as the constant 6 rather than being
    # derived from `no` - confirm this is intended.
    dct = {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': 6,
        b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
        b'n0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 0)) + 1,
        b'n1': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 1)) + 1,
        b'n2': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 2)) + 1,
        b'n3': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 3)) + 1,
        b'n4': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 4)) + 1,
        b'n5': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 5)) + 1,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }

    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

    xref = xref % dct
    trailer = trailer % dct

    pdfdata2 = pdfdata2 + xref + trailer

    return pdfdata2
def makepdf(self, pdfdata1, udct, zeros):
    """Build the bytes of a signature section on a selectable page.

    The existing PDF is parsed to find the previous xref position, the
    Info/Root object ids and the highest object id in use.  The signature
    is attached to the page selected by ``udct[b'sigpage']``; its visual
    objects come from ``self.makevisualization``.  The result ends with an
    xref table and a trailer whose ``/Prev`` points at the previous xref.

    :param pdfdata1: bytes of the existing PDF; all offsets written to the
        xref table are computed relative to ``len(pdfdata1)``.
    :param udct: dictionary with bytes keys.  Read here: ``b'sigpage'``
        (zero-based index into Kids, default 0), ``b'sigflags'``,
        ``b'contact'``, ``b'location'``, ``b'signingdate'`` and
        ``b'reason'``; it is also passed to ``makevisualization``.
    :param zeros: bytes placed between ``/Contents <`` and ``>``
        (presumably the placeholder for the signature - confirm).
    :returns: the new objects, xref table and trailer as a single ``bytes``.
    """
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)

    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = 1
    # calculate last object id, size is only xref size but not count of object in xref
    for ref in document.xrefs:
        if isinstance(ref, PDFXRefStream):
            no = max(ref.ranges, key=operator.itemgetter(1))[1]
        else:
            if len(ref.offsets) == 0:
                no = 0
            else:
                no = max(ref.offsets.keys())
        size = max(size, no)
    # An out-of-range b'sigpage' silently falls back to index 0.
    pages = len(document.getobj(document.catalog['Pages'].objid)['Kids'])
    page = udct.get(b'sigpage', 0) if 0 <= udct.get(b'sigpage', 0) <= pages - 1 else 0
    page = document.getobj(
        document.catalog['Pages'].objid)['Kids'][page].objid

    nsig, fields = self.getfields(root, document)
    annots = self.getannots(page, document)

    infodata = self.getdata(pdfdata1, info, prev, document)
    rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
    pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

    # New objects are numbered from one past the highest id found above.
    no = size + 1
    # `nav` is returned by makevisualization; the signature dictionary is
    # numbered nav + 1 (presumably nav is the last id used by the
    # visualization objects - confirm).
    visualization, nav = self.makevisualization(no, udct, nsig, page)
    objs = [
        self.makeobj(page,
                     (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
        self.makeobj(no + 0, infodata),
        self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
        self.makeobj(
            no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
            (fields, no + 3, udct[b'sigflags'])),
        visualization,
        self.makeobj(nav + 1, (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'],
                udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
        # Disabled alternative signature dictionary, kept for reference:
        #            self.makeobj(nav + 1, (b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
        #/Filter/Adobe.PPKMS/SubFilter/ETSI.CAdES.detached/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
        #/Contents <' % (udct[b'contact'], udct[b'location'], udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
    ]

    # From here on `size` is the number of new consecutive objects
    # (no .. nav + 1), no longer the highest existing id.
    size = nav - no + 2
    pdfdata2 = b''.join(objs)
    startxref = len(pdfdata1)
    # Two xref subsections: one entry for the rewritten page, then `size`
    # entries for the new objects. find() locates the newline preceding
    # "<id> 0 obj"; the +1 skips it.
    xref = b'xref\n%d 1\n%010d 00000 n \n%d %d\n' % (
        page, startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1, no,
        size)
    xref += b''.join([
        b'%010d 00000 n \n' %
        (startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + i)) + 1)
        for i in range(size)
    ])

    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''
    # NOTE(review): /Size receives the count of new objects, not a value
    # derived from `no` - confirm this is intended.
    trailer = trailer % {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': size,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }

    pdfdata2 = pdfdata2 + xref + trailer

    return pdfdata2
def makepdf(self, pdfdata1, udct, zeros):
    """Build the bytes of a signature section with a configurable rectangle.

    The existing PDF is parsed to find the previous xref position, the
    Info/Root object ids, the highest object id in use and the first page.
    Seven objects are then produced (the rewritten page plus six new
    objects numbered ``no`` .. ``no + 5``), followed by an xref table and a
    trailer whose ``/Prev`` points at the previous xref.

    :param pdfdata1: bytes of the existing PDF; all offsets written to the
        xref table are computed relative to ``len(pdfdata1)``.
    :param udct: dictionary with bytes keys.  Read here: ``b'rectPos'``
        (bytes inserted into ``/Rect[...]``, default ``b'0 0 0 0'``),
        ``b'sigflags'``, ``b'contact'``, ``b'location'``,
        ``b'signingdate'`` and ``b'reason'``.
    :param zeros: bytes placed between ``/Contents <`` and ``>``
        (presumably the placeholder for the signature - confirm).
    :returns: the new objects, xref table and trailer as a single ``bytes``.
    """
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)

    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = 1
    # calculate last object id, size is only xref size but not count of object in xref
    for ref in document.xrefs:
        if isinstance(ref, PDFXRefStream):
            no = max(ref.ranges, key=operator.itemgetter(1))[1]
        else:
            if len(ref.offsets) == 0:
                no = 0
            else:
                no = max(ref.offsets.keys())
        size = max(size, no)
    # The signature is always attached to the first entry of Kids.
    page = document.getobj(
        document.catalog['Pages'].objid)['Kids'][0].objid

    nsig, fields = self.getfields(root, document)
    annots = self.getannots(page, document)

    infodata = self.getdata(pdfdata1, info, prev, document)
    rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
    pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

    rectPos = b'0 0 0 0'
    if b'rectPos' in udct:
        rectPos = udct[b'rectPos']

    # New objects are numbered from one past the highest id found above.
    no = size + 1
    # Object layout: page (rewritten), no+0 Info, no+1 Root, no+2 AcroForm,
    # no+3 signature widget, no+4 appearance XObject, no+5 signature
    # dictionary.
    objs = [
        self.makeobj(page,
                     (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
        self.makeobj(no + 0, infodata),
        self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
        self.makeobj(
            no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
            (fields, no + 3, udct[b'sigflags'])),
        self.makeobj(
            no + 3,
            b'/AP<</N %d 0 R>>/F 132/FT/Sig/P %d 0 R/Rect[%s]/Subtype/Widget/T(Signature%d)/V %d 0 R'
            % (no + 4, page, rectPos, nsig, no + 5)),
        # The 8-byte stream matches the declared /Length 8.
        self.makeobj(
            no + 4,
            b'/BBox[0 0 0 0]/Filter/FlateDecode/Length 8/Subtype/Form/Type/XObject',
            b'stream\n\x78\x9C\x03\x00\x00\x00\x00\x01\nendstream\n'),
        self.makeobj(no + 5, (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'],
                udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
    ]

    pdfdata2 = b''.join(objs)
    # Two xref subsections: one entry for the rewritten page, then six
    # entries for the new objects.
    xref = b'''\
xref\n\
%(page)d 1\n\
%(p0)010d 00000 n \n\
%(no)d 6\n\
%(n0)010d 00000 n \n\
%(n1)010d 00000 n \n\
%(n2)010d 00000 n \n\
%(n3)010d 00000 n \n\
%(n4)010d 00000 n \n\
%(n5)010d 00000 n \n\
'''
    startxref = len(pdfdata1)
    # find() locates the newline that precedes "<id> 0 obj"; the +1 skips
    # it so the offset points at the object number itself.
    # NOTE(review): /Size is written as the constant 6 rather than being
    # derived from `no` - confirm this is intended.
    dct = {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': 6,
        b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
        b'n0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 0)) + 1,
        b'n1': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 1)) + 1,
        b'n2': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 2)) + 1,
        b'n3': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 3)) + 1,
        b'n4': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 4)) + 1,
        b'n5': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 5)) + 1,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }

    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

    xref = xref % dct
    trailer = trailer % dct

    pdfdata2 = pdfdata2 + xref + trailer

    return pdfdata2
def makepdf(self, pdfdata1, udct, zeros):
    """Build the bytes of a signature section to append to ``pdfdata1``.

    The existing PDF is parsed to find the previous xref position, the
    Info/Root object ids, the trailer ``Size`` and the first page.  Seven
    objects are then produced (the rewritten page plus six new objects
    numbered ``no`` .. ``no + 5``), followed by an xref table and a trailer
    whose ``/Prev`` points at the previous xref.

    :param pdfdata1: bytes of the existing PDF; all offsets written to the
        xref table are computed relative to ``len(pdfdata1)``.
    :param udct: dictionary with bytes keys.  Read here: ``b'sigflags'``,
        ``b'contact'``, ``b'location'``, ``b'signingdate'`` and
        ``b'reason'``.
    :param zeros: bytes placed between ``/Contents <`` and ``>``
        (presumably the placeholder for the signature - confirm).
    :returns: the new objects, xref table and trailer as a single ``bytes``.
    """
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)

    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = document.xrefs[0].trailer['Size']
    # The signature is always attached to the first entry of Kids.
    page = document.getobj(
        document.catalog['Pages'].objid)['Kids'][0].objid

    infodata = self.getdata(pdfdata1, info, prev, document).strip()
    rootdata = self.getdata(pdfdata1, root, prev, document).strip()
    pagedata = self.getdata(pdfdata1, page, prev, document).strip()

    # New objects are numbered starting at the old trailer Size.
    no = size
    # Object layout: page (rewritten), no+0 Info, no+1 Root, no+2 AcroForm,
    # no+3 signature widget, no+4 appearance XObject, no+5 signature
    # dictionary.
    # NOTE(review): /Annots and /Fields are written with only the new
    # widget and the field name is fixed to "Signature1" - confirm that
    # documents with existing annotations or signatures are out of scope.
    objs = [
        self.makeobj(page, (b'/Annots[%d 0 R]' % (no + 3)) + pagedata),
        self.makeobj(no + 0, infodata),
        self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
        self.makeobj(
            no + 2, b'/Fields[%d 0 R]/SigFlags %d' %
            (no + 3, udct[b'sigflags'])),
        self.makeobj(
            no + 3,
            b'/AP<</N %d 0 R>>/F 132/FT/Sig/P %d 0 R/Rect[0 0 0 0]/Subtype/Widget/T(Signature1)/V %d 0 R'
            % (no + 4, page, no + 5)),
        # The 8-byte stream matches the declared /Length 8.
        self.makeobj(
            no + 4,
            b'/BBox[0 0 0 0]/Filter/FlateDecode/Length 8/Subtype/Form/Type/XObject',
            b'stream\n\x78\x9C\x03\x00\x00\x00\x00\x01\nendstream\n'),
        self.makeobj(no + 5, (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'],
                udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
    ]

    pdfdata2 = b''.join(objs)
    # Two xref subsections: one entry for the rewritten page, then six
    # entries for the new objects.
    xref = b'''\
xref\n\
%(page)d 1\n\
%(p0)010d 00000 n \n\
%(no)d 6\n\
%(n0)010d 00000 n \n\
%(n1)010d 00000 n \n\
%(n2)010d 00000 n \n\
%(n3)010d 00000 n \n\
%(n4)010d 00000 n \n\
%(n5)010d 00000 n \n\
'''
    startxref = len(pdfdata1)
    # find() locates the newline that precedes "<id> 0 obj"; the +1 skips
    # it so the offset points at the object number itself.
    # NOTE(review): /Size is written as the constant 6 rather than being
    # derived from `no` - confirm this is intended.
    dct = {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': 6,
        b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
        b'n0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 0)) + 1,
        b'n1': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 1)) + 1,
        b'n2': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 2)) + 1,
        b'n3': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 3)) + 1,
        b'n4': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 4)) + 1,
        b'n5': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 5)) + 1,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }

    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

    xref = xref % dct
    trailer = trailer % dct

    pdfdata2 = pdfdata2 + xref + trailer

    return pdfdata2
if 'Type' in obj and obj['Type'].name == 'Page': pages.append(objid) elif 'C' in obj: # pr = obj['P'] # try: # pi = pages.index(pr.objid)+1 # except: # pi = -1 # print(objid,pi, obj['Subj'],obj['T'],obj['Contents']) if 'Contents' in obj: out = obj['Contents'].decode('latin_1').replace( '\r|\x84', ' ').strip() print(out) elif 'H' in obj: print('Found an H') fp = open("test.pdf", 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, "") visited = set() for xref in doc.xrefs: for objid in xref.get_objids(): if objid in visited: continue visited.add(objid) try: obj = doc.getobj(objid) if obj is None: continue extract(objid, obj) except (PDFObjectNotFound): print(sys.stderr, 'not found:')