Ejemplo n.º 1
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Explicitly requested objects are dumped first, then the selected pages,
    then (when ``dumpall`` is set) every object.  When nothing at all is
    selected, only the trailers are dumped.

    :param outfp: writable stream that receives the dump.
    :param fname: path of the PDF file to read.
    :param objids: iterable of object ids to dump; may be empty.
    :param pagenos: container of zero-based page indices to dump; may be
        empty.
    :param password: password handed to ``PDFDocument``.
    :param dumpall: when true, dump every object via ``dumpallobjs``.
    :param codec: forwarded to the dump helpers.  A truthy value makes the
        page selection dump the page content streams instead of the page
        attributes; ``'raw'`` and ``'binary'`` suppress the final newline.
    :param extractdir: unused by this function.
    """
    # The context manager closes the file even when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
Ejemplo n.º 2
0
 def parse (self):
     """Parse ``self.pdf`` and store an XML-like dump of it on the instance.

     Sets ``self.xml`` (the dump), ``self.errors`` (copied from the
     document) and ``self.bytes_read`` (copied from the parser).  When the
     document reports data after the EOF marker, that data is stored in
     ``self.bin_blob``.

     Objects that cannot be found are emitted as ``type="malformed"`` with
     up to 4096 raw bytes read at their xref position; any other failure is
     emitted as ``type="exception"`` with the error text.
     """
     fp = open(self.pdf, 'rb')
     try:
         parser = PDFParser(fp, dbg=self.debug)
         doc = PDFDocument(parser, dbg=self.debug)
         # extract blob of data after EOF (if it exists)
         if doc.found_eof and doc.eof_distance > 3:
             self.bin_blob = parser.read_from_end(doc.eof_distance)
         parts = ['<pdf>']
         visited = set()  # keep track of the objects already visited
         for xref in doc.xrefs:
             for objid in xref.get_objids():
                 if objid in visited:
                     continue
                 visited.add(objid)
                 try:
                     obj = doc.getobj(objid)
                     # Build the whole element before appending it so that a
                     # failure inside self.dump() cannot leave a dangling
                     # opening <object> tag in the output.
                     parts.append('<object id="' + str(objid) + '">\n'
                                  + self.dump(obj)
                                  + '\n</object>\n\n')
                 except PDFObjectNotFound:
                     mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                     # Neutralise '<' so the raw bytes cannot open a tag.
                     mal_obj = mal_obj.replace('<', '0x3C')
                     parts.append('<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj))
                     self.takenote(self.malformed, 'objects', objid)
                 except Exception as e:
                     # ``message`` only exists on Python 2 exceptions; fall
                     # back to str(e) when it is missing or empty.
                     detail = getattr(e, 'message', '') or str(e)
                     parts.append('<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, detail))
     finally:
         # Close the input even when parsing fails part-way through.
         fp.close()
     # As in the original flow, trailers are dumped after the file is closed.
     parts.append(self.dumptrailers(doc))
     parts.append('</pdf>')
     self.xml = ''.join(parts)
     self.errors = doc.errors
     self.bytes_read = parser.BYTES
     return
Ejemplo n.º 3
0
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF ``fname`` into ``extractdir``.

    Each dict object whose ``Type`` is ``LITERAL_FILESPEC`` is treated as a
    file specification; the stream referenced by its ``EF``/``F`` entry is
    written to ``extractdir`` under the base name taken from ``UF`` (or
    ``F`` when ``UF`` is absent or empty).

    :param fname: path of the PDF file to read.
    :param password: password handed to ``PDFDocument``.
    :param extractdir: directory that receives the extracted files.
    :param outfp, objids, pagenos, dumpall, codec: unused by this function.
    :raises PDFValueError: if the referenced object is not a ``PDFStream``
        or is not typed as an embedded file.
    :raises IOError: if the target path already exists.
    """
    def extract1(obj):
        # 'UF' is optional: use .get() so a missing key falls back to 'F'
        # instead of raising KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # Decode the stream first so a decoding failure does not leave an
        # empty output file behind.
        data = fileobj.get_data()
        with open(path, 'wb') as out:
            out.write(data)

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        # The same object id may be listed by several xref sections; extract
        # each one once so the second sighting does not fail on "file exists".
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in extracted_objids:
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(obj)
    return
Ejemplo n.º 4
0
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF ``fname`` into ``extractdir``.

    Each dict object whose ``Type`` is ``LITERAL_FILESPEC`` is treated as a
    file specification; the stream referenced by its ``EF``/``F`` entry is
    written to ``extractdir`` under the base name taken from ``UF`` (or
    ``F`` when ``UF`` is absent or empty).

    :param fname: path of the PDF file to read.
    :param password: password handed to ``PDFDocument``.
    :param extractdir: directory that receives the extracted files.
    :param outfp, objids, pagenos, dumpall, codec: unused by this function.
    :raises PDFValueError: if the referenced object is not a ``PDFStream``
        or is not typed as an embedded file.
    :raises IOError: if the target path already exists.
    """
    def extract1(obj):
        # 'UF' is optional: use .get() so a missing key falls back to 'F'
        # instead of raising KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # Decode the stream first so a decoding failure does not leave an
        # empty output file behind.
        data = fileobj.get_data()
        with open(path, 'wb') as out:
            out.write(data)

    # The original never closed the input file; the context manager does so
    # on both the success and the error path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        # The same object id may be listed by several xref sections; extract
        # each one once so the second sighting does not fail on "file exists".
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in extracted_objids:
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(obj)
    return
Ejemplo n.º 5
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Explicitly requested objects are dumped first, then the selected pages,
    then (when ``dumpall`` is set) every object.  When nothing at all is
    selected, only the trailers are dumped.

    :param outfp: writable stream that receives the dump.
    :param fname: path of the PDF file to read.
    :param objids: iterable of object ids to dump; may be empty.
    :param pagenos: container of zero-based page indices to dump; may be
        empty.
    :param password: password passed to ``doc.initialize``.
    :param dumpall: when true, dump every object via ``dumpallobjs``.
    :param codec: forwarded to the dump helpers.  A truthy value makes the
        page selection dump the page content streams instead of the page
        attributes; ``'raw'`` and ``'binary'`` suppress the final newline.
    :param extractdir: unused by this function.
    """
    # open() replaces the Python-2-only file() builtin, and the context
    # manager closes the file even when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
Ejemplo n.º 6
0
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return a dump of selected parts of the PDF file ``fname`` as a string.

    Explicitly requested objects come first, then the selected pages, then
    (when ``dumpall`` is set) every object.  When nothing at all is
    selected, only the trailers are dumped.

    :param fname: path of the PDF file to read.
    :param objids: iterable of object ids to dump; may be empty.
    :param pagenos: container of zero-based page indices to dump; may be
        empty.
    :param password: password handed to ``PDFDocument``.
    :param dumpall: when true, include the output of ``dumpallobjs``.
    :param codec: forwarded to the dump helpers.  A truthy value makes the
        page selection dump the page content streams instead of the page
        attributes; ``'raw'`` and ``'binary'`` suppress the final newline.
    :param extractdir: unused by this function.
    :returns: the concatenated output of the dump helpers.
    """
    # Collect the pieces and join once instead of repeated concatenation.
    chunks = []
    # open() replaces the Python-2-only file() builtin, and the context
    # manager closes the file even when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                chunks.append(dumpxml(doc.getobj(objid), codec=codec))
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        chunks.append(dumpxml(stream_value(obj), codec=codec))
                else:
                    chunks.append(dumpxml(page.attrs))
        if dumpall:
            chunks.append(dumpallobjs(doc, codec=codec))
        if not (objids or pagenos or dumpall):
            chunks.append(dumptrailers(doc))
    if codec not in ('raw', 'binary'):
        chunks.append('\n')
    return ''.join(chunks)
Ejemplo n.º 7
0
def print_all_obj(filename):
    with file(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                print objid, get_obj_type(doc.getobj(objid))
Ejemplo n.º 8
0
def extractembedded(
    outfp,
    fname,
    objids,
    pagenos,
    password="",
    dumpall=False,
    codec=None,
    extractdir=None,
):
    """Write every embedded file of the PDF ``fname`` into ``extractdir``.

    Each dict object whose ``Type`` is ``LITERAL_FILESPEC`` is treated as a
    file specification.  The stream referenced from its ``EF`` entry is
    written to ``extractdir`` as ``<objid zero-padded to 6 digits>-<name>``.

    :param fname: path of the PDF file to read.
    :param password: password handed to ``PDFDocument``.
    :param extractdir: directory that receives the extracted files; it is
        created when missing.
    :param outfp, objids, pagenos, dumpall, codec: unused by this function.
    :raises PDFValueError: if the referenced object is not a ``PDFStream``
        or is not typed as an embedded file.
    :raises IOError: if the target path already exists.
    """

    def extract1(objid, obj):
        # NOTE(review): only the "F" fallback is decoded.  If "UF" holds
        # bytes, the "%s" formatting below embeds its repr (b'...') in the
        # file name -- confirm what type "UF" has here.
        filename = os.path.basename(obj.get("UF") or obj.get("F").decode())
        fileref = obj["EF"].get("UF") or obj["EF"].get("F")
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = (
                "unable to process PDF: reference for %r is not a "
                "PDFStream" % filename
            )
            raise PDFValueError(error_msg)
        if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                "unable to process PDF: reference for %r "
                "is not an EmbeddedFile" % (filename)
            )
        path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
        if os.path.exists(path):
            raise IOError("file exists: %r" % path)
        print("extracting: %r" % path)
        # Decode the stream before touching the filesystem so a decoding
        # failure does not leave an empty file behind.
        data = fileobj.get_data()
        target_dir = os.path.dirname(path)
        # os.makedirs("") raises, which happens when extractdir is "".
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(path, "wb") as out:
            out.write(data)

    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                # Already handled via an earlier xref section.
                if objid in extracted_objids:
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get("Type") is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
Ejemplo n.º 9
0
    def _run(self, scanObject, result, depth, args):
        """Collect PDF metadata, flag risky keys and return embedded files.

        Document-info entries are stored as metadata on ``scanObject`` under
        ``self.module_name``.  Every dict object is checked for keys that
        indicate automatic actions or JavaScript, and each stream typed as
        ``EmbeddedFile`` is returned as a ``ModuleObject``.

        :param scanObject: object being scanned; its ``buffer`` holds the
            PDF data, and flags/metadata are recorded on it.
        :param result, depth, args: unused by this method.
        :returns: list of ``ModuleObject`` instances, one per embedded file.
        :raises ScanError: re-raised unchanged.
        """
        moduleResult = []

        pdfBuffer = cStringIO.StringIO(scanObject.buffer)

        try:

            pdfFile = PdfFileReader(pdfBuffer)

            # getDocumentInfo() may return None when the PDF has no info
            # dictionary; iterate nothing in that case instead of raising.
            docInfo = pdfFile.getDocumentInfo()
            for metaItem in docInfo or ():
                # The first character of the key is dropped
                # (presumably the leading "/" of the PDF name -- confirm).
                scanObject.addMetadata(self.module_name, metaItem[1:],
                                       str(docInfo[metaItem]))

            pdf = PDFParser(pdfBuffer)
            pdfDoc = PDFDocument(pdf)

            for xref in pdfDoc.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = pdfDoc.getobj(objid)
                        if isinstance(obj, dict):
                            # Only the keys matter; iterating the dict works
                            # on both Python 2 and Python 3.
                            for key in obj:
                                if key in ['AA', 'OpenAction']:
                                    scanObject.addFlag('pdf:nfo:auto_action')
                                # 'JavaScript' is the spelling of the PDF
                                # name; 'Javascript' is kept so existing
                                # matches are unaffected.
                                elif key in ['JS', 'Javascript', 'JavaScript']:
                                    scanObject.addFlag('pdf:nfo:js_embedded')
                        if isinstance(obj, PDFStream):
                            if 'Type' in obj.attrs and obj.attrs[
                                    'Type'] == LIT('EmbeddedFile'):
                                moduleResult.append(
                                    ModuleObject(
                                        buffer=obj.get_data(),
                                        externalVars=ExternalVars(
                                            filename='e_pdf_stream_%s' %
                                            objid)))

                    except PDFObjectNotFound:
                        scanObject.addFlag('pdf:err:missing_object_%s' % objid)
                    except ScanError:
                        raise

        except PSEOF:
            scanObject.addFlag('pdf:err:unexpected_eof')
        except ScanError:
            raise

        return moduleResult
Ejemplo n.º 10
0
    def extract(self):
        """Feed every dict object of ``self._pdfname`` to ``self._extract``.

        Object ids listed by several xref sections are processed once, and
        objects that cannot be found are skipped silently.

        :returns: ``self.comments``.
        """
        seen = set()
        with open(self._pdfname, 'rb') as stream:
            document = PDFDocument(PDFParser(stream), '')

            for table in document.xrefs:
                for oid in table.get_objids():
                    if oid in seen:
                        continue
                    seen.add(oid)
                    try:
                        candidate = document.getobj(oid)
                        if isinstance(candidate, dict):
                            self._extract(oid, candidate)
                    except PDFObjectNotFound:
                        # Best effort: a missing object is simply skipped.
                        pass

        return self.comments
Ejemplo n.º 11
0
def extractembedded(outfp,
                    fname,
                    objids,
                    pagenos,
                    password='',
                    dumpall=False,
                    codec=None,
                    extractdir=None):
    """Write every embedded file of the PDF ``fname`` into ``extractdir``.

    Each dict object whose ``Type`` is ``LITERAL_FILESPEC`` is treated as a
    file specification.  The stream referenced from its ``EF`` entry is
    written to ``extractdir`` as ``<objid zero-padded to 6 digits>-<name>``.

    :param fname: path of the PDF file to read.
    :param password: password handed to ``PDFDocument``.
    :param extractdir: directory that receives the extracted files; it is
        created when missing.
    :param outfp, objids, pagenos, dumpall, codec: unused by this function.
    :raises PDFValueError: if the referenced object is not a ``PDFStream``
        or is not typed as an embedded file.
    :raises IOError: if the target path already exists.
    """
    def extract1(objid, obj):
        # NOTE(review): only the 'F' fallback is decoded.  If 'UF' holds
        # bytes, the '%s' formatting below embeds its repr (b'...') in the
        # file name -- confirm what type 'UF' has here.
        filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
        fileref = obj['EF'].get('UF') or obj['EF'].get('F')
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = 'unable to process PDF: reference for %r is not a ' \
                        'PDFStream' % filename
            raise PDFValueError(error_msg)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError('unable to process PDF: reference for %r '
                                'is not an EmbeddedFile' % (filename))
        path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path)
        # Decode the stream before touching the filesystem so a decoding
        # failure does not leave an empty file behind.
        data = fileobj.get_data()
        target_dir = os.path.dirname(path)
        # os.makedirs('') raises, which happens when extractdir is ''.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(path, 'wb') as out:
            out.write(data)

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                # Already handled via an earlier xref section.
                if objid in extracted_objids:
                    continue
                obj = doc.getobj(objid)
                if isinstance(obj, dict) \
                        and obj.get('Type') is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
Ejemplo n.º 12
0
def extractComments(fp):
    """Collect annotation records from the PDF open in ``fp``.

    Dict objects whose ``Type`` is named ``Page`` are remembered in the
    order they are encountered.  Any other dict object with a ``C`` entry
    is reported, provided it also has ``Subtype``, ``Subj``, ``T`` and
    ``Contents``.

    :param fp: binary file object of the PDF; ``fp.name`` is put into each
        record.
    :returns: list of tuples ``(fp.name, objid, page_index, subtype, Subj,
        T, Contents)``.  ``page_index`` is the 1-based position of the
        object's ``P`` page among the pages seen so far, or ``-1`` when it
        cannot be determined.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")

    visited = set()
    pages = []
    resultList = []

    def extract(objid, obj):
        """Return a record tuple for ``obj``, or None if it yields none."""
        if not isinstance(obj, dict):
            return None
        # getattr tolerates a 'Type' value that has no ``name`` attribute.
        if getattr(obj.get('Type'), 'name', None) == 'Page':
            pages.append(objid)
            return None
        if 'C' not in obj:
            return None
        try:
            pi = pages.index(obj['P'].objid) + 1
        except (KeyError, AttributeError, ValueError):
            # No 'P' entry, not a reference, or page not seen yet.
            pi = -1
        try:
            return (fp.name, objid, pi, obj['Subtype'].name,
                    obj['Subj'], obj['T'], obj['Contents'])
        except (KeyError, AttributeError):
            # if any of the listed entries do not exist, ignore
            return None

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
            except PDFObjectNotFound as e:
                sys.stderr.write('not found: %r\n' % e)
                continue
            if obj is None:
                continue
            r = extract(objid, obj)
            if r:
                resultList.append(r)

    # The collected records were previously built but never handed back.
    return resultList
Ejemplo n.º 13
0
def xmlFromPdf(pdfpath, xmlpath=None):
    """Parse the form XML embedded in the PDF at ``pdfpath``.

    The first stream whose data contains ``b'xfa-template'`` is parsed with
    ``etree.fromstring``.

    :param pdfpath: path of the PDF file to read.
    :param xmlpath: optional path; when given, the pretty-printed tree is
        written there.
    :returns: the parsed tree.
    :raises CrypticXml: if no stream contains the marker.
    """
    with open(pdfpath, 'rb') as stream:
        document = PDFDocument(PDFParser(stream))
        unique_ids = set(oid for table in document.xrefs
                         for oid in table.get_objids())
        payload = None
        for oid in unique_ids:
            candidate = document.getobj(oid)
            if isinstance(candidate, PDFStream):
                blob = candidate.get_data()
                if b'xfa-template' in blob:
                    payload = blob
                    break
        if payload is None:
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # payload == <form>-text.xml
    tree = etree.fromstring(payload)
    if xmlpath is None:
        return tree
    with open(xmlpath, 'wb') as out:
        out.write(etree.tostring(tree, pretty_print=True))
    return tree
Ejemplo n.º 14
0
def dumpallobjs(out: TextIO,
                doc: PDFDocument,
                codec: Optional[str] = None,
                show_fallback_xref: bool = False) -> None:
    """Write every object of ``doc`` plus its trailers to ``out``.

    The output is wrapped in ``<pdf>...</pdf>``.  Each object is emitted
    once, inside its own ``<object id="N">`` element; objects that resolve
    to ``None`` are skipped and objects that cannot be found are reported
    with ``print``.

    :param out: writable text stream.
    :param doc: document whose xref sections are walked.
    :param codec: forwarded to ``dumpxml``.
    :param show_fallback_xref: forwarded to ``dumptrailers``.
    """
    seen = set()
    out.write('<pdf>')
    every_id = (oid for table in doc.xrefs for oid in table.get_objids())
    for oid in every_id:
        if oid not in seen:
            seen.add(oid)
            try:
                target = doc.getobj(oid)
                if target is not None:
                    out.write('<object id="%d">\n' % oid)
                    dumpxml(out, target, codec=codec)
                    out.write('\n</object>\n\n')
            except PDFObjectNotFound as err:
                print('not found: %r' % err)
    dumptrailers(out, doc, show_fallback_xref)
    out.write('</pdf>')
Ejemplo n.º 15
0
def parse_pdf(fname, target_width=None, with_links=False,
              collection_name=None, score_name=None):
    """Extracts the notehead centroid coordinates from the given
    LilyPond-generated PDF file, notehead bounding box coordinates,
    and builds the corresponding CropObjects.

    :param fname: Name of the input PDF file. Assumes it was generated
        from LilyPond with the option

        ``-e"(set-option 'point-and-click '(note-event))"``.

    :param target_width: The width of an image against which
        we want the centroids to be valid. Based on the PDF page
        size(s), the function will compute a ratio by which to scale
        the centroid coordinates from the PDF page, so that after resizing
        the page image to the given target width (without changing its
        aspect ratio), the centroids will correspond to the object
        positions in this resized image. If not given, assumes no resizing
        will take place. (Can deal with a PDF where the pages have different
        sizes.)

    :param with_links: Currently unused by the implementation; the link
        text is always stored (see below).

    :param collection_name: Collection part of the CropObject UIDs;
        defaults to ``CropObject.UID_DEFAULT_DATASET_NAMESPACE``.

    :param score_name: Document part of the CropObject UIDs; defaults to
        ``CropObject.UID_DEFAULT_DOCUMENT_NAMESPACE``. The page number is
        appended to it as ``-P<NN>``.

    :returns: A triplet of per-page dicts: centroids, bounding boxes,
        and CropObjects (MuNG data format for OMR; see the ``muscima``
        package).

        The dict keys are page numbers (``int``) starting from 0.
        The centroid values are numpy arrays of the shape ``(n_notes, 2)``
        holding ``[row, column]`` coordinates; a page without any note
        has no entry in the centroid dict. The bounding box values are
        lists of ``(top, left, bottom, right)`` tuples, and the cropobjects
        are a list of ``CropObject`` instances (all initialized
        with ``clsname="notehead-full"``).

        The CropObject ``data`` attribute has a ``ly_link`` entry holding
        the link text of the annotation.

        Note that the CropObjects' ``objid`` attribute is set
        so that they do not collide across pages. However, CropObjects
        from various pages have the page number added to their
        ``document_name`` component of their UID. Keep this in mind
        if you want to merge them later.

    """
    bboxes = dict()

    cropobjects_per_page = dict()
    _current_objid = 0  # We keep the OBJID
    if collection_name is None:
        collection_name = CropObject.UID_DEFAULT_DATASET_NAMESPACE
    if score_name is None:
        score_name = CropObject.UID_DEFAULT_DOCUMENT_NAMESPACE

    page_no = -1
    page_height, page_width = -1, -1
    target_height = None
    scale = 1.0

    visited = set()

    # The context manager closes the PDF on every exit path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        pages = list(PDFPage.get_pages(fp))

        for xref in doc.xrefs:
            for objid in xref.get_objids():

                if objid in visited:
                    continue
                visited.add(objid)

                try:
                    obj = doc.getobj(objid)
                    if obj is None:
                        continue

                    if obj.__class__ is not dict:
                        continue

                    # Next page
                    if 'Annots' in obj:
                        page_no += 1
                        bboxes[page_no] = []
                        cropobjects_per_page[page_no] = []

                        page = pages[page_no]
                        page_height = int(numpy.round(page.mediabox[3]))
                        page_width = int(numpy.round(page.mediabox[2]))

                        if target_width is not None:
                            scale = float(target_width) / page_width
                            target_height = page_height * scale
                        else:
                            target_height = page_height

                    if 'Rect' not in obj:
                        continue

                    bb_coords = obj['Rect']

                    # Rescaling to target size
                    if target_width is not None:
                        bb_coords = [c * scale for c in bb_coords]

                    link_txt = obj['A']['URI'].decode()

                    # Not a link to a note event!
                    if link_txt.count(':') != 4:
                        continue

                    # Initializing the CropObject. PDF coordinates are
                    # flipped vertically against target_height here.
                    t, l, b, r = target_height - bb_coords[3], bb_coords[0], \
                                 target_height - bb_coords[1], bb_coords[2]
                    t_i, l_i, b_i, r_i = CropObject.bbox_to_integer_bounds(t, l, b, r)
                    h_i, w_i = b_i - t_i, r_i - l_i
                    mask = numpy.ones((h_i, w_i), dtype='uint8')

                    uid = CropObject.build_uid(collection_name,
                                               score_name + '-P{0:02d}'.format(page_no),
                                               _current_objid)
                    logging.debug('Creating CropObject with uid {0}'.format(uid))
                    data = {'ly_link': link_txt}
                    cropobject = CropObject(objid=_current_objid,
                                            clsname='notehead-full',
                                            top=t_i, left=l_i, height=h_i, width=w_i,
                                            mask=mask,
                                            uid=uid,
                                            data=data)
                    cropobjects_per_page[page_no].append(cropobject)
                    _current_objid += 1

                    bboxes[page_no].append((t, l, b, r))

                except PDFObjectNotFound as e:
                    sys.stderr.write('PDF object not found: %r' % e)

    # Compute the centroids once per page after all boxes are known,
    # instead of recomputing the whole page after every single annotation.
    # For centroids, use the original float coordinates.
    centroids = dict()
    for page_idx, page_boxes in bboxes.items():
        if not page_boxes:
            continue
        centroids[page_idx] = numpy.array([
            [(b + t) / 2. - 0.5,
             (l + r) / 2. - 0.5]
            for t, l, b, r in numpy.asarray(page_boxes)])

    return centroids, bboxes, cropobjects_per_page
Ejemplo n.º 16
0
    def makepdf(self, pdfdata1, udct, zeros, sig_attributes):
        """Build the bytes of a signature update computed against ``pdfdata1``.

        The result is the concatenation of the new objects, an xref section
        and a trailer.  Offsets are computed relative to ``len(pdfdata1)``
        and the trailer's ``/Prev`` is the xref position found in
        ``pdfdata1``; the input bytes themselves are not part of the return
        value.

        :param pdfdata1: bytes of the existing PDF.
        :param udct: dict with bytes keys; ``udct[b'name']`` is read here
            for visible signatures and the dict is forwarded to the object
            builders.
        :param zeros: forwarded unchanged to the ``make_*_sig_objs`` helpers
            (presumably the signature placeholder -- confirm against them).
        :param sig_attributes: optional mapping with ``'visibility'`` and,
            for visible signatures, ``'position'``; when falsy, the
            visibility comes from the PDF configuration instead.
        :returns: bytes of the new objects followed by xref and trailer.
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)
        log.info('get datas from pdf')
        # Position of the current xref; written as /Prev in the new trailer.
        prev = document.find_xref(parser)
        info = document.xrefs[0].trailer['Info'].objid
        root = document.xrefs[0].trailer['Root'].objid
        size = document.xrefs[0].trailer['Size']
        page_objid = document.catalog['Pages'].objid
        page = None

        log.info('check sig attributes...')
        # The configured position is the default; it is replaced below only
        # for a visible signature described by sig_attributes.
        position = MyConfigLoader().get_pdf_config()['position']
        if not sig_attributes:
            visibility = MyConfigLoader().get_pdf_config()['visibility']
        else:
            visibility = sig_attributes['visibility']
            log.info(f'the sign is {visibility}')
            if visibility == 'visible':
                position = sig_attributes['position']
                log.info(f'position: {position}')

        page_pos = position['page']
        # 'n' selects the last entry of the page tree's Kids (Count - 1).
        if page_pos == 'n':
            try:
                pages_count = document.getobj(page_objid)['Count']
                page = document.getobj(page_objid)['Kids'][pages_count - 1].objid
            except Exception:
                # NOTE(review): falls back to object id 1 without logging --
                # confirm that object 1 is a valid page object.
                page = int(1)
        else:
            # Any other value is converted with int() and used as a 1-based
            # index into Kids; on failure the first entry is used.
            try:
                page = document.getobj(page_objid)['Kids'][int(page_pos) - 1].objid
            except Exception:
                log.error('page not found...take the first')
                page = document.getobj(page_objid)['Kids'][0].objid

        infodata = self.getdata(pdfdata1, info, prev, document).strip()
        rootdata = self.getdata(pdfdata1, root, prev, document).strip()
        pagedata = self.getdata(pdfdata1, page, prev, document).strip()

        # The first new object number is the existing trailer's /Size value.
        no = size
        # Any signature name already present switches to the "multi" builders.
        multiple_signs = False
        signatures = self.get_signature_names(document)
        if len(signatures) > 0:
            multiple_signs = True

        # new_size is used below both as the number of n<i> offsets to
        # compute and as the increment added to the trailer's /Size.
        if visibility == 'visible':
            rect_array = self.get_rect_array(pagedata, position)
            stream_name = compress(STREAM_WITH_NAME % udct[b'name'])
            if multiple_signs:
                objs = self.make_multi_visible_sig_objs(document, udct, no, page, pagedata, infodata, rootdata, stream_name, rect_array, zeros)
                xref = self.make_multi_visible_xref()
                new_size = 11
            else:
                objs = self.make_visible_sig_objs(udct, no, page, pagedata, infodata, rootdata, stream_name, rect_array, zeros)
                xref = self.make_visible_xref()
                new_size = 13
        else:
            if multiple_signs:
                objs = self.make_multi_inv_sig_objs(document, udct, no, page, pagedata, infodata, rootdata, zeros, len(signatures) + 1)
                xref = self.make_multi_inv_xref()
                new_size = 5
            else:
                objs = self.make_invisible_sig_objs(udct, no, page, pagedata, infodata, rootdata, zeros)
                # NOTE(review): the single-signature invisible branch also
                # uses make_multi_inv_xref() -- confirm this is intended.
                xref = self.make_multi_inv_xref()
                new_size = 5

        pdfdata2 = b''.join(objs)
        # New objects start right after the existing PDF bytes.
        startxref = len(pdfdata1)
        # Values substituted into the xref and trailer templates.  Each
        # offset is located by searching for "\n<num> 0 obj\n"; the +1 skips
        # the leading newline of the match.  When find() returns -1 (object
        # not present) the stored offset degenerates to startxref.
        dct = {
            b'page': page,
            b'no': no,
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            b'size': no + new_size,
            b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }
        # One n<i> offset per new object number no .. no + new_size - 1.
        for i in range(new_size):
            dct.update(({b'n%d' % i: startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + i)) + 1, }))

        trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

        xref = xref % dct
        trailer = trailer % dct

        pdfdata2 = pdfdata2 + xref + trailer

        return pdfdata2
Ejemplo n.º 17
0
    def makepdf(self, pdfdata1, udct, zeros):
        """Build the bytes of a signature update computed against ``pdfdata1``.

        Seven objects are produced: a rewritten first page, copies of the
        info and root objects, an AcroForm dictionary, a FreeText annotation
        that doubles as the signature field, its appearance stream and the
        signature dictionary.  They are followed by an xref section and a
        trailer.  Offsets are computed relative to ``len(pdfdata1)``; the
        input bytes themselves are not part of the return value.

        :param pdfdata1: bytes of the existing PDF.
        :param udct: dict with bytes keys.  Read here: ``b'signature'``,
            ``b'signaturebox'``, ``b'sigflags'``, ``b'contact'``,
            ``b'location'``, ``b'signingdate'`` and ``b'reason'``.
        :param zeros: bytes placed between ``<`` and ``>`` of the signature
            dictionary's ``/Contents``.
        :returns: bytes of the new objects followed by xref and trailer.
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)

        # Position of the current xref; written as /Prev in the new trailer.
        prev = document.find_xref(parser)
        info = document.xrefs[0].trailer['Info'].objid
        root = document.xrefs[0].trailer['Root'].objid
        size = 1
        # calculate last object id, size is only xref size but not count of object in xref
        for ref in document.xrefs:
            if isinstance(ref, PDFXRefStream):
                no = max(ref.ranges, key=operator.itemgetter(1))[1]
            else:
                if len(ref.offsets) == 0:
                    no = 0
                else:
                    no = max(ref.offsets.keys())
            size = max(size, no)
        # The annotation always goes on the first entry of the page tree's Kids.
        page = document.getobj(
            document.catalog['Pages'].objid)['Kids'][0].objid

        nsig, fields = self.getfields(root, document)
        annots = self.getannots(page, document)

        infodata = self.getdata(pdfdata1, info, prev, document)
        rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
        pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

        # Build a FreeText annotation from the caller's text and box; the
        # text defaults to empty and the box to (0, 0, 0, 0).
        annotation = udct.get(b'signature', b'').decode('utf8')
        x1, y1, x2, y2 = udct.get(b'signaturebox', (0, 0, 0, 0))
        annotation = FreeText(
            Location(x1=x1, y1=y1, x2=x2, y2=y2, page=0),
            Appearance(
                fill=[0, 0, 0],
                stroke_width=1,
                wrap_text=True,
                font_size=12,
                content=annotation,
            ),
        )
        pdfa = annotation.as_pdf_object(identity(), page=None)
        pdfar = b'[%d %d %d %d]' % tuple(pdfa.Rect)
        pdfas = pdfa.AP.N.stream.encode('latin1')

        # New object numbers start one past the highest number found above.
        no = size + 1
        # NOTE(review): ``Font.keys()[0]`` below indexes the result of
        # keys(); that fails on a plain Python 3 dict view -- confirm the
        # type returned by the annotation library.
        # Order of objs: rewritten page (with the new annotation appended to
        # /Annots), info copy, root copy pointing at the new AcroForm, the
        # AcroForm dictionary, the annotation/signature field, its
        # appearance stream, and the signature dictionary.
        objs = [
            self.makeobj(page,
                         (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
            self.makeobj(no + 0, infodata),
            self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
            self.makeobj(
                no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
                (fields, no + 3, udct[b'sigflags'])),
            self.makeobj(
                no + 3, b'''
/Type
/Annot
/Subtype
/FreeText
/AP <</N %d 0 R>>
/BS <</S /S /Type /Border /W 0>>
/C []
/Contents (%s)
/DA (0 0 0 rg /%s 12 Tf)
/Rect %s
/F 704
/P %d 0 R
/FT
/Sig
/T(Signature%d)
/V %d 0 R
''' % (no + 4, pdfa.Contents.encode('latin1'),
        pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'), pdfar, page, nsig,
        no + 5)),
            self.makeobj(
                no + 4, b'''
/BBox %s
/FormType 1
/Length %d
/Matrix [1 0 0 1 0 0]
/Resources <</Font <<%s <</BaseFont /Helvetica /Encoding /WinAnsiEncoding /Subtype /Type1 /Type /Font>>>> /ProcSet /PDF>>
/Subtype
/Form
/Type
/XObject
''' % (
                    pdfar,
                    len(pdfas),
                    pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'),
                ), b'stream\n' + pdfas + b'\nendstream\n'),
            self.makeobj(no + 5, (
                b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'], udct[b'signingdate'],
                udct[b'reason'])) + zeros + b'>'),
        ]

        pdfdata2 = b''.join(objs)
        # Two xref subsections: one entry for the rewritten page object and
        # six consecutive entries for the new objects no .. no + 5.
        xref = b'''\
xref\n\
%(page)d 1\n\
%(p0)010d 00000 n \n\
%(no)d 6\n\
%(n0)010d 00000 n \n\
%(n1)010d 00000 n \n\
%(n2)010d 00000 n \n\
%(n3)010d 00000 n \n\
%(n4)010d 00000 n \n\
%(n5)010d 00000 n \n\
'''
        # New objects start right after the existing PDF bytes.
        startxref = len(pdfdata1)
        # Values substituted into the xref and trailer templates.  Each
        # offset is located by searching for "\n<num> 0 obj\n"; the +1 skips
        # the leading newline of the match.
        # NOTE(review): b'size' is the literal 6 and ends up as the
        # trailer's /Size, independent of ``no`` -- confirm this is intended.
        dct = {
            b'page': page,
            b'no': no,
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            b'size': 6,
            b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
            b'n0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 0)) + 1,
            b'n1': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 1)) + 1,
            b'n2': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 2)) + 1,
            b'n3': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 3)) + 1,
            b'n4': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 4)) + 1,
            b'n5': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 5)) + 1,
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }

        trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

        xref = xref % dct
        trailer = trailer % dct

        pdfdata2 = pdfdata2 + xref + trailer

        return pdfdata2
Ejemplo n.º 18
0
    def makepdf(self, pdfdata1, udct, zeros):
        """Build the incremental-update section that adds a signature to a PDF.

        The returned bytes are meant to follow ``pdfdata1`` directly: every
        offset written to the new cross-reference section is computed
        relative to ``len(pdfdata1)``.

        Args:
            pdfdata1: bytes of the existing PDF document.
            udct: signing options keyed by bytes. This method reads
                ``b'sigflags'``, ``b'contact'``, ``b'location'``,
                ``b'signingdate'`` and ``b'reason'``, plus the optional
                ``b'sigpage'`` (index into the ``Kids`` of the catalog's
                ``Pages`` object; out-of-range values fall back to 0). The
                dict is also handed to ``self.makevisualization``.
            zeros: bytes placed between ``<`` and ``>`` in the signature
                dictionary's ``/Contents`` entry.

        Returns:
            bytes: the new objects followed by an ``xref`` section and a
            trailer whose ``/Prev`` points at the previous xref.
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)

        prev = document.find_xref(parser)
        newest_trailer = document.xrefs[0].trailer
        info = newest_trailer['Info'].objid
        root = newest_trailer['Root'].objid

        # Scan every cross-reference section for the highest object id so
        # that the objects added here can be numbered after it.
        last_id = 1
        for ref in document.xrefs:
            if isinstance(ref, PDFXRefStream):
                # NOTE(review): this picks the largest second element among
                # ref.ranges; if those tuples are (start, count) pairs the
                # value is a count rather than an object id - confirm
                # against the parser library.
                candidate = max(ref.ranges, key=operator.itemgetter(1))[1]
            elif ref.offsets:
                candidate = max(ref.offsets.keys())
            else:
                candidate = 0
            last_id = max(last_id, candidate)

        # Pick the page that receives the signature widget.
        kids = document.getobj(document.catalog['Pages'].objid)['Kids']
        sigpage = udct.get(b'sigpage', 0)
        if not 0 <= sigpage <= len(kids) - 1:
            sigpage = 0
        page = kids[sigpage].objid

        nsig, fields = self.getfields(root, document)
        annots = self.getannots(page, document)

        infodata = self.getdata(pdfdata1, info, prev, document)
        rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
        pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

        sig_template = (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]'
            b'/ContactInfo(%s)'
            b'/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)'
            b'/Prop_Build<</App<</Name/>>>>/Reason(%s)'
            b'/SubFilter/adbe.pkcs7.detached/Type/Sig'
            b'/Contents <')

        no = last_id + 1
        visualization, nav = self.makevisualization(no, udct, nsig, page)
        objs = [
            self.makeobj(page,
                         (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
            self.makeobj(no + 0, infodata),
            self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
            self.makeobj(
                no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
                (fields, no + 3, udct[b'sigflags'])),
            visualization,
            self.makeobj(
                nav + 1,
                sig_template % (udct[b'contact'], udct[b'location'],
                                udct[b'signingdate'], udct[b'reason'])
                + zeros + b'>'),
        ]

        # Number of consecutive object ids (no .. nav + 1) listed in the
        # second xref subsection.
        new_count = nav - no + 2
        pdfdata2 = b''.join(objs)
        startxref = len(pdfdata1)

        def offset_of(objid):
            # Position of "<objid> 0 obj" in the combined file; the +1 steps
            # over the newline that the search pattern starts with.
            return startxref + pdfdata2.find(b'\n%d 0 obj\n' % objid) + 1

        xref = b'xref\n%d 1\n%010d 00000 n \n%d %d\n' % (
            page, offset_of(page), no, new_count)
        xref += b''.join(
            b'%010d 00000 n \n' % offset_of(no + i) for i in range(new_count))

        trailer = (
            b'trailer\n'
            b'<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d'
            b'/Root %(root)d 0 R/Size %(size)d>>\n'
            b'startxref\n'
            b'%(startxref)d\n'
            b'%%%%EOF\n'
        ) % {
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            # /Size must be one greater than the highest object number in
            # the file (ISO 32000-1, 7.5.5), not the number of objects added
            # by this update.
            b'size': no + new_count,
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }

        return pdfdata2 + xref + trailer
Ejemplo n.º 19
0
    def makepdf(self, pdfdata1, udct, zeros):
        """Build the incremental-update section that adds a signature to a PDF.

        The signature widget is attached to the first entry of the ``Kids``
        array of the catalog's ``Pages`` object. The returned bytes are meant
        to follow ``pdfdata1`` directly: every offset written to the new
        cross-reference section is computed relative to ``len(pdfdata1)``.

        Args:
            pdfdata1: bytes of the existing PDF document.
            udct: signing options keyed by bytes. Reads ``b'sigflags'``,
                ``b'contact'``, ``b'location'``, ``b'signingdate'`` and
                ``b'reason'``, plus the optional ``b'rectPos'`` (bytes
                inserted into the widget's ``/Rect[...]``; defaults to
                ``b'0 0 0 0'``).
            zeros: bytes placed between ``<`` and ``>`` in the signature
                dictionary's ``/Contents`` entry.

        Returns:
            bytes: the new objects followed by an ``xref`` section and a
            trailer whose ``/Prev`` points at the previous xref.
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)

        prev = document.find_xref(parser)
        newest_trailer = document.xrefs[0].trailer
        info = newest_trailer['Info'].objid
        root = newest_trailer['Root'].objid

        # Scan every cross-reference section for the highest object id so
        # that the objects added here can be numbered after it.
        last_id = 1
        for ref in document.xrefs:
            if isinstance(ref, PDFXRefStream):
                # NOTE(review): this picks the largest second element among
                # ref.ranges; if those tuples are (start, count) pairs the
                # value is a count rather than an object id - confirm
                # against the parser library.
                candidate = max(ref.ranges, key=operator.itemgetter(1))[1]
            elif ref.offsets:
                candidate = max(ref.offsets.keys())
            else:
                candidate = 0
            last_id = max(last_id, candidate)

        page = document.getobj(
            document.catalog['Pages'].objid)['Kids'][0].objid

        nsig, fields = self.getfields(root, document)
        annots = self.getannots(page, document)

        infodata = self.getdata(pdfdata1, info, prev, document)
        rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
        pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

        rectPos = udct.get(b'rectPos', b'0 0 0 0')

        sig_template = (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]'
            b'/ContactInfo(%s)'
            b'/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)'
            b'/Prop_Build<</App<</Name/>>>>/Reason(%s)'
            b'/SubFilter/adbe.pkcs7.detached/Type/Sig'
            b'/Contents <')

        # Six new objects are written with consecutive ids no .. no + 5.
        new_count = 6
        no = last_id + 1
        objs = [
            self.makeobj(page,
                         (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
            self.makeobj(no + 0, infodata),
            self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
            self.makeobj(
                no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
                (fields, no + 3, udct[b'sigflags'])),
            self.makeobj(
                no + 3,
                b'/AP<</N %d 0 R>>/F 132/FT/Sig/P %d 0 R/Rect[%s]/Subtype/Widget/T(Signature%d)/V %d 0 R'
                % (no + 4, page, rectPos, nsig, no + 5)),
            self.makeobj(
                no + 4,
                b'/BBox[0 0 0 0]/Filter/FlateDecode/Length 8/Subtype/Form/Type/XObject',
                b'stream\n\x78\x9C\x03\x00\x00\x00\x00\x01\nendstream\n'),
            self.makeobj(
                no + 5,
                sig_template % (udct[b'contact'], udct[b'location'],
                                udct[b'signingdate'], udct[b'reason'])
                + zeros + b'>'),
        ]

        pdfdata2 = b''.join(objs)
        startxref = len(pdfdata1)

        def offset_of(objid):
            # Position of "<objid> 0 obj" in the combined file; the +1 steps
            # over the newline that the search pattern starts with.
            return startxref + pdfdata2.find(b'\n%d 0 obj\n' % objid) + 1

        # One subsection for the rewritten page object, one for the new ids.
        xref = b'xref\n%d 1\n%010d 00000 n \n%d %d\n' % (
            page, offset_of(page), no, new_count)
        xref += b''.join(
            b'%010d 00000 n \n' % offset_of(no + i) for i in range(new_count))

        trailer = (
            b'trailer\n'
            b'<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d'
            b'/Root %(root)d 0 R/Size %(size)d>>\n'
            b'startxref\n'
            b'%(startxref)d\n'
            b'%%%%EOF\n'
        ) % {
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            # /Size must be one greater than the highest object number in
            # the file (ISO 32000-1, 7.5.5), not the number of objects added
            # by this update.
            b'size': no + new_count,
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }

        return pdfdata2 + xref + trailer
Ejemplo n.º 20
0
    def makepdf(self, pdfdata1, udct, zeros):
        """Build the incremental-update section that adds a signature to a PDF.

        The signature widget is attached to the first entry of the ``Kids``
        array of the catalog's ``Pages`` object, and the new objects are
        numbered starting at the ``Size`` value of the newest trailer. The
        returned bytes are meant to follow ``pdfdata1`` directly: every
        offset written to the new cross-reference section is computed
        relative to ``len(pdfdata1)``.

        Args:
            pdfdata1: bytes of the existing PDF document.
            udct: signing options keyed by bytes. Reads ``b'sigflags'``,
                ``b'contact'``, ``b'location'``, ``b'signingdate'`` and
                ``b'reason'``.
            zeros: bytes placed between ``<`` and ``>`` in the signature
                dictionary's ``/Contents`` entry.

        Returns:
            bytes: the new objects followed by an ``xref`` section and a
            trailer whose ``/Prev`` points at the previous xref.
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)

        prev = document.find_xref(parser)
        newest_trailer = document.xrefs[0].trailer
        info = newest_trailer['Info'].objid
        root = newest_trailer['Root'].objid
        size = newest_trailer['Size']
        page = document.getobj(
            document.catalog['Pages'].objid)['Kids'][0].objid

        infodata = self.getdata(pdfdata1, info, prev, document).strip()
        rootdata = self.getdata(pdfdata1, root, prev, document).strip()
        pagedata = self.getdata(pdfdata1, page, prev, document).strip()

        sig_template = (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]'
            b'/ContactInfo(%s)'
            b'/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)'
            b'/Prop_Build<</App<</Name/>>>>/Reason(%s)'
            b'/SubFilter/adbe.pkcs7.detached/Type/Sig'
            b'/Contents <')

        # Six new objects are written with consecutive ids no .. no + 5.
        new_count = 6
        no = size
        objs = [
            self.makeobj(page, (b'/Annots[%d 0 R]' % (no + 3)) + pagedata),
            self.makeobj(no + 0, infodata),
            self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
            self.makeobj(
                no + 2,
                b'/Fields[%d 0 R]/SigFlags %d' % (no + 3, udct[b'sigflags'])),
            self.makeobj(
                no + 3,
                b'/AP<</N %d 0 R>>/F 132/FT/Sig/P %d 0 R/Rect[0 0 0 0]/Subtype/Widget/T(Signature1)/V %d 0 R'
                % (no + 4, page, no + 5)),
            self.makeobj(
                no + 4,
                b'/BBox[0 0 0 0]/Filter/FlateDecode/Length 8/Subtype/Form/Type/XObject',
                b'stream\n\x78\x9C\x03\x00\x00\x00\x00\x01\nendstream\n'),
            self.makeobj(
                no + 5,
                sig_template % (udct[b'contact'], udct[b'location'],
                                udct[b'signingdate'], udct[b'reason'])
                + zeros + b'>'),
        ]

        pdfdata2 = b''.join(objs)
        startxref = len(pdfdata1)

        def offset_of(objid):
            # Position of "<objid> 0 obj" in the combined file; the +1 steps
            # over the newline that the search pattern starts with.
            return startxref + pdfdata2.find(b'\n%d 0 obj\n' % objid) + 1

        # One subsection for the rewritten page object, one for the new ids.
        xref = b'xref\n%d 1\n%010d 00000 n \n%d %d\n' % (
            page, offset_of(page), no, new_count)
        xref += b''.join(
            b'%010d 00000 n \n' % offset_of(no + i) for i in range(new_count))

        trailer = (
            b'trailer\n'
            b'<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d'
            b'/Root %(root)d 0 R/Size %(size)d>>\n'
            b'startxref\n'
            b'%(startxref)d\n'
            b'%%%%EOF\n'
        ) % {
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            # /Size must be one greater than the highest object number in
            # the file (ISO 32000-1, 7.5.5), not the number of objects added
            # by this update.
            b'size': no + new_count,
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }

        return pdfdata2 + xref + trailer
Ejemplo n.º 21
0
        if 'Type' in obj and obj['Type'].name == 'Page':
            pages.append(objid)
        elif 'C' in obj:
            # pr = obj['P']
            # try:
            #     pi = pages.index(pr.objid)+1
            # except:
            #     pi = -1
            # print(objid,pi, obj['Subj'],obj['T'],obj['Contents'])
            if 'Contents' in obj:
                out = obj['Contents'].decode('latin_1').replace(
                    '\r|\x84', ' ').strip()
                print(out)
        elif 'H' in obj:
            print('Found an H')


# Walk every object id listed in the document's cross-reference sections and
# pass each resolved object to extract(); an id is processed only once even
# if several sections list it.
with open("test.pdf", 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")
    visited = set()
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                extract(objid, obj)
            except PDFObjectNotFound:
                # Report on stderr (the stream used to be passed as a value
                # to print, which wrote its repr to stdout) and say which
                # object id could not be resolved.
                print('not found:', objid, file=sys.stderr)