Python PDFLocConverter Exemples, pdfloc_converter.converter.PDFLocConverter Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : common_representation.py Projet : peci1/AnnotationManager

def pdfloc_to_bboxes(document, annotations):
    """Fill ``annotations.bbox_annotations`` from ``annotations.pdfloc_annotations``.

    A converter is built for ``document.full_path`` and the document is parsed
    once; afterwards every pdfloc annotation is converted and the result is
    appended to ``annotations.bbox_annotations`` in the same order.

    :param document: Object exposing ``full_path``, which is handed to the converter.
    :param annotations: Object exposing ``pdfloc_annotations`` (read) and
        ``bbox_annotations`` (appended to in place).
    """
    converter = PDFLocConverter(document.full_path, pdflocs=annotations.pdfloc_annotations)
    converter.parse_document()

    for pdfloc_pair in annotations.pdfloc_annotations:
        converted_boxes = converter.pdfloc_pair_to_bboxes(pdfloc_pair)
        # Keep the page and comment of the source annotation next to its boxes.
        bounding_boxes = PDFLocBoundingBoxes(
            converted_boxes, pdfloc_pair.start.page, pdfloc_pair.comment
        )
        annotations.bbox_annotations.append(bounding_boxes)

Exemple #2

0

Afficher le fichier

Fichier : pdfloc_to_xy.py Projet : peci1/PDFLocConverter

    def execute_commandline(self, argv):
        """Convert the requested jobs and print highlight annotations as a PDF update.

        Jobs (pdfloc pairs or bounding boxes) come from ``args.jobs`` and, if a
        jobs file was given, are additionally read from it one record at a time.
        Every pdfloc pair is converted to bounding boxes, grouped by page. For
        each such page a replacement page object, an annotation array and one
        ``/Highlight`` annotation per job are generated, followed by an xref
        table and a trailer pointing at the previous xref (``/Prev``). The
        resulting text is printed to stdout.

        Conversion failures (``KeyError``) and unparsable jobs-file records
        (``ValueError``) are reported on stdout and skipped.

        :param argv: Complete argument vector; ``argv[0]`` is ignored.
        :return: Always ``0``.
        """
        # get rid of argv[0], since it only contains the command that was run
        args = self.parse_commandline(argv[1:])

        jobs = deque(self.parse_pdfloc_or_bounding_box_from_string(job) for job in args.jobs)

        pdfloc_jobs = [job for job in jobs if isinstance(job, PDFLocPair)]
        bbox_jobs = [job for job in jobs if isinstance(job, PDFLocBoundingBoxes)]

        # With a jobs file the full job list is not known yet, so the converter
        # is given empty lists (original author's note: the whole document is
        # then parsed in advance).
        has_jobs_file = args.jobs_file is not None
        pdflocs = [] if has_jobs_file else pdfloc_jobs
        bboxes = [] if has_jobs_file else bbox_jobs

        converter = PDFLocConverter(args.filename, pdflocs, bboxes)
        converter.parse_document()

        # NOTE(review): pages are looked up directly in the root /Kids array, so
        # this presumably only works for a flat (non-nested) page tree - confirm.
        pages = converter._pdf_document.catalog['Pages'].resolve()['Kids']

        # Highest object number in use; new objects are numbered after it.
        max_pdf_object_num = 0
        for xref in converter._pdf_document.xrefs:
            if xref.offsets:  # an empty section would make max() raise
                max_pdf_object_num = max(max_pdf_object_num, max(xref.offsets.keys()))

        pdf_update_string = u'\n'

        # page number -> list of PDFLocBoundingBoxes to highlight on that page
        bboxes_result = {}

        # process all jobs
        while True:
            # read more jobs from the input job file if specified and all command-line jobs have been processed
            if not jobs and args.jobs_file is not None and not args.jobs_file.closed:
                stop = False
                while not stop:
                    lines = []
                    while True:  # read lines until empty line or line containing semicolon
                        line = args.jobs_file.readline()
                        if not line:
                            # end of file: close it so it is never read again
                            args.jobs_file.close()
                            stop = True
                            break
                        if line == "\n":
                            break
                        lines.append(line.strip())
                        if ";" in line:
                            break
                    if not lines:
                        continue
                    # if we read something, try to parse it as pdfloc or boundingbox
                    job = ";".join(lines)
                    try:
                        jobs.append(self.parse_pdfloc_or_bounding_box_from_string(job))
                    except ValueError as e:
                        print("Error parsing %s. Cause: %s" % (job, str(e)))
                        # Skip the malformed record and keep reading; leaving the
                        # reader here with an empty queue would end the main loop
                        # and silently drop every remaining record in the file.
                        continue
                    break  # let the loop process the parsed job before we read further

            if not jobs:
                # quit the loop when there are really no more jobs to be done
                break

            job = jobs.popleft()

            try:
                if isinstance(job, PDFLocPair):
                    job_bboxes = PDFLocBoundingBoxes(
                        converter.pdfloc_pair_to_bboxes(job), job.start.page, job.comment)
                    bboxes_result.setdefault(job_bboxes.page, []).append(job_bboxes)
                else:
                    # The converted pdfloc pair is not used in the output; the
                    # call is kept so conversion errors are still reported.
                    converter.bboxes_to_pdfloc_pair(job)
            except KeyError as e:
                print("Error converting %s. Cause: %s" % (job, repr(e)))

        # NOTE(review): hard-coded offset of the previous xref section; it is only
        # valid for one specific input file and should be read from the document.
        previous_startxref = 515742  # TODO
        # NOTE(review): offsets below are computed with len() on a unicode string,
        # i.e. in characters; they presumably drift from byte offsets as soon as a
        # comment contains non-ASCII text - confirm against the output encoding.
        orig_pdf_size = os.path.getsize(args.filename.name)-1
        xref_table = u"xref\n0 1\n0000000000 65535 f \n"

        for page_num, bbox_list in bboxes_result.items():
            page_ref = pages[page_num-1]
            page = page_ref.resolve()

            xref_table += u"%d 1\n%010d 00000 n \n" % (page_ref.objid, len(pdf_update_string)+orig_pdf_size)

            max_pdf_object_num += 1
            annots_objid = max_pdf_object_num

            # Annotations already on the page are kept in the new /Annots array.
            existing_annots_objids = []
            if 'Annots' in page:
                existing_annots_objids = [annot.objid for annot in page['Annots']]

            # Replacement page object. Only the keys listed here are written, so
            # any other entry of the original page dictionary is not carried over.
            media_box = page['MediaBox']
            pdf_update_string += (
                u"%i 0 obj\n<<"
                u"/Type/Page"
                u"/Parent %i 0 R "
                u"/Resources %i 0 R"
                u"/MediaBox [%i %i %i %i]"
                u"/Group<</S/Transparency/CS/DeviceRGB/I true>>"
                u"/Contents %i 0 R"
                u"/Annots %d 0 R"
                u">>\nendobj\n"
            ) % (
                page_ref.objid, page['Parent'].objid, page['Resources'].objid,
                media_box[0], media_box[1], media_box[2], media_box[3],
                page['Contents'].objid, annots_objid,
            )

            # One fresh object number per new highlight annotation.
            annots = list(range(max_pdf_object_num + 1, max_pdf_object_num + 1 + len(bbox_list)))
            max_pdf_object_num += len(bbox_list)

            xref_table += u"%d 1\n%010d 00000 n \n" % (annots_objid, len(pdf_update_string) + orig_pdf_size)

            items_refs = u" 0 R ".join(str(objid) for objid in existing_annots_objids + annots) + u" 0 R"
            pdf_update_string += u"%d 0 obj [%s] endobj\n" % (annots_objid, items_refs)

            for objid, annotation_bboxes in zip(annots, bbox_list):
                xref_table += u"%d 1\n%010d 00000 n \n" % (objid, len(pdf_update_string) + orig_pdf_size)

                comment = annotation_bboxes.comment
                if isinstance(comment, bytes):
                    comment = comment.decode('utf-8')
                # Escape the characters that are special inside a PDF literal
                # string; the backslash must be handled first so the escapes
                # added for the parentheses are not escaped again.
                comment = (comment.replace(u"\\", u"\\\\")
                                  .replace(u"(", u"\\(")
                                  .replace(u")", u"\\)"))

                first_box = annotation_bboxes.bboxes[0].bbox
                last_box = annotation_bboxes.bboxes[-1].bbox

                pdf_update_string += (
                    u"%d 0 obj\n<<"
                    u"/Subtype /Highlight"
                    u"/P %d 0 R"
                    u"/C [1 1 0]"
                    u"/F 4"
                    u"/Contents (%s)"
                    u"/Rect [%d %d %d %d] "
                    u"/QuadPoints ["
                ) % (
                    objid, page_ref.objid, comment,
                    min(first_box[0], last_box[0]), min(first_box[1], last_box[1]),
                    max(first_box[2], last_box[2]), max(first_box[3], last_box[3]),
                )

                for box in annotation_bboxes.bboxes:
                    coords = box.bbox

                    top = min(coords[1], coords[3])
                    bottom = max(coords[1], coords[3])
                    left = min(coords[0], coords[2])
                    right = max(coords[0], coords[2])
                    # NOTE(review): top <= bottom always holds here, so this clamp
                    # can never trigger; the intended height limit is unclear and
                    # the condition is therefore left exactly as it was.
                    if top - bottom > 20:
                        top = bottom + 20

                    pdf_update_string += u"%d %d %d %d %d %d %d %d " % (
                        left, bottom, right, bottom, left, top, right, top
                    )

                pdf_update_string += u"]>>\nendobj\n"

        pdf_update_string += u"\n"

        xref_position = orig_pdf_size + len(pdf_update_string)

        trailer = u"\ntrailer\n<<\n/Size %d /Root %d 0 R /Prev %d\n>>\n" % (
            max_pdf_object_num+1,
            converter._pdf_document.xrefs[0].trailer['Root'].objid,
            previous_startxref,
        )
        pdf_update_string = pdf_update_string + xref_table + trailer

        pdf_update_string += u"startxref\n%d\n%%%%EOF" % xref_position
        print(pdf_update_string)

        return 0

Exemple #3

0

Afficher le fichier

Fichier : common_representation.py Projet : peci1/AnnotationManager

def bboxes_to_pdfloc(document, annotations):
    """Fill ``annotations.pdfloc_annotations`` from ``annotations.bbox_annotations``.

    A converter is built for ``document.full_path`` and the document is parsed
    once; each bounding-box annotation is then converted to a pdfloc pair and
    appended to ``annotations.pdfloc_annotations`` in the same order.

    :param document: Object exposing ``full_path``, which is handed to the converter.
    :param annotations: Object exposing ``bbox_annotations`` (read) and
        ``pdfloc_annotations`` (appended to in place).
    """
    converter = PDFLocConverter(document.full_path, bboxes=annotations.bbox_annotations)
    converter.parse_document()

    for bounding_boxes in annotations.bbox_annotations:
        pdfloc_pair = converter.bboxes_to_pdfloc_pair(bounding_boxes)
        annotations.pdfloc_annotations.append(pdfloc_pair)