def pdfloc_to_bboxes(document, annotations):
    """Append bounding boxes for every PDFLoc annotation to `annotations`.

    A `PDFLocConverter` is built for `document.full_path` with the PDFLoc
    annotations as its `pdflocs`, the document is parsed, and one
    `PDFLocBoundingBoxes` per entry of `annotations.pdfloc_annotations` is
    appended to `annotations.bbox_annotations`.
    """
    pdfloc_pairs = annotations.pdfloc_annotations
    converter = PDFLocConverter(document.full_path, pdflocs=pdfloc_pairs)
    converter.parse_document()

    for pair in annotations.pdfloc_annotations:
        boxes = converter.pdfloc_pair_to_bboxes(pair)
        converted = PDFLocBoundingBoxes(boxes, pair.start.page, pair.comment)
        annotations.bbox_annotations.append(converted)
def execute_commandline(self, argv):
    """Process the conversion jobs from the command line and print a PDF update.

    Jobs are taken from `args.jobs` and, once those are exhausted, read one at
    a time from `args.jobs_file` (if given). In the file a job is the run of
    lines up to a blank line or up to (and including) a line containing a
    semicolon; the stripped lines are joined with ";" before parsing.

    `PDFLocPair` jobs are converted to bounding boxes and grouped by page;
    any other job is passed through `converter.bboxes_to_pdfloc_pair` and its
    result is discarded. Finally, text is printed to stdout that contains,
    for every page with results, a replacement page object, an annotation
    array and one /Highlight annotation per job, followed by an xref table,
    a trailer and a `startxref` record.

    :param argv: Full argument vector; `argv[0]` (the command) is dropped.
    :return: Always 0.
    :raises ValueError: If no `startxref` record can be found at the end of
        the original PDF (needed for the trailer's /Prev entry).

    NOTE(review): object offsets are computed from the character length of
    the generated text, so they are presumably only right while the printed
    output is one byte per character -- confirm for non-ASCII comments.
    """

    def _escape_pdf_string(text):
        # Escape the characters that are special inside a PDF literal string.
        # The backslash goes first so the escapes added afterwards are not
        # themselves doubled.
        return (text.replace(u"\\", u"\\\\")
                    .replace(u"(", u"\\(")
                    .replace(u")", u"\\)"))

    def _read_previous_startxref(pdf_path):
        # Return the offset stored after the last "startxref" keyword found
        # in the final bytes of the file at `pdf_path`.
        tail_size = 2048
        with open(pdf_path, 'rb') as pdf_file:
            pdf_file.seek(0, os.SEEK_END)
            pdf_file.seek(max(0, pdf_file.tell() - tail_size))
            tail = pdf_file.read()
        marker = b'startxref'
        marker_pos = tail.rfind(marker)
        tokens = tail[marker_pos + len(marker):].split() if marker_pos >= 0 else []
        if not tokens:
            raise ValueError("No startxref record found at the end of %s" % pdf_path)
        return int(tokens[0])

    # get rid of argv[0], since it only contains the command that was run
    args = self.parse_commandline(argv[1:])

    jobs = deque(self.parse_pdfloc_or_bounding_box_from_string(job) for job in args.jobs)

    # With a jobs file the converter is given no jobs up front (the original
    # comment says the whole document is then parsed in advance).
    if args.jobs_file is None:
        pdflocs = [job for job in jobs if isinstance(job, PDFLocPair)]
        bboxes = [job for job in jobs if isinstance(job, PDFLocBoundingBoxes)]
    else:
        pdflocs = []
        bboxes = []

    converter = PDFLocConverter(args.filename, pdflocs, bboxes)
    converter.parse_document()

    pdf_document = converter._pdf_document
    # NOTE(review): pages are looked up by index in the root /Kids array,
    # which presumably assumes a flat page tree -- confirm.
    page_refs = pdf_document.catalog['Pages'].resolve()['Kids']

    # New objects are numbered after the highest object number in any xref.
    max_pdf_object_num = 0
    for xref in pdf_document.xrefs:
        max_pdf_object_num = max(max_pdf_object_num, max(xref.offsets.keys()))

    pdf_update_string = u'\n'
    bboxes_result = {}

    # process all jobs
    while True:
        # read more jobs from the input job file if specified and all
        # command-line jobs have been processed
        if not jobs and args.jobs_file is not None and not args.jobs_file.closed:
            stop = False
            while not stop:
                lines = []
                while True:
                    # read lines until empty line or line containing semicolon
                    line = args.jobs_file.readline()
                    if not line:
                        # end of file
                        args.jobs_file.close()
                        stop = True
                        break
                    if line == "\n":
                        break
                    lines.append(line.strip())
                    if ";" in line:
                        break

                if lines:
                    # if we read something, try to parse it as pdfloc or boundingbox
                    job = ";".join(lines)
                    try:
                        jobs.append(self.parse_pdfloc_or_bounding_box_from_string(job))
                    except ValueError as e:
                        # NOTE(review): as in the original, a job that fails
                        # to parse leaves the queue empty, which ends the
                        # main loop below -- confirm this is intended.
                        print("Error parsing %s. Cause: %s" % (job, str(e)))
                    # let the main loop process the job before reading further
                    break

        if not jobs:
            # quit the loop when there are really no more jobs to be done
            break

        job = jobs.popleft()
        try:
            if isinstance(job, PDFLocPair):
                job_bboxes = PDFLocBoundingBoxes(
                    converter.pdfloc_pair_to_bboxes(job), job.start.page, job.comment)
                bboxes_result.setdefault(job_bboxes.page, []).append(job_bboxes)
            else:
                # The result is not used; the call is kept so conversion
                # errors are still reported.
                converter.bboxes_to_pdfloc_pair(job)
        except KeyError as e:
            print("Error converting %s. Cause: %s" % (job, repr(e)))

    pdf_path = args.filename.name
    # /Prev of the new trailer must reference the original file's last xref
    # section, so read it from the file instead of hard-coding it.
    previous_startxref = _read_previous_startxref(pdf_path)
    # NOTE(review): offsets are computed as if the update (which starts with
    # a newline) began at the last byte of the original file -- confirm.
    orig_pdf_size = os.path.getsize(pdf_path) - 1

    xref_entry = u"%d 1\n%010d 00000 n \n"
    xref_table = u"xref\n0 1\n0000000000 65535 f \n"

    for page_num, bbox_list in bboxes_result.items():
        page_ref = page_refs[page_num - 1]
        page = page_ref.resolve()

        # Replacement page object that points to the new annotation array.
        xref_table += xref_entry % (page_ref.objid, orig_pdf_size + len(pdf_update_string))

        max_pdf_object_num += 1
        annots_objid = max_pdf_object_num

        existing_annots_objids = []
        if 'Annots' in page:
            existing_annots_objids = [annot.objid for annot in page['Annots']]

        media_box = page['MediaBox']
        pdf_update_string += (
            u"%i 0 obj\n<<"
            u"/Type/Page"
            u"/Parent %i 0 R "
            u"/Resources %i 0 R"
            u"/MediaBox [%i %i %i %i]"
            u"/Group<</S/Transparency/CS/DeviceRGB/I true>>"
            u"/Contents %i 0 R"
            u"/Annots %d 0 R"
            u">>\nendobj\n"
        ) % (
            page_ref.objid,
            page['Parent'].objid,
            page['Resources'].objid,
            media_box[0], media_box[1], media_box[2], media_box[3],
            page['Contents'].objid,
            annots_objid,
        )

        # One fresh object number per new annotation on this page.
        annots = list(range(max_pdf_object_num + 1,
                            max_pdf_object_num + 1 + len(bbox_list)))
        max_pdf_object_num += len(bbox_list)

        # Annotation array: the page's existing annotations plus the new ones.
        xref_table += xref_entry % (annots_objid, orig_pdf_size + len(pdf_update_string))
        items_refs = u" 0 R ".join(
            str(objid) for objid in existing_annots_objids + annots) + u" 0 R"
        pdf_update_string += u"%d 0 obj [%s] endobj\n" % (annots_objid, items_refs)

        for objid, annotation_bboxes in zip(annots, bbox_list):
            xref_table += xref_entry % (objid, orig_pdf_size + len(pdf_update_string))

            comment = annotation_bboxes.comment
            if isinstance(comment, bytes):
                # Only byte strings need decoding; text is used as it is.
                comment = comment.decode('utf-8')
            comment = _escape_pdf_string(comment)

            first_box = annotation_bboxes.bboxes[0].bbox
            last_box = annotation_bboxes.bboxes[-1].bbox
            pdf_update_string += (
                u"%d 0 obj\n<<"
                u"/Subtype /Highlight"
                u"/P %d 0 R"
                u"/C [1 1 0]"
                u"/F 4"
                u"/Contents (%s)"
                u"/Rect [%d %d %d %d] "
                u"/QuadPoints ["
            ) % (
                objid,
                page_ref.objid,
                comment,
                min(first_box[0], last_box[0]),
                min(first_box[1], last_box[1]),
                max(first_box[2], last_box[2]),
                max(first_box[3], last_box[3]),
            )

            for box in annotation_bboxes.bboxes:
                coords = box.bbox
                left = min(coords[0], coords[2])
                right = max(coords[0], coords[2])
                y_low = min(coords[1], coords[3])
                y_high = max(coords[1], coords[3])
                # NOTE(review): the original had a height clamp here
                # ("if top - bottom > 20") that could never trigger because
                # its "top" was the smaller y value; it has been dropped
                # without changing the output.
                pdf_update_string += u"%d %d %d %d %d %d %d %d " % (
                    left, y_high, right, y_high, left, y_low, right, y_low)

            pdf_update_string += u"]>>\nendobj\n"

    pdf_update_string += u"\n"

    # The xref table starts right after everything generated so far.
    xref_position = orig_pdf_size + len(pdf_update_string)
    pdf_update_string += xref_table
    pdf_update_string += u"\ntrailer\n<<\n/Size %d /Root %d 0 R /Prev %d\n>>\n" % (
        max_pdf_object_num + 1,
        pdf_document.xrefs[0].trailer['Root'].objid,
        previous_startxref,
    )
    pdf_update_string += u"startxref\n%d\n%%%%EOF" % xref_position

    print(pdf_update_string)

    return 0
def bboxes_to_pdfloc(document, annotations):
    """Append a PDFLoc pair for every bounding-box annotation to `annotations`.

    A `PDFLocConverter` is built for `document.full_path` with the bounding
    box annotations as its `bboxes`, the document is parsed, and the result
    of `bboxes_to_pdfloc_pair` for each entry of
    `annotations.bbox_annotations` is appended to
    `annotations.pdfloc_annotations`.
    """
    bbox_groups = annotations.bbox_annotations
    converter = PDFLocConverter(document.full_path, bboxes=bbox_groups)
    converter.parse_document()

    for group in annotations.bbox_annotations:
        pdfloc_pair = converter.bboxes_to_pdfloc_pair(group)
        annotations.pdfloc_annotations.append(pdfloc_pair)