def parse_page_labels(page_labels: PdfArray, number_pages: int) -> List[str]:
    """Expand a PDF page-label number array into one label per page.

    ``page_labels`` is read as a flat sequence of ``start_index, options``
    pairs.  ``options`` must expose the label dictionary entries as
    attributes:

    * ``S``  - numbering style: ``/D`` decimal, ``/r`` / ``/R`` lower / upper
      case Roman numerals, ``/a`` / ``/A`` lower / upper case letters.
    * ``P``  - optional prefix put in front of every label of the range.
    * ``St`` - optional value of the first number in the range (default 1).

    A range ends where the next pair starts; the last range ends at
    ``number_pages``.  The input array is left untouched.

    Returns the labels of all ranges, concatenated in range order.
    """

    def letter_style(alphabet):
        # Letter styles are 1-based and wrap around by repeating the letter:
        # 1..26 -> a..z, 27..52 -> aa..zz, and so on.
        def convert(number):
            repeats, index = divmod(number - 1, len(alphabet))
            return alphabet[index] * (repeats + 1)
        return convert

    def no_number(number):
        # Without a numbering style the label consists of the prefix only.
        return ""

    formatters = {
        "/D": str,
        "/r": lambda number: to_roman(number).lower(),
        "/R": to_roman,
        "/a": letter_style(ascii_lowercase),
        "/A": letter_style(ascii_uppercase),
    }

    # Work on a copy so the caller's array is not modified; the total page
    # count is the stop position of the final range.
    entries = list(page_labels)
    entries.append(number_pages)

    page_numbers = []
    for i in range(0, len(entries) - 1, 2):
        start, options, stop = entries[i:i + 3]
        page_count = int(stop) - int(start)
        first_number = int(options.St or 1)

        # A missing or unrecognised /S falls back to "no numeric portion"
        # instead of failing.
        formatter = formatters.get(options.S, no_number)

        # NOTE(review): the prefix is concatenated exactly as stored in the
        # dictionary; if it is still in its encoded "(...)" form it may need
        # decoding first - confirm against the PDF library in use.
        prefix = options.P
        has_prefix = prefix is not None and prefix != "()"

        # The range holds ``page_count`` labels starting at ``first_number``,
        # whatever that first number is.
        for number in range(first_number, first_number + page_count):
            label = formatter(number)
            page_numbers.append(prefix + label if has_prefix else label)

    return page_numbers
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Rewrite the PDF stored in a seekable file object with metadata added.

    The file is parsed with ``PdfReader``, then extended with the document
    outline, embedded files, link and file-attachment annotations, the
    document information entries and per-page trim / bleed boxes.  The result
    is written back from position 0 and the file is truncated to the new
    length.
    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    # --- Outline (bookmarks) ---------------------------------------------
    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        outline_items, outline_count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'),
            Count=outline_count,
            First=outline_items[0],
            Last=outline_items[-1])

    # --- Document-level attachments --------------------------------------
    all_attachments = metadata.attachments + (attachments or [])
    if all_attachments:
        name_tree = []
        for item in all_attachments:
            file_spec = _create_pdf_attachment(item, url_fetcher)
            if file_spec is None:
                continue
            name_tree.append(PdfString.encode('attachment'))
            name_tree.append(file_spec)
        if name_tree:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(name_tree)))

    # --- Files referenced by attachment links ----------------------------
    # One link may be split into several rectangles.  Embed each target only
    # once and reuse the resulting object for every rectangle.
    # TODO: once descriptions are supported this is not always right, since
    # two links can share an href but carry different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type != 'attachment' or target in annot_files:
                continue
            # TODO: use the title attribute as description
            annot_files[target] = _create_pdf_attachment(
                (target, None), url_fetcher)

    # --- Per-page annotations --------------------------------------------
    # TODO: independent rectangular annotations are fine for plain links but
    # mediocre for other annotations, and they break for transformed (CSS)
    # or complex (area) link shapes.  Using /AP for all links and merging
    # the shapes that come from one HTML link would feel closer to what
    # browsers do with links spanning several lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Only attachment links have an entry in ``annot_files``; an
            # attachment that could not be embedded is stored as None and is
            # rendered as an ordinary URI link instead.
            file_spec = None
            if link_type == 'attachment':
                file_spec = annot_files[target]

            if file_spec is not None:
                appearance = PdfDict(N=PdfDict(
                    BBox=PdfArray(rectangle),
                    Subtype=PdfName('Form'),
                    Type=PdfName('XObject')))
                # /T is optional in PDF, but evince trips an internal
                # assertion without it.
                annotation = PdfDict(
                    Type=PdfName('Annot'),
                    Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''),
                    Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)),
                    FS=file_spec,
                    AP=appearance)
            else:
                annotation = PdfDict(
                    Type=PdfName('Annot'),
                    Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'),
                        S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'),
                        S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    # --- Document information dictionary ---------------------------------
    info = trailer.Info
    info.Producer = VERSION_STRING

    # Values copied as they are.
    plain_entries = (
        ('title', 'Title'),
        ('description', 'Subject'),
        ('generator', 'Creator'),
    )
    for attr, key in plain_entries:
        plain_value = getattr(metadata, attr)
        if plain_value is not None:
            setattr(info, key, plain_value)

    # Sequences flattened into a comma-separated string.
    joined_entries = (('authors', 'Author'), ('keywords', 'Keywords'))
    for attr, key in joined_entries:
        items = getattr(metadata, attr)
        if items is not None:
            setattr(info, key, ', '.join(items))

    # Dates converted by ``w3c_date_to_pdf``; a None result is skipped.
    date_entries = (('created', 'CreationDate'), ('modified', 'ModDate'))
    for attr, key in date_entries:
        pdf_date = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if pdf_date is not None:
            setattr(info, key, pdf_date)

    # --- Trim and bleed boxes --------------------------------------------
    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (
            float(coordinate) for coordinate in page.MediaBox)

        # Convert pixels into points
        bleed = {
            side: amount * 0.75
            for side, amount in document_page.bleed.items()
        }

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray(
            (trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily place the PDF BleedBox between the CSS bleed box (PDF
        # MediaBox) and the CSS page box (PDF TrimBox), at most 10 points
        # away from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # --- Write the result over the original content ----------------------
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()
def do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations):
    """Wrap the content of one page in PDF Optional Content Groups (layers).

    Parameters:
        basepage: page of the base PDF.  Only touched when ``uses_base_pdf``
            is true: its content stream is wrapped in a 'Background' group
            and the group is registered in its resource properties.
        rmpage: the rendered page.  Its content stream is rewritten in place
            with the marked-content operators for the page, template and
            layer groups, and its ``Resources.Properties`` is replaced.
        i: zero-based page index; used for the 'Page N' group name and to
            select ``annotations[i]``.
        uses_base_pdf: whether the page is drawn over a base PDF rather than
            over a template.
        ocgprop: optional-content properties dictionary; new groups are
            appended to ``ocgprop.OCGs`` and the ordering to
            ``ocgprop.D.Order``.
        annotations: indexed as ``annotations[i][layer_index][0]`` to obtain
            the display name of each layer.

    Returns:
        The array of groups nested under this page's primary group.

    A layer that cannot be matched to an entry in ``annotations`` is logged
    and skipped.
    """
    ocgpage = IndirectPdfDict(Type=PdfName('OCG'), Name='Page ' + str(i + 1))
    ocgprop.OCGs.append(ocgpage)

    # The Order dict is a Page, followed by Inner
    ocgorderinner = PdfArray()

    # Add Template OCG layer
    # If this uses a basepdf, the template is located elsewhere.

    # If using a basepdf, assign its stream as a 'Background' layer under
    # this page.  When the page primary OCG is disabled, the background will
    # remain, making it easy to disable all annotations.
    if uses_base_pdf:
        ocgorigdoc = IndirectPdfDict(Type=PdfName('OCG'), Name='Background')
        ocgprop.OCGs.append(ocgorigdoc)
        ocgorderinner.append(ocgorigdoc)

        uncompress.uncompress([basepage.Contents])
        stream = basepage.Contents.stream
        stream = '/OC /ocgorigdoc BDC\n' \
            + stream \
            + 'EMC\n'
        basepage.Contents.stream = stream
        compress.compress([basepage.Contents])

        # Keep any properties the base page already has.
        if '/Properties' in basepage.Resources:
            props = basepage.Resources.Properties
        else:
            props = PdfDict()
        props.ocgorigdoc = ocgorigdoc
        basepage.Resources.Properties = props

    # If not using a basepdf, assign the rmpage's stream as a 'Template'
    # layer under this page.  It will be affected by disabling the primary
    # Page OCG (which by itself is kind of useless for exported notebooks).

    # Regardless of using a basepdf or not, put the rmpage layers into their
    # own OCGs.

    # If the template has an XObject, we want to skip the first one.  This
    # happens when the template contains a PNG.  Question--what happens when
    # the template contains more than one PNG?  How do we detect all of
    # those?
    template_xobj_keys = []
    vector_layers = []
    uncompress.uncompress([rmpage.Contents])
    if uses_base_pdf:
        # The entire thing is the page ocg
        stream = '/OC /ocgpage BDC\n'
        stream += rmpage.Contents.stream
        stream += 'EMC\n'
        rmpage.Contents.stream = stream
    else:
        stream = rmpage.Contents.stream
        # Mark the template ocg separate from page ocg
        template_endpos = 0
        page_inatpos = 0
        findkey = '1 w 2 J 2 j []0 d\nq\n'
        # Finds only the first instance, which should be for the template.
        findloc = stream.find(findkey)
        if findloc < 0:
            # May be a vector, which we stick a marker in for.
            # ?? Why is this a half-point off ??
            findkey = '799.500000 85 l\n'
            m = re.search(findkey, rmpage.Contents.stream)
            if m:
                findloc = m.start()
        if findloc > 0:
            template_endpos = findloc + len(findkey)
            # Add vector template OCG
            stream = '/OC /ocgtemplate BDC\n'
            stream += rmpage.Contents.stream[:template_endpos]
            stream += 'EMC\n'
            page_inatpos = len(stream)
            stream += rmpage.Contents.stream[template_endpos:]
            # Save stream
            rmpage.Contents.stream = stream

        # Add template ocg
        ocgtemplate = IndirectPdfDict(Type=PdfName('OCG'), Name='Template')
        ocgprop.OCGs.append(ocgtemplate)
        ocgorderinner.append(ocgtemplate)

        # If a template (which is SVG) has embedded PNG images, those appear
        # as XObjects.  This will mess up the layer order, so we will ignore
        # them later.
        # NOTE(review): ``stream`` already carries the BDC prefix here while
        # ``template_endpos`` was measured without it, so the slice stops a
        # little short of the template's end - confirm this is intended.
        template_xobj_keys = \
            re.findall(r'(\/Im[0-9]+)\s', stream[:template_endpos])

        # Page ocg
        stream = rmpage.Contents.stream[:page_inatpos]
        stream += '/OC /ocgpage BDC\n'
        stream += rmpage.Contents.stream[page_inatpos:]
        stream += 'EMC\n'
        # Save stream
        rmpage.Contents.stream = stream

    # Find all other vector layers using the magic point
    # (DocumentPageLayer.render_to_painter()).  Each marker found is
    # replaced by the opening of a new layer group, so the search ends once
    # every marker has been consumed.
    # ?? Why is this a half-point off ??
    while True:
        m = re.search('420.500000 69 m\n', rmpage.Contents.stream)
        if not m:
            break
        layerid = 'ocglayer{}'.format(len(vector_layers) + 1)
        stream = rmpage.Contents.stream[:m.start()]
        if vector_layers:
            # close previous layer
            stream += 'EMC\n'
        stream += '/OC /{} BDC\n'.format(layerid)
        stream += rmpage.Contents.stream[m.end():]
        vector_layers.append(layerid)
        rmpage.Contents.stream = stream
    # If we added vector layers, the last one is still open; close it.
    if vector_layers:
        rmpage.Contents.stream = rmpage.Contents.stream + 'EMC\n'

    # Done--recompress the stream.
    compress.compress([rmpage.Contents])

    # There shouldn't be any Properties there since we generated the rmpage
    # ourselves, so don't bother checking.
    rmpage.Resources.Properties = PdfDict(ocgpage=ocgpage)
    if not uses_base_pdf:
        rmpage.Resources.Properties.ocgtemplate = ocgtemplate

    # Add individual OCG layers (Bitmap)
    was_vector = True
    for n, key in enumerate(rmpage.Resources.XObject):
        if str(key) in template_xobj_keys:
            continue
        was_vector = False
        layer_index = n - len(template_xobj_keys)
        # A failed lookup would indicate a bug in the handling of a
        # notebook: report it and skip the layer rather than abort.
        # (No document identity is available in this function, so only the
        # indices and the annotations are logged.)
        try:
            layer = annotations[i][layer_index]
        except (IndexError, KeyError, TypeError):
            log.error(
                'could not associate XObject with layer: (i, l) ({}, {})'.
                format(i, layer_index))
            log.error(str(annotations))
            continue
        layername = layer[0]
        ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
        ocgprop.OCGs.append(ocg)
        ocgorderinner.append(ocg)
        rmpage.Resources.XObject[key].OC = ocg

    # Add individual OCG layers (Vector)
    if was_vector:
        for layer_index, layerid in enumerate(vector_layers):
            # A failed lookup would indicate a bug in the handling of a
            # notebook: report it and skip the layer rather than abort.
            try:
                layer = annotations[i][layer_index]
            except (IndexError, KeyError, TypeError):
                log.error(
                    'could not associate layerid with layer: (i, l, layerid) ({}, {}, {})'
                    .format(i, layer_index, layerid))
                log.error(str(annotations))
                continue
            layername = layer[0]
            ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
            ocgprop.OCGs.append(ocg)
            ocgorderinner.append(ocg)
            rmpage.Resources.Properties[PdfName(layerid)] = ocg

    # Add order of OCGs to primary document
    ocgprop.D.Order.append(ocgpage)
    ocgprop.D.Order.append(ocgorderinner)

    return ocgorderinner