def _create_pdf_attachment(attachment, url_fetcher):
    """Build the PDF ``/Filespec`` dictionary describing an attachment.

    ``attachment`` is accepted in three shapes: a ``(url, description)``
    tuple (the form used for attachments coming from document links such as
    <link> or <a>, which can only be URLs), an :class:`Attachment` instance,
    or any other value, which is handed to ``Attachment(guess=...)``.

    :return:
        the ``/Filespec`` :class:`PdfDict`, or :obj:`None` when reading the
        attachment raised :class:`URLFetchingError` (the error is logged).

    """
    try:
        if isinstance(attachment, tuple):
            link_url, link_description = attachment
            attachment = Attachment(
                url=link_url,
                url_fetcher=url_fetcher,
                description=link_description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_, source, url, _):
            # Raw bytes are wrapped so the helper always gets a file object.
            stream = io.BytesIO(source) if isinstance(source, bytes) else source
            pdf_file_object = _create_compressed_file_object(stream)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    return PdfDict(
        Type=PdfName('Filespec'),
        F=PdfString.encode(''),
        UF=PdfString.encode(_get_filename_from_result(url, None)),
        EF=PdfDict(F=pdf_file_object),
        Desc=PdfString.encode(attachment.description or ''))
def create_bookmarks(bookmarks, pages, parent=None):
    """Recursively turn a bookmark tree into linked PDF outline items.

    Each entry of ``bookmarks`` is a ``(label, target, children)`` triple
    where ``target[0]`` indexes ``pages`` and ``target[1]`` / ``target[2]``
    are used as the coordinates of an ``XYZ`` destination.

    :param parent:
        the outline item owning this level, or :obj:`None` at the top level.
    :return:
        a ``(items, count)`` pair: the outline items created for this level
        and the number of items at this level plus all of their descendants.

    """
    total = len(bookmarks)
    siblings = []
    for label, target, children in bookmarks:
        destination = (
            pages[target[0]].indirect,
            PdfName('XYZ'),
            target[1],
            target[2],
            0,
        )
        title = PdfString.encode(label)
        goto_action = PdfDict(
            Type=PdfName('Action'),
            S=PdfName('GoTo'),
            D=PdfArray(destination))
        item = PdfDict(Title=title, A=goto_action)
        item.indirect = True

        descendants, descendant_count = create_bookmarks(
            children, pages, parent=item)
        item.Count = 1 + descendant_count

        # Chain this item to the previously created sibling, if any.
        if siblings:
            previous = siblings[-1]
            item.Prev = previous
            previous.Next = item
        if descendants:
            item.First = descendants[0]
            item.Last = descendants[-1]
        if parent is not None:
            item.Parent = parent

        total += descendant_count
        siblings.append(item)
    return siblings, total
def encode(self, value):
    """Encode ``value`` with ``PdfString.encode`` and cross-check the result.

    The same value is also encoded with the type-specific constructor
    (``PdfString.from_unicode`` for text, ``PdfString.from_bytes``
    otherwise) and both results are asserted equal before the first one is
    returned.
    """
    encoded = PdfString.encode(value)
    text_type = type(u'')
    if isinstance(value, text_type):
        expected = PdfString.from_unicode(value)
    else:
        expected = PdfString.from_bytes(value)
    self.assertEqual(encoded, expected)
    return encoded
def transPdfString(v, translator):
    """Return a translated copy of a literal PDF string, or ``None``.

    A value is handled only when it is a ``PdfString`` whose first
    character is ``(`` and whose decoded content starts with the two
    characters FE FF.  In that case the content is decoded as UTF-16BE
    (errors ignored), passed to ``translator``, and the result is encoded
    back to UTF-16BE and wrapped in a new ``PdfString``.  Every other input
    yields ``None``.

    NOTE(review): calling ``.decode`` on the already decoded content looks
    like Python 2 byte-string handling -- confirm before running on Python 3.
    """
    if not isinstance(v, PdfString):
        return None
    if v[0] != "(":
        return None

    raw = v.decode()
    # The FE FF prefix is used as the marker instead of chardet detection.
    if not raw.startswith("\xfe\xff"):
        return None

    translated = translator(raw.decode("utf-16be", "ignore"))
    encoded = PdfString.encode(translated.encode("utf-16be"))
    return PdfString(encoded)
def pdfobjs(self):
    """Build the pair to insert in the ``PageLabels.Nums`` entry of a PDF.

    Returns a 2-tuple: ``self.startpage`` wrapped in a ``PdfObject``, and a
    ``PdfDict`` whose ``S`` entry comes from ``styles[self.style]``.  The
    ``P`` (prefix) and ``St`` (first page number) entries are only added
    when the corresponding attribute differs from its value in ``defaults``.
    """
    start = PdfObject(self.startpage)
    label = PdfDict(S=styles[self.style])

    prefix = self.prefix
    if prefix != defaults["prefix"]:
        label.P = PdfString.encode(prefix)

    first_number = self.firstpagenum
    if first_number != defaults["firstpagenum"]:
        label.St = PdfObject(first_number)

    return (start, label)
def pdfobjs(self):
    """Return the two elements to insert in a PDF's ``PageLabels.Nums``.

    The first element is ``PdfObject(self.startpage)``; the second is a
    ``PdfDict`` holding the style (``S``), plus the prefix (``P``) and the
    first page number (``St``) whenever those differ from ``defaults``.
    """
    index_object = PdfObject(self.startpage)
    options = PdfDict(S=styles[self.style])

    # Entries equal to their default are left out of the dictionary.
    uses_default_prefix = self.prefix == defaults["prefix"]
    if not uses_default_prefix:
        options.P = PdfString.encode(self.prefix)

    uses_default_first_page = self.firstpagenum == defaults["firstpagenum"]
    if not uses_default_first_page:
        options.St = PdfObject(self.firstpagenum)

    return index_object, options
def transPdfString(v, translator):
    """Translate the text of a literal ``PdfString``; return ``None`` otherwise.

    The translation only happens when ``v`` is a ``PdfString`` starting
    with ``(`` and its decoded content begins with the characters FE FF.
    The content is then decoded as UTF-16BE (ignoring errors), given to
    ``translator``, re-encoded as UTF-16BE and returned as a new
    ``PdfString``.

    NOTE(review): ``.decode`` is called on the decoded content, which
    suggests Python 2 byte strings -- confirm against the target runtime.
    """
    is_literal = isinstance(v, PdfString) and v[0] == "("
    if is_literal:
        content = v.decode()
        # Prefix check used in place of chardet-based encoding detection.
        if content.startswith("\xfe\xff"):
            text = content.decode("utf-16be", "ignore")
            new_text = translator(text)
            new_value = PdfString.encode(new_text.encode("utf-16be"))
            return PdfString(new_value)
    return None
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Rewrite the PDF held in ``fileobj`` with metadata added.

    The PDF is read back from the start of the seekable ``fileobj`` and
    enriched with outlines (bookmarks), embedded files, link and
    file-attachment annotations, ``/Info`` entries and per-page ``TrimBox``
    / ``BleedBox`` values.  The result is written over the same file
    object from position 0, and the file is truncated after the new
    content.

    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    # --- Outlines -----------------------------------------------------------
    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        outline_items, outline_count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'),
            Count=outline_count,
            First=outline_items[0],
            Last=outline_items[-1])

    # --- Document-level embedded files --------------------------------------
    all_attachments = metadata.attachments + (attachments or [])
    if all_attachments:
        names = []
        for item in all_attachments:
            file_spec = _create_pdf_attachment(item, url_fetcher)
            if file_spec is None:
                # Attachments that could not be created are skipped.
                continue
            names.extend((PdfString.encode('attachment'), file_spec))
        if names:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(names)))

    # A single link can be split into multiple regions. We don't want to
    # embed a file multiple times, so keep a reference to every embedded URL
    # and reuse the object.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, _ in page_links:
            if link_type != 'attachment' or target in annot_files:
                continue
            # TODO: use the title attribute as description
            annot_files[target] = _create_pdf_attachment(
                (target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular
    # annotations works well for pure links, but rather mediocre for other
    # annotations and fails completely for transformed (CSS) or complex link
    # shapes (area). It would be better to use /AP for all links and coalesce
    # link shapes that originate from the same HTML link. This would give a
    # feeling similar to what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Only attachment links have an entry in ``annot_files``; an
            # entry of None (failed attachment) falls back to a plain link.
            embedded = annot_files[target] if link_type == 'attachment' else None
            if embedded is None:
                annotation = PdfDict(
                    Type=PdfName('Annot'),
                    Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'),
                        S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'),
                        S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            else:
                appearance = PdfDict(N=PdfDict(
                    BBox=PdfArray(rectangle),
                    Subtype=PdfName('Form'),
                    Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(
                    Type=PdfName('Annot'),
                    Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''),
                    Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)),
                    FS=embedded,
                    AP=appearance)
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    # --- /Info dictionary ---------------------------------------------------
    trailer.Info.Producer = VERSION_STRING

    plain_fields = (
        ('title', 'Title'),
        ('description', 'Subject'),
        ('generator', 'Creator'),
    )
    for attribute, info_key in plain_fields:
        plain_value = getattr(metadata, attribute)
        if plain_value is not None:
            setattr(trailer.Info, info_key, plain_value)

    list_fields = (('authors', 'Author'), ('keywords', 'Keywords'))
    for attribute, info_key in list_fields:
        if getattr(metadata, attribute) is not None:
            joined = ', '.join(getattr(metadata, attribute))
            setattr(trailer.Info, info_key, joined)

    date_fields = (('created', 'CreationDate'), ('modified', 'ModDate'))
    for attribute, info_key in date_fields:
        pdf_date = w3c_date_to_pdf(getattr(metadata, attribute), attribute)
        if pdf_date is not None:
            setattr(trailer.Info, info_key, pdf_date)

    # --- Page boxes ---------------------------------------------------------
    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)

        # Convert pixels into points
        bleed = {
            side: size * 0.75 for side, size in document_page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray(
            (trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox)
        # and CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_box = (
            trim_left - min(10, bleed['left']),
            trim_top - min(10, bleed['top']),
            trim_right + min(10, bleed['right']),
            trim_bottom + min(10, bleed['bottom']),
        )
        page.BleedBox = PdfArray(bleed_box)

    # --- Write the updated document back in place ---------------------------
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()