def insert_page(
    self, page: Page, index: typing.Optional[int] = None
) -> "Document":  # type: ignore [name-defined]
    """
    Insert a page (possibly taken from another Document) into this Document.

    Any missing scaffolding (/XRef, /Trailer, /Root and /Pages) is created
    first. The page is then placed in /Kids at `index` (or at the end when
    `index` is None), /Count is incremented by one and the page's /Parent
    entry is pointed at the /Pages dictionary.

    :param page:    the page to insert
    :param index:   position in /Kids, None means "after the last page"
    :return:        this Document
    """
    # cross-reference table
    if "XRef" not in self:
        self[Name("XRef")] = PlainTextXREF()
        self["XRef"].set_parent(self)
    xref = self["XRef"]

    # trailer
    if "Trailer" not in xref:
        xref[Name("Trailer")] = Dictionary()
        xref[Name("Size")] = Decimal(0)
        xref["Trailer"].set_parent(xref)
    trailer = xref["Trailer"]

    # catalog
    if "Root" not in trailer:
        trailer[Name("Root")] = Dictionary()
        trailer["Root"].set_parent(trailer)
    root = trailer["Root"]

    # page tree
    if "Pages" not in root:
        root[Name("Pages")] = Dictionary()
        root["Pages"][Name("Count")] = Decimal(0)
        root["Pages"][Name("Kids")] = List()
        root["Pages"][Name("Type")] = Name("Pages")
        root["Pages"].set_parent(root)
        root["Pages"]["Kids"].set_parent(root["Pages"])
    pages = root["Pages"]

    # put the page in /Kids
    kids = pages["Kids"]
    assert kids is not None
    assert isinstance(kids, List)
    position = len(kids) if index is None else index
    kids.insert(position, page)

    # one more page in the tree
    pages[Name("Count")] = Decimal(pages["Count"] + 1)

    # link the page back to the tree
    page[Name("Parent")] = pages
    page.set_parent(kids)  # type: ignore [attr-defined]

    return self
def read_dictionary(self) -> Dictionary:
    """
    Consume the next tokens and build a Dictionary from them.

    The token stream must start with a START_DICT token, followed by
    alternating NAME tokens and objects, and be closed by an END_DICT token.
    Any deviation trips an assertion.
    """
    opening = self.next_non_comment_token()
    assert opening is not None
    assert opening.token_type == TokenType.START_DICT

    result = Dictionary()
    while True:
        key_token = self.next_non_comment_token()
        assert key_token is not None

        # the closing token ends the dictionary
        if key_token.token_type == TokenType.END_DICT:
            return result

        # anything else has to be a key; strip the leading "/"
        assert key_token.token_type == TokenType.NAME
        key = Name(key_token.text[1:])

        # the value is whatever object comes next
        entry = self.read_object()
        assert entry is not None
        result[key] = entry
def _test_document(self, file) -> bool:
    """
    Round-trip a single PDF: load it, stamp the producer, write it back out.

    The result is written to `self.output_dir` as `<stem>_out.pdf`.

    :param file:    path (with a `stem` attribute) of the PDF to process
    :return:        False when the loaded document has no /XRef or no
                    /Trailer, True once the output file has been written
    """
    # create output directory if it does not exist yet
    # (exist_ok avoids the race between a separate exists() check and mkdir())
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # read the document; the file only needs to be opened once
    with open(file, "rb") as pdf_file_handle:
        doc = PDF.loads(pdf_file_handle)

    if "XRef" not in doc:
        return False
    if "Trailer" not in doc["XRef"]:
        return False
    trailer = doc["XRef"]["Trailer"]
    if "Info" not in trailer:
        trailer[Name("Info")] = Dictionary()

    # change producer
    trailer["Info"][Name("Producer")] = String("pText")

    # determine output location
    out_file = self.output_dir / (file.stem + "_out.pdf")
    with open(out_file, "wb") as pdf_file_handle:
        PDF.dumps(pdf_file_handle, doc)

    return True
def _read_trailer(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> Dictionary:
    """
    Read the trailer dictionary that follows a plain-text cross-reference table.

    :param src: the underlying byte source the tokenizer reads from
    :param tok: tokenizer positioned right before the (optional) "trailer" keyword
    :return:    the trailer Dictionary, or an empty Dictionary when the next
                token is not the keyword "trailer"
    """
    keyword = tok.next_non_comment_token()
    assert keyword is not None
    if keyword.text != "trailer":
        # no trailer here
        return Dictionary()

    # "trailer" must be followed by the start of a dictionary
    dictionary_start = tok.next_non_comment_token()
    assert dictionary_start is not None
    assert dictionary_start.token_type == TokenType.START_DICT

    # rewind over the 2 characters "<<" so read_dictionary sees them again
    src.seek(-2, io.SEEK_CUR)
    trailer_dict = tok.read_dictionary()

    # the dictionary must be followed by the keyword "startxref"
    closing_keyword = tok.next_non_comment_token()
    assert closing_keyword is not None
    assert closing_keyword.token_type == TokenType.OTHER
    assert closing_keyword.text == "startxref"

    return trailer_dict
def _get_font_resource_name(self, font: Font, page: Page):
    """
    Return the resource name under which `font` is registered on `page`.

    /Resources and /Resources/Font are created when missing. If an equal font
    is already registered its existing name is returned; otherwise the font is
    registered under a fresh name of the form F<number>.

    :param font:    the font to look up or register
    :param page:    the page whose resources are consulted
    :return:        the Name of the font resource
    """
    # create resources if needed
    if "Resources" not in page:
        page[Name("Resources")] = Dictionary().set_parent(page)  # type: ignore [attr-defined]
    resources = page["Resources"]
    if "Font" not in resources:
        resources[Name("Font")] = Dictionary()
    fonts = resources["Font"]

    # reuse the existing entry if this font is already registered
    for resource_name, registered_font in fonts.items():
        if registered_font == font:
            return resource_name

    # pick a name that is not taken yet; existing resources are not
    # necessarily numbered contiguously (e.g. a page that only has /F2),
    # so len() + 1 alone could overwrite another font
    font_index = len(fonts) + 1
    while ("F%d" % font_index) in fonts:
        font_index += 1
    new_resource_name = Name("F%d" % font_index)
    fonts[new_resource_name] = font
    return new_resource_name
def _get_image_resource_name(self, image: PILImage, page: Page):
    """
    Return the resource name under which `image` is registered on `page`.

    /Resources and /Resources/XObject are created when missing. If an equal
    image is already registered its existing name is returned; otherwise the
    image is registered under a fresh name of the form Im<number>.

    :param image:   the image to look up or register
    :param page:    the page whose resources are consulted
    :return:        the Name of the image resource
    """
    # create resources if needed
    if "Resources" not in page:
        page[Name("Resources")] = Dictionary().set_parent(page)  # type: ignore [attr-defined]
    resources = page["Resources"]
    if "XObject" not in resources:
        resources[Name("XObject")] = Dictionary()
    x_objects = resources["XObject"]

    # reuse the existing entry if this image is already registered
    for resource_name, registered_object in x_objects.items():
        if registered_object == image:
            return resource_name

    # pick a name that is not taken yet; existing resources are not
    # necessarily numbered contiguously (e.g. a page that only has /Im2),
    # so len() + 1 alone could overwrite another XObject
    image_index = len(x_objects) + 1
    while ("Im%d" % image_index) in x_objects:
        image_index += 1
    new_resource_name = Name("Im%d" % image_index)
    x_objects[new_resource_name] = image
    return new_resource_name
def transform(
    self,
    object_to_transform: Any,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write a Document object to the byte stream held by `context`.

    Emits the PDF header, refreshes the trailer's /ID and the /Info entries
    (CreationDate, ModDate, Producer) and then delegates the /XRef object to
    the root transformer.
    """
    assert context is not None
    assert context.destination is not None

    # header: version line followed by a comment line of 4 high bytes
    destination = context.destination
    destination.write(b"%PDF-1.7\n")
    destination.write(b"%")
    destination.write(bytes([226, 227, 207, 211]))
    destination.write(b"\n")

    # invalidate all references
    WritePDFTransformer._invalidate_all_references(object_to_transform)

    trailer = object_to_transform["XRef"]["Trailer"]

    # /Info must exist before it can be updated
    if "Info" not in trailer:
        trailer[Name("Info")] = Dictionary()

    # /ID: a fresh random value; a new array gets it twice, an existing
    # array only has its second element replaced
    random_id = HexadecimalString("%032x" % random.randrange(16**32))
    if "ID" in trailer:
        trailer["ID"][1] = random_id
        trailer["ID"].set_can_be_referenced(False)
    else:
        trailer[Name("ID")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
        trailer["ID"].append(random_id)
        trailer["ID"].append(random_id)

    # dates: CreationDate is only set once, ModDate always
    timestamp = WritePDFTransformer._timestamp_to_str()
    info = trailer["Info"]
    if "CreationDate" not in info:
        info[Name("CreationDate")] = String(timestamp)
    info[Name("ModDate")] = String(timestamp)

    # producer
    info[Name("Producer")] = String("pText")

    # hand the XREF (and with it everything reachable) to the root transformer
    self.get_root_transformer().transform(object_to_transform["XRef"], context)
def test_hash_types(self):
    """
    Smoke test: build a Dictionary and a List of PDF primitives,
    then print the hash of the List.
    """
    sample_dictionary = Dictionary()
    sample_dictionary[Name("Root")] = Reference(object_number=10)
    sample_dictionary[Name("Marked")] = Boolean(True)

    sample_list = List()
    for element in (Name("Red"), Decimal(0.5)):
        sample_list.append(element)

    print(hash(sample_list))
def transform(
    self,
    object_to_transform: AnyPDFType,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write a Dictionary to the byte stream held by `context`.

    Values that are containers (Dictionary, List, Stream, Image, Element) and
    may be referenced are replaced by references in the written dictionary and
    are themselves written afterwards. Returns the dictionary as it was
    written (with references substituted), or None when the object had
    already been written.
    """
    assert isinstance(object_to_transform, Dictionary)
    assert context is not None
    assert context.destination is not None
    assert context.destination

    # an object that was written before is not written again
    object_ref: typing.Optional[Reference] = object_to_transform.get_reference()  # type: ignore [attr-defined]
    if object_ref is not None and object_ref in context.resolved_references:
        assert object_ref is not None
        assert object_ref.object_number is not None
        logger.debug(
            "skip writing object %d %d R (already resolved)"
            % (object_ref.object_number, object_ref.generation_number or 0)
        )
        return

    # split the entries in "written inline" and "written as reference"
    referencable_types = (Dictionary, List, Stream, Image, Element)
    out_value = Dictionary()
    queue: typing.List[AnyPDFType] = []
    for key, value in object_to_transform.items():
        if isinstance(value, referencable_types) and value.can_be_referenced():  # type: ignore [union-attr]
            out_value[key] = self.get_reference(value, context)
            queue.append(value)
        else:
            out_value[key] = value

    # open an indirect object when this dictionary has a reference
    # that was not given a byte offset yet
    started_object = False
    if object_ref is not None:
        assert object_ref.object_number is not None
        if object_ref.object_number is not None and object_ref.byte_offset is None:
            started_object = True
            self.start_object(object_to_transform, context)
        context.resolved_references.append(object_ref)

    # "<<" key value key value ... ">>", entries separated by one space
    write = context.destination.write
    write(bytes("<<", "latin1"))
    last_position = len(out_value) - 1
    for position, (key, value) in enumerate(out_value.items()):
        self.get_root_transformer().transform(key, context)
        write(bytes(" ", "latin1"))
        self.get_root_transformer().transform(value, context)
        if position != last_position:
            write(bytes(" ", "latin1"))
    write(bytes(">>\n", "latin1"))

    # close the indirect object if one was opened
    if started_object:
        self.end_object(object_to_transform, context)

    # now write everything that was replaced by a reference
    for pending in queue:
        self.get_root_transformer().transform(pending, context)

    return out_value
def add_outline(
    self,
    text: str,
    level: int,
    destination_type: DestinationType,
    page_nr: int,
    top: typing.Optional[Decimal] = None,
    right: typing.Optional[Decimal] = None,
    bottom: typing.Optional[Decimal] = None,
    left: typing.Optional[Decimal] = None,
    zoom: typing.Optional[Decimal] = None,
) -> "Document":
    """
    A PDF document may contain a document outline that the conforming reader may display on the screen,
    allowing the user to navigate interactively from one part of the document to another.
    The outline consists of a tree-structured hierarchy of outline items (sometimes called bookmarks),
    which serve as a visual table of contents to display the document’s structure to the user.

    This function adds an outline to this Document.

    The new item becomes the last child of the last existing item at depth
    `level - 1` (the /Outlines dictionary itself counts as depth -1). When the
    outline is still empty, the item is added as the only top-level entry and
    `level` is not consulted.

    :param text:                title of the outline item
    :param level:               depth of the new item, 0 being the top level
    :param destination_type:    the kind of destination; it determines which of
                                top/right/bottom/left/zoom must (not) be given
    :param page_nr:             page number stored in the destination
    :param top:                 top coordinate, where the destination type needs it
    :param right:               right coordinate, where the destination type needs it
    :param bottom:              bottom coordinate, where the destination type needs it
    :param left:                left coordinate, where the destination type needs it
    :param zoom:                zoom factor, where the destination type needs it
    :return:                    this Document
    :raises IndexError:         when no outline item exists at depth `level - 1`
    """
    # build the destination array: page, type, then the type-specific numbers
    destination = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
    destination.append(Decimal(page_nr))
    destination.append(destination_type.value)
    if destination_type == DestinationType.X_Y_Z:
        assert (left is not None and bottom is None and right is None
                and top is not None and zoom is not None)
        destination.append(Decimal(left))
        destination.append(Decimal(top))
        destination.append(Decimal(zoom))
    if destination_type == DestinationType.FIT:
        assert (left is None and bottom is None and right is None
                and top is None and zoom is None)
    if destination_type == DestinationType.FIT_H:
        assert (left is None and bottom is None and right is None
                and top is not None and zoom is None)
        destination.append(Decimal(top))
    if destination_type == DestinationType.FIT_V:
        assert (left is not None and bottom is None and right is None
                and top is None and zoom is None)
        destination.append(Decimal(left))
    if destination_type == DestinationType.FIT_R:
        assert (left is not None and bottom is not None
                and right is not None and top is not None and zoom is None)
        destination.append(Decimal(left))
        destination.append(Decimal(bottom))
        destination.append(Decimal(right))
        destination.append(Decimal(top))
    if destination_type == DestinationType.FIT_B_H:
        assert (left is None and bottom is None and right is None
                and top is not None and zoom is None)
        destination.append(Decimal(top))
    if destination_type == DestinationType.FIT_B_V:
        assert (left is not None and bottom is None and right is None
                and top is None and zoom is None)
        destination.append(Decimal(left))

    # add /Outlines entry in /Root
    root = self["XRef"]["Trailer"]["Root"]
    if "Outlines" not in root:
        outline_dictionary: Dictionary = Dictionary()
        root[Name("Outlines")] = outline_dictionary
        # the parent of /Outlines is the catalog that holds it,
        # not the /Outlines dictionary itself
        outline_dictionary.set_parent(root)  # type: ignore [attr-defined]
        outline_dictionary[Name("Type")] = Name("Outlines")
        outline_dictionary[Name("Count")] = Decimal(0)

    # create entry
    outline = Dictionary()
    outline[Name("Dest")] = destination
    # placeholder that fixes the key order; the real parent is filled in below
    outline[Name("Parent")] = None
    outline[Name("Title")] = String(text)

    # get /Outlines
    outline_dictionary = root["Outlines"]

    # if everything is empty, add the new entry as the only entry
    if "First" not in outline_dictionary or "Last" not in outline_dictionary:
        outline_dictionary[Name("First")] = outline
        outline_dictionary[Name("Last")] = outline
        outline_dictionary[Name("Count")] = Decimal(1)
        outline[Name("Parent")] = outline_dictionary
        return self

    # helper function: the children of an item, following /First and /Next
    def _children(x: Dictionary):
        if "First" not in x:
            return []
        children = [x["First"]]
        while children[-1] != x["Last"]:
            children.append(children[-1]["Next"])
        return children

    # breadth-first walk over the outline tree, recording (depth, item);
    # a cursor into the growing list avoids repeatedly popping the front
    visited: typing.List[typing.Tuple[int, Dictionary]] = [(-1, outline_dictionary)]
    cursor = 0
    while cursor < len(visited):
        depth, item = visited[cursor]
        cursor += 1
        for child in _children(item):
            visited.append((depth + 1, child))

    # find parent: the last item one level above the requested level
    parent = [item for depth, item in visited if depth == level - 1][-1]

    # update sibling-linking (both directions: /Next on the previous last
    # item, /Prev on the new item)
    if "Last" in parent:
        sibling = parent["Last"]
        sibling[Name("Next")] = outline
        outline[Name("Prev")] = sibling

    # update parent-linking
    outline[Name("Parent")] = parent
    if "First" not in parent:
        parent[Name("First")] = outline
    if "Count" not in parent:
        parent[Name("Count")] = Decimal(0)
    parent[Name("Last")] = outline

    # update count on the parent and every ancestor up to /Outlines
    outline_to_update_count = parent
    while outline_to_update_count:
        outline_to_update_count[Name("Count")] = Decimal(
            outline_to_update_count["Count"] + Decimal(1))
        if "Parent" in outline_to_update_count:
            outline_to_update_count = outline_to_update_count["Parent"]
        else:
            break

    return self
def transform(
    self,
    object_to_transform: AnyPDFType,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write an XREF (body objects, cross-reference table and trailer) to the
    byte stream held by `context`.

    The trailer dictionary is assembled up front with references in place of
    /Root and /Info, so that the objects themselves can be written first and
    the trailer ends up last in the file.
    """
    assert isinstance(object_to_transform, XREF)
    assert "Trailer" in object_to_transform
    assert isinstance(object_to_transform["Trailer"], Dictionary)
    assert context is not None
    assert context.destination is not None

    source_trailer = object_to_transform["Trailer"]
    has_info = "Info" in source_trailer

    # trailer as it will be written: references instead of objects
    trailer_out = Dictionary()
    trailer_out[Name("Root")] = self.get_reference(source_trailer["Root"], context)
    if has_info:
        trailer_out[Name("Info")] = self.get_reference(source_trailer["Info"], context)
    if "Size" in source_trailer:
        trailer_out[Name("Size")] = source_trailer["Size"]
    else:
        # we'll recalculate this later anyway
        trailer_out[Name("Size")] = Decimal(0)
    if "ID" in source_trailer:
        trailer_out[Name("ID")] = source_trailer["ID"]

    # body: /Root first, then /Info
    self.get_root_transformer().transform(source_trailer["Root"], context)
    if has_info:
        self.get_root_transformer().transform(source_trailer["Info"], context)

    # cross-reference table: one subsection header per section,
    # followed by one fixed-width entry per reference
    write = context.destination.write
    start_of_xref = context.destination.tell()
    write(bytes("xref\n", "latin1"))
    for section in self._section_xref(context):
        header = "%d %d\n" % (section[0].object_number, len(section))
        write(bytes(header, "latin1"))
        for reference in section:
            if reference.is_in_use:
                entry = "{0:010d} 00000 n\r\n".format(reference.byte_offset)
            else:
                entry = "{0:010d} 00000 f\r\n".format(reference.byte_offset)
            write(bytes(entry, "latin1"))

    # /Size: every indirect object that was written, plus one
    number_of_objects = sum(
        len(objects) for objects in context.indirect_objects_by_hash.values()
    )
    trailer_out[Name("Size")] = Decimal(number_of_objects + 1)

    # trailer
    write(bytes("trailer\n", "latin1"))
    self.get_root_transformer().transform(trailer_out, context)
    write(bytes("startxref\n", "latin1"))

    # byte offset of the cross-reference table that was just written
    write(bytes(str(start_of_xref) + "\n", "latin1"))

    # end-of-file marker
    write(bytes("%%EOF", "latin1"))
def append_embedded_file(self,
                         file_name: str,
                         file_bytes: bytes,
                         apply_compression: bool = True) -> "Document":
    """
    If a PDF file contains file specifications that refer to an external file and the PDF file is archived or transmitted,
    some provision should be made to ensure that the external references will remain valid. One way to do this is to
    arrange for copies of the external files to accompany the PDF file. Embedded file streams (PDF 1.3) address
    this problem by allowing the contents of referenced files to be embedded directly within the body of the PDF
    file. This makes the PDF file a self-contained unit that can be stored or transmitted as a single entity. (The
    embedded files are included purely for convenience and need not be directly processed by any conforming reader.)

    This method embeds a file (specified by its name and bytes) into this Document.
    Nothing is changed when a leaf for `file_name` already exists (replacing it is still a TODO).

    :param file_name:           name under which the file is embedded
    :param file_bytes:          content of the file
    :param apply_compression:   when True the content is stored Flate-compressed
    :return:                    this Document
    """
    assert "XRef" in self
    assert "Trailer" in self["XRef"]
    assert "Root" in self["XRef"]["Trailer"]
    root = self["XRef"]["Trailer"]["Root"]

    # set up /Names dictionary
    if "Names" not in root:
        root[Name("Names")] = Dictionary()
    name_dictionary = root["Names"]

    # set up /EmbeddedFiles
    if "EmbeddedFiles" not in name_dictionary:
        name_dictionary[Name("EmbeddedFiles")] = Dictionary()
        name_dictionary["EmbeddedFiles"][Name("Kids")] = List()

    # find parent: descend (one level at most) into the first kid whose
    # limits are distinct and strictly enclose the file name
    parent = name_dictionary["EmbeddedFiles"]
    if "Kids" in parent:
        for candidate in parent["Kids"]:
            lower_limit = str(candidate["Limits"][0])
            upper_limit = str(candidate["Limits"][1])
            if lower_limit == upper_limit:
                continue
            if lower_limit < file_name < upper_limit:
                parent = candidate
                break

    # a leaf for this file name already exists
    existing_leaves = [
        x for x in parent["Kids"]
        if x["Limits"][0] == x["Limits"][1] == file_name
    ]
    if existing_leaves:
        # TODO: change existing child
        return self

    # add new child
    kid = Dictionary()
    kid[Name("F")] = String(file_name)
    kid[Name("Type")] = Name("Filespec")
    kid[Name("Limits")] = List()
    kid["Limits"].append(String(file_name))
    kid["Limits"].append(String(file_name))

    # build leaf /Names array: file name followed by its file specification
    leaf_names = List()
    leaf_names.append(String(file_name))
    kid[Name("Names")] = leaf_names

    # build actual file stream
    stream = Stream()
    stream[Name("Type")] = Name("EmbeddedFile")
    stream[Name("DecodedBytes")] = file_bytes
    if apply_compression:
        stream[Name("Bytes")] = zlib.compress(stream[Name("DecodedBytes")], 9)
        stream[Name("Filter")] = Name("FlateDecode")
    else:
        stream[Name("Bytes")] = file_bytes
    stream[Name("Length")] = Decimal(len(stream[Name("Bytes")]))

    # build leaf /Filespec dictionary
    file_spec = Dictionary()
    file_spec[Name("EF")] = Dictionary()
    file_spec["EF"][Name("F")] = stream
    file_spec[Name("F")] = String(file_name)
    file_spec[Name("Type")] = Name("Filespec")
    leaf_names.append(file_spec)

    # append
    parent["Kids"].append(kid)

    return self
def add_outline(
    self,
    text: str,
    level: int,
    destination_type: DestinationType,
    page_nr: int,
    top: typing.Optional[Decimal] = None,
    right: typing.Optional[Decimal] = None,
    bottom: typing.Optional[Decimal] = None,
    left: typing.Optional[Decimal] = None,
    zoom: typing.Optional[Decimal] = None,
) -> "Document":
    """
    Add an outline item (bookmark) to this Document.

    The new item becomes the last child of the last existing item at depth
    `level - 1` (the /Outlines dictionary itself counts as depth -1). When the
    outline is still empty, the item is added as the only top-level entry and
    `level` is not consulted.

    :param text:                title of the outline item
    :param level:               depth of the new item, 0 being the top level
    :param destination_type:    the kind of destination; it determines which of
                                top/right/bottom/left/zoom must (not) be given
    :param page_nr:             page number stored in the destination
    :param top:                 top coordinate, where the destination type needs it
    :param right:               right coordinate, where the destination type needs it
    :param bottom:              bottom coordinate, where the destination type needs it
    :param left:                left coordinate, where the destination type needs it
    :param zoom:                zoom factor, where the destination type needs it
    :return:                    this Document
    :raises IndexError:         when no outline item exists at depth `level - 1`
    """
    # build the destination array: page, type, then the type-specific numbers
    destination = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
    destination.append(Decimal(page_nr))
    destination.append(destination_type.value)
    if destination_type == DestinationType.X_Y_Z:
        assert (left is not None and bottom is None and right is None
                and top is not None and zoom is not None)
        destination.append(Decimal(left))
        destination.append(Decimal(top))
        destination.append(Decimal(zoom))
    if destination_type == DestinationType.FIT:
        assert (left is None and bottom is None and right is None
                and top is None and zoom is None)
    if destination_type == DestinationType.FIT_H:
        assert (left is None and bottom is None and right is None
                and top is not None and zoom is None)
        destination.append(Decimal(top))
    if destination_type == DestinationType.FIT_V:
        assert (left is not None and bottom is None and right is None
                and top is None and zoom is None)
        destination.append(Decimal(left))
    if destination_type == DestinationType.FIT_R:
        assert (left is not None and bottom is not None
                and right is not None and top is not None and zoom is None)
        destination.append(Decimal(left))
        destination.append(Decimal(bottom))
        destination.append(Decimal(right))
        destination.append(Decimal(top))
    if destination_type == DestinationType.FIT_B_H:
        assert (left is None and bottom is None and right is None
                and top is not None and zoom is None)
        destination.append(Decimal(top))
    if destination_type == DestinationType.FIT_B_V:
        assert (left is not None and bottom is None and right is None
                and top is None and zoom is None)
        destination.append(Decimal(left))

    # add /Outlines entry in /Root
    root = self["XRef"]["Trailer"]["Root"]
    if "Outlines" not in root:
        outline_dictionary: Dictionary = Dictionary()
        root[Name("Outlines")] = outline_dictionary
        # the parent of /Outlines is the catalog that holds it,
        # not the /Outlines dictionary itself
        outline_dictionary.set_parent(root)  # type: ignore [attr-defined]
        outline_dictionary[Name("Type")] = Name("Outlines")
        outline_dictionary[Name("Count")] = Decimal(0)

    # create entry
    outline = Dictionary()
    outline[Name("Dest")] = destination
    # placeholder that fixes the key order; the real parent is filled in below
    outline[Name("Parent")] = None
    outline[Name("Title")] = String(text)

    # get /Outlines
    outline_dictionary = root["Outlines"]

    # if everything is empty, add the new entry as the only entry
    if "First" not in outline_dictionary or "Last" not in outline_dictionary:
        outline_dictionary[Name("First")] = outline
        outline_dictionary[Name("Last")] = outline
        outline_dictionary[Name("Count")] = Decimal(1)
        outline[Name("Parent")] = outline_dictionary
        return self

    # helper function: the children of an item, following /First and /Next
    def _children(x: Dictionary):
        if "First" not in x:
            return []
        children = [x["First"]]
        while children[-1] != x["Last"]:
            children.append(children[-1]["Next"])
        return children

    # breadth-first walk over the outline tree, recording (depth, item);
    # a cursor into the growing list avoids repeatedly popping the front
    visited: typing.List[typing.Tuple[int, Dictionary]] = [(-1, outline_dictionary)]
    cursor = 0
    while cursor < len(visited):
        depth, item = visited[cursor]
        cursor += 1
        for child in _children(item):
            visited.append((depth + 1, child))

    # find parent: the last item one level above the requested level
    parent = [item for depth, item in visited if depth == level - 1][-1]

    # update sibling-linking (both directions: /Next on the previous last
    # item, /Prev on the new item)
    if "Last" in parent:
        sibling = parent["Last"]
        sibling[Name("Next")] = outline
        outline[Name("Prev")] = sibling

    # update parent-linking
    outline[Name("Parent")] = parent
    if "First" not in parent:
        parent[Name("First")] = outline
    if "Count" not in parent:
        parent[Name("Count")] = Decimal(0)
    parent[Name("Last")] = outline

    # update count on the parent and every ancestor up to /Outlines
    outline_to_update_count = parent
    while outline_to_update_count:
        outline_to_update_count[Name("Count")] = Decimal(
            outline_to_update_count["Count"] + Decimal(1))
        if "Parent" in outline_to_update_count:
            outline_to_update_count = outline_to_update_count["Parent"]
        else:
            break

    return self
def transform(
    self,
    object_to_transform: AnyPDFType,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write a Stream to the byte stream held by `context`.

    When the stream carries "DecodedBytes", the payload is regenerated from
    them: written as-is at compression level 0, Flate-compressed otherwise.
    Because the payload no longer matches the encoding the stream came with,
    the original /Filter and /DecodeParms are not copied in that case;
    /Filter is set to /FlateDecode when the payload is compressed, and
    /Length is recomputed.

    When only "Bytes" are available they are written untouched, together with
    the stream's original /Filter and /DecodeParms (the bytes are still
    encoded, so the filter information has to stay, whatever the compression
    level is).

    Dictionary entries that are containers and may be referenced are replaced
    by references and written after the stream itself.
    """
    assert context is not None
    assert context.destination is not None
    assert isinstance(object_to_transform, Stream)

    # avoid resolving objects twice
    object_ref: typing.Optional[Reference] = object_to_transform.get_reference()  # type: ignore [attr-defined]
    if object_ref is not None and object_ref in context.resolved_references:
        assert object_ref is not None
        assert object_ref.object_number is not None
        logger.debug(
            "skip writing object %d %d R (already resolved)"
            % (object_ref.object_number, object_ref.generation_number or 0)
        )
        return

    # start object if needed
    started_object = False
    if object_ref is not None:
        assert object_ref.object_number is not None
        if object_ref.object_number is not None and object_ref.byte_offset is None:
            started_object = True
            self.start_object(object_to_transform, context)
        context.resolved_references.append(object_ref)

    # the payload is re-encoded only when the decoded bytes are available
    payload_is_regenerated = "DecodedBytes" in object_to_transform

    # build stream dictionary
    stream_dictionary = Dictionary()

    # objects to turn into reference
    queue: typing.List[AnyPDFType] = []
    for k, v in object_to_transform.items():
        if k in ["Bytes", "DecodedBytes"]:
            continue
        # stale encoding information must not describe a regenerated payload
        if payload_is_regenerated and k in ["Filter", "DecodeParms"]:
            continue
        if isinstance(v, (Dictionary, List, Stream)) and v.can_be_referenced():  # type: ignore [union-attr]
            stream_dictionary[k] = self.get_reference(v, context)
            queue.append(v)
        else:
            stream_dictionary[k] = v

    # handle compression
    if payload_is_regenerated:
        if context.compression_level == 0:
            # raw payload, hence no /Filter at all
            bts = object_to_transform["DecodedBytes"]
        else:
            bts = zlib.compress(
                object_to_transform["DecodedBytes"], context.compression_level
            )
            stream_dictionary[Name("Filter")] = Name("FlateDecode")
        stream_dictionary[Name("Length")] = pDecimal(len(bts))
    else:
        assert "Bytes" in object_to_transform
        bts = object_to_transform["Bytes"]

    # write stream dictionary
    self.get_root_transformer().transform(stream_dictionary, context)

    # write "stream"
    context.destination.write(bytes("stream\n", "latin1"))

    # write bytes
    context.destination.write(bts)

    # write "endstream"
    context.destination.write(bytes("\nendstream\n", "latin1"))

    # end object if needed
    if started_object:
        self.end_object(object_to_transform, context)

    # write everything that was replaced by a reference
    for e in queue:
        self.get_root_transformer().transform(e, context)
def true_type_font_from_file(path_to_font_file: Path) -> "TrueTypeFont":
    """
    This function returns the PDF TrueTypeFont object for a given TTF file.

    The font name is taken from the font's name table (platformID 3,
    nameID 1), reduced to its ASCII letters. Glyphs that are reachable through
    the font's best cmap are given the character codes 0, 1, 2, ... in glyph
    order; /Widths, /FirstChar, /LastChar and /Encoding/Differences all follow
    that numbering. The font file itself is embedded (Flate-compressed) as
    /FontDescriptor/FontFile2.

    :param path_to_font_file:   path of an existing file ending in ".ttf"
    :return:                    the TrueTypeFont dictionary
    """
    assert path_to_font_file.exists()
    assert path_to_font_file.name.endswith(".ttf")

    font_file_bytes: typing.Optional[bytes] = None
    with open(path_to_font_file, "rb") as ffh:
        font_file_bytes = ffh.read()
    assert font_file_bytes

    # read file
    ttf_font_file = TTFont(path_to_font_file)

    # build font
    font: TrueTypeFont = TrueTypeFont()
    font_name: str = str(
        [
            x for x in ttf_font_file["name"].names
            if x.platformID == 3 and x.nameID == 1
        ][0].string,
        "latin1",
    )
    font_name = "".join(
        [x for x in font_name if x.lower() in "abcdefghijklmnopqrstuvwxyz"]
    )

    font[Name("Name")] = Name(font_name)
    font[Name("BaseFont")] = Name(font_name)

    # glyph name -> lowest code point that maps to it;
    # a font without a usable cmap simply yields no glyphs
    cmap: typing.Optional[typing.Dict[int, str]] = ttf_font_file.getBestCmap()
    cmap_reverse: typing.Dict[str, int] = {}
    for k, v in (cmap or {}).items():
        cmap_reverse[v] = min(cmap_reverse.get(v, k), k)
    glyph_order: typing.List[str] = [
        x for x in ttf_font_file.glyphOrder if x in cmap_reverse
    ]

    # build widths
    units_per_em: pDecimal = pDecimal(ttf_font_file["head"].unitsPerEm)
    if cmap is not None:
        font[Name("FirstChar")] = pDecimal(0)
        # /Widths holds (LastChar - FirstChar + 1) entries, so with codes
        # starting at 0 the last code is len - 1
        font[Name("LastChar")] = pDecimal(max(len(glyph_order) - 1, 0))
        font[Name("Widths")] = List()
        # the glyph set does not change between glyphs, fetch it once
        glyph_set = ttf_font_file.getGlyphSet()
        for glyph_name in glyph_order:
            w: pDecimal = (
                pDecimal(glyph_set[glyph_name].width) / units_per_em
            ) * pDecimal(1000)
            w = pDecimal(round(w, 2))
            font["Widths"].append(w)

    font[Name("FontDescriptor")] = Dictionary()
    font["FontDescriptor"][Name("Type")] = Name("FontDescriptor")
    font["FontDescriptor"][Name("FontName")] = String(font_name)
    font["FontDescriptor"][Name("FontStretch")] = Name("Normal")  # TODO
    font["FontDescriptor"][Name("FontWeight")] = pDecimal(400)  # TODO
    font["FontDescriptor"][Name("Flags")] = pDecimal(4)  # TODO
    font["FontDescriptor"][Name("FontBBox")] = List().set_can_be_referenced(  # type: ignore [attr-defined]
        False
    )  # TODO
    for _ in range(0, 4):
        font["FontDescriptor"]["FontBBox"].append(pDecimal(0))

    # fmt: off
    font["FontDescriptor"][Name("ItalicAngle")] = pDecimal(ttf_font_file["post"].italicAngle)
    font["FontDescriptor"][Name("Ascent")] = pDecimal(pDecimal(ttf_font_file["hhea"].ascent) / units_per_em * Decimal(1000))
    font["FontDescriptor"][Name("Descent")] = pDecimal(pDecimal(ttf_font_file["hhea"].descent) / units_per_em * Decimal(1000))
    font["FontDescriptor"][Name("CapHeight")] = pDecimal(0)  # TODO
    font["FontDescriptor"][Name("StemV")] = pDecimal(0)  # TODO
    # fmt: on

    # encoding: code i is the i-th glyph of glyph_order
    font[Name("Encoding")] = Dictionary()
    font["Encoding"][Name("BaseEncoding")] = Name("WinAnsiEncoding")
    font["Encoding"][Name("Differences")] = List()
    for i in range(0, len(glyph_order)):
        font["Encoding"]["Differences"].append(pDecimal(i))
        font["Encoding"]["Differences"].append(Name(glyph_order[i]))

    # embed font file
    compressed_font_file_bytes = zlib.compress(font_file_bytes, 9)
    font_stream: Stream = Stream()
    font_stream[Name("Type")] = Name("Font")
    font_stream[Name("Subtype")] = Name("TrueType")
    # /Length is the size of the encoded data, /Length1 that of the decoded font file
    font_stream[Name("Length")] = pDecimal(len(compressed_font_file_bytes))
    font_stream[Name("Length1")] = pDecimal(len(font_file_bytes))
    font_stream[Name("Filter")] = Name("FlateDecode")
    font_stream[Name("DecodedBytes")] = font_file_bytes
    font_stream[Name("Bytes")] = compressed_font_file_bytes

    font["FontDescriptor"][Name("FontFile2")] = font_stream

    # return
    return font
def decode_stream(s: Stream) -> Stream:
    """
    Decode a Stream by applying, in order, the filters listed in its /Filter
    entry to its "Bytes"; the result is stored on the stream as "DecodedBytes".

    Supported filters are FlateDecode (also as "Fl"), ASCII85Decode, LZWDecode
    and RunLengthDecode; any other filter trips an assertion. A stream without
    /Filter gets its "Bytes" copied over unchanged.

    :param s:   the Stream to decode (must contain "Bytes")
    :return:    the same Stream, with "DecodedBytes" set
    """
    assert isinstance(s, Stream)
    assert "Bytes" in s

    # /Filter is either a single name or an array of names
    filters: typing.List[str] = []
    if "Filter" in s:
        filter_entry = s["Filter"]
        filters = filter_entry if isinstance(filter_entry, List) else [filter_entry]

    # /DecodeParms mirrors /Filter; without it every filter gets empty parameters
    decode_params: typing.List[Dictionary] = []
    if "DecodeParms" not in s:
        decode_params = [Dictionary() for _ in filters]
    elif isinstance(s["DecodeParms"], List):
        decode_params = s["DecodeParms"]
    else:
        assert s["DecodeParms"] is not None
        assert isinstance(s["DecodeParms"], Dictionary)
        decode_params = [s["DecodeParms"]]

    # run the bytes through each filter in turn
    transformed_bytes = s["Bytes"]
    for filter_index, filter_name in enumerate(filters):
        if filter_name in ["FlateDecode", "Fl"]:
            transformed_bytes = FlateDecode.decode(
                bytes_in=transformed_bytes,
                columns=int(decode_params[filter_index].get("Columns", Decimal(1))),
                predictor=int(decode_params[filter_index].get("Predictor", Decimal(1))),
                bits_per_component=int(
                    decode_params[filter_index].get("BitsPerComponent", Decimal(8))
                ),
            )
        elif filter_name in ["ASCII85Decode"]:
            transformed_bytes = ASCII85Decode.decode(transformed_bytes)
        elif filter_name in ["LZWDecode"]:
            transformed_bytes = LZWDecode.decode(transformed_bytes)
        elif filter_name in ["RunLengthDecode"]:
            transformed_bytes = RunLengthDecode.decode(transformed_bytes)
        else:
            assert False, "Unknown /Filter %s" % filter_name

    s[Name("DecodedBytes")] = transformed_bytes
    return s
def read(
    self,
    io_source: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tokenizer: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    This method attempts to read a stream XREF from the given io_source.
    It will either throw an exception, or return this XREF.

    The cross-reference stream object is read through the tokenizer, its
    /W, /Size and (optional) /Index entries are validated, the stream data is
    decoded and every entry is turned into a Reference that is appended to
    this XREF. Finally the entries of the stream dictionary are copied into
    self["Trailer"].

    :param io_source:       the source to read from
    :param tokenizer:       the tokenizer used to read the stream object
    :param initial_offset:  byte offset of the cross-reference stream; when
                            None, self._seek_to_xref_token is used to find it
    :return:                self
    :raises AssertionError: if the object read is not a Stream, or if its
                            /W, /Size or /Index entries are malformed, or if
                            an entry has a type other than 0, 1 or 2
    """
    # position the source on the cross-reference stream
    if initial_offset is not None:
        io_source.seek(initial_offset)
    else:
        self._seek_to_xref_token(io_source, tokenizer)

    xref_stream = tokenizer.read_object()
    assert isinstance(xref_stream, Stream)

    # check widths
    assert "W" in xref_stream
    assert all(isinstance(w, Decimal) for w in xref_stream["W"])

    # decode widths (number of bytes used by each of the three fields)
    widths = [int(w) for w in xref_stream["W"]]

    # parent
    document = self.get_root()  # type: ignore [attr-defined]

    # list of references, seeded with the entry for object 0
    indirect_references = [
        Reference(
            object_number=0,
            generation_number=65535,
            is_in_use=False,
            document=document,
        )
    ]

    # check size
    assert "Size" in xref_stream
    assert isinstance(xref_stream["Size"], Decimal)

    # get size
    number_of_objects = int(xref_stream["Size"])

    # index: pairs of (first object number, number of entries)
    if "Index" in xref_stream:
        index = xref_stream["Index"]
        assert isinstance(index, List)
        assert len(index) % 2 == 0
        assert isinstance(index[0], Decimal)
        assert isinstance(index[1], Decimal)
    else:
        index = [Decimal(0), Decimal(number_of_objects)]

    # apply filters
    xref_stream = decode_stream(xref_stream)
    xref_stream_decoded_bytes = xref_stream["DecodedBytes"]

    def read_field(offset: int, width: int):
        """Read a big-endian unsigned integer, return (value, next offset)."""
        value = 0
        for _ in range(0, width):
            value = (value << 8) + (xref_stream_decoded_bytes[offset] & 0xFF)
            offset += 1
        return value, offset

    # read every range specified in /Index
    # The entries of all subsections are stored back to back in the stream
    # data, so the byte pointer must NOT be reset for each subsection.
    bptr = 0
    for idx in range(0, len(index), 2):
        start = int(index[idx])
        length = int(index[idx + 1])

        for i in range(0, length):

            # object number
            object_number = start + i

            # read type (defaults to 1 when the first field has width 0)
            entry_type = 1
            if widths[0] > 0:
                entry_type, bptr = read_field(bptr, widths[0])

            # read field 2
            field2, bptr = read_field(bptr, widths[1])

            # read field 3
            field3, bptr = read_field(bptr, widths[2])

            # check type
            assert entry_type in [0, 1, 2]

            if entry_type == 0:
                # type   : The type of this entry, which shall be 0. Type 0 entries define
                #          the linked list of free objects (corresponding to f entries in a
                #          cross-reference table).
                # field2 : The object number of the next free object
                # field3 : The generation number to use if this object number is used again
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                    is_in_use=False,
                )
            elif entry_type == 1:
                # type   : The type of this entry, which shall be 1. Type 1 entries define
                #          objects that are in use but are not compressed (corresponding
                #          to n entries in a cross-reference table).
                # field2 : The byte offset of the object, starting from the beginning of the
                #          file.
                # field3 : The generation number of the object. Default value: 0.
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                )
            else:
                # type   : The type of this entry, which shall be 2. Type 2 entries define
                #          compressed objects.
                # field2 : The object number of the object stream in which this object is
                #          stored. (The generation number of the object stream shall be
                #          implicitly 0.)
                # field3 : The index of this object within the object stream.
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    generation_number=0,
                    parent_stream_object_number=field2,
                    index_in_parent_stream=field3,
                )
            assert pdf_indirect_reference is not None

            # look up the first reference already collected for this object
            wanted_object_number = Decimal(object_number)
            existing_indirect_ref = next(
                (
                    x
                    for x in indirect_references
                    if x.object_number is not None
                    and x.object_number == wanted_object_number
                ),
                None,
            )
            ref_is_in_reading_state = (
                existing_indirect_ref is not None
                and existing_indirect_ref.is_in_use
                and existing_indirect_ref.generation_number
                == pdf_indirect_reference.generation_number
            )
            ref_is_first_encountered = existing_indirect_ref is None or (
                not ref_is_in_reading_state
                and existing_indirect_ref.document is None
            )

            # append a new reference, or complete the one already collected;
            # an entry matching neither case is dropped
            if ref_is_first_encountered:
                indirect_references.append(pdf_indirect_reference)
            elif ref_is_in_reading_state:
                assert existing_indirect_ref is not None
                existing_indirect_ref.index_in_parent_stream = (
                    pdf_indirect_reference.index_in_parent_stream
                )
                existing_indirect_ref.parent_stream_object_number = (
                    pdf_indirect_reference.parent_stream_object_number
                )

    # add section
    for r in indirect_references:
        self.append(r)

    # initialize trailer
    self[Name("Trailer")] = Dictionary()
    trailer = self[Name("Trailer")]
    for k, v in xref_stream.items():
        trailer[k] = v
    # the trailer hangs off this XREF (it used to be made its own parent)
    trailer.set_parent(self)

    # return
    return self
def __deepcopy__(self, memodict=None) -> "Font":
    """
    This method returns a copy of this Font.

    The copy is created by self._empty_copy() and then filled entry by entry:
    the well-known font entries are rebuilt explicitly (Lists and
    Dictionaries are re-created, their leaf values are reused as they are),
    and every entry that was not handled explicitly is copied with
    copy.deepcopy.

    :param memodict:    the memo dictionary handed over by copy.deepcopy;
                        a fresh one is created when it is omitted
    :return:            the copied Font
    """
    # A mutable default would be shared (and filled up) across calls.
    if memodict is None:
        memodict = {}

    out: Font = self._empty_copy()

    def copy_flat_list(key: str) -> None:
        """Re-create the List stored under key, reusing its elements."""
        out[Name(key)] = List()
        for element in self[key]:
            out[key].append(element)

    # Type
    out[Name("Type")] = Name("Font")

    # BaseFont
    out[Name("BaseFont")] = Name(str(self["BaseFont"]))

    # FirstChar
    if "FirstChar" in self:
        out[Name("FirstChar")] = self["FirstChar"]

    # LastChar
    if "LastChar" in self:
        out[Name("LastChar")] = self["LastChar"]

    # Widths
    if "Widths" in self:
        copy_flat_list("Widths")

    # FontDescriptor
    if "FontDescriptor" in self:
        out[Name("FontDescriptor")] = self._copy_font_descriptor(
            self["FontDescriptor"]
        )

    # Encoding
    if "Encoding" in self:
        encoding = self["Encoding"]

        # Name
        if isinstance(encoding, Name):
            out[Name("Encoding")] = Name(str(encoding))

        # Dictionary
        elif isinstance(encoding, Dictionary):
            out[Name("Encoding")] = Dictionary()
            out["Encoding"][Name("Type")] = Name("Encoding")
            if "BaseEncoding" in encoding:
                out["Encoding"][Name("BaseEncoding")] = Name(
                    str(encoding["BaseEncoding"])
                )
            if "Differences" in encoding:
                differences = List()
                for difference in encoding["Differences"]:
                    differences.append(difference)
                out["Encoding"][Name("Differences")] = differences

    # ToUnicode
    if "ToUnicode" in self:
        # pass the memo along, as required for nested deepcopy calls
        out[Name("ToUnicode")] = copy.deepcopy(self["ToUnicode"], memodict)

    # FontBBox
    if "FontBBox" in self:
        copy_flat_list("FontBBox")

    # FontMatrix
    if "FontMatrix" in self:
        copy_flat_list("FontMatrix")

    # CharProcs and Resources have no dedicated handling,
    # they are covered by the default copy at the end

    # CIDSystemInfo
    if "CIDSystemInfo" in self:
        cid_system_info = self["CIDSystemInfo"]
        out[Name("CIDSystemInfo")] = Dictionary()
        out["CIDSystemInfo"][Name("Registry")] = cid_system_info["Registry"]
        out["CIDSystemInfo"][Name("Ordering")] = cid_system_info["Ordering"]
        out["CIDSystemInfo"][Name("Supplement")] = cid_system_info["Supplement"]

    # DW
    if "DW" in self:
        out[Name("DW")] = self["DW"]

    # W (a mix of numbers and nested Lists of numbers)
    if "W" in self:
        out[Name("W")] = List()
        for entry in self["W"]:
            if isinstance(entry, pDecimal):
                out["W"].append(entry)
            if isinstance(entry, List):
                nested_widths = List()
                for width in entry:
                    nested_widths.append(width)
                out["W"].append(nested_widths)

    # DescendantFonts (only the first descendant is copied)
    if "DescendantFonts" in self:
        out[Name("DescendantFonts")] = List()
        out["DescendantFonts"].append(
            self["DescendantFonts"][0].__deepcopy__(memodict)
        )

    # DW2
    if "DW2" in self:
        copy_flat_list("DW2")

    # W2 and CIDToGIDMap have no dedicated handling,
    # they are covered by the default copy below

    # default: deep-copy every entry that was not handled above
    for k, v in self.items():
        if k not in out:
            out[k] = copy.deepcopy(v, memodict)

    # return
    return out