Esempi in Python per Dictionary, esempi in Python per ptext.io.read.types.Dictionary

Esempio n. 1

0

Mostra file

 def insert_page(
     self,
     page: Page,
     index: typing.Optional[int] = None
 ) -> "Document":  # type: ignore [name-defined]
     """
     Insert a page (possibly taken from another Document) into this Document.

     Any missing link of the /XRef -> /Trailer -> /Root -> /Pages chain is
     created first. The page is then placed in /Kids at `index` (at the end
     when `index` is None), /Count is increased by one and the page is linked
     back to the /Pages dictionary. Returns this Document.
     """
     # make sure the cross-reference table exists
     if "XRef" not in self:
         self[Name("XRef")] = PlainTextXREF()
         self[Name("XRef")].set_parent(self)
     xref = self["XRef"]

     # make sure the trailer exists
     if "Trailer" not in xref:
         xref[Name("Trailer")] = Dictionary()
         xref[Name("Size")] = Decimal(0)
         xref["Trailer"].set_parent(xref)
     trailer = xref["Trailer"]

     # make sure the document catalog (/Root) exists
     if "Root" not in trailer:
         trailer[Name("Root")] = Dictionary()
         trailer["Root"].set_parent(trailer)
     root = trailer["Root"]

     # make sure the page tree (/Pages) exists, starting out empty
     if "Pages" not in root:
         root[Name("Pages")] = Dictionary()
         fresh_pages = root[Name("Pages")]
         fresh_pages[Name("Count")] = Decimal(0)
         fresh_pages[Name("Kids")] = List()
         fresh_pages[Name("Type")] = Name("Pages")
         fresh_pages.set_parent(root)
         fresh_pages["Kids"].set_parent(fresh_pages)
     pages = root["Pages"]

     # put the page into /Kids
     kids = pages["Kids"]
     assert kids is not None
     assert isinstance(kids, List)
     position = len(kids) if index is None else index
     kids.insert(position, page)

     # one more page in the tree
     pages[Name("Count")] = Decimal(pages["Count"] + 1)

     # link the page back to the page tree
     page[Name("Parent")] = pages
     page.set_parent(kids)  # type: ignore [attr-defined]

     return self

Esempio n. 2

0

Mostra file

File: high_level_tokenizer.py Progetto: tieugene/ptext-release

    def read_dictionary(self) -> Dictionary:
        """
        Read the next tokens and return them as a Dictionary.

        The next non-comment token must open a dictionary (START_DICT). After
        that, name/value pairs are read until the END_DICT token is found.
        An AssertionError is raised when the input ends early, when a key is
        not a NAME token, or when read_object() yields no value.
        """
        token = self.next_non_comment_token()
        assert token is not None
        assert token.token_type == TokenType.START_DICT

        out_dict = Dictionary()
        while True:

            # the next token is either the end of the dictionary or a key
            token = self.next_non_comment_token()
            assert token is not None
            if token.token_type == TokenType.END_DICT:
                break
            assert token.token_type == TokenType.NAME

            # the key is the token text without its first character
            name = Name(token.text[1:])

            # read the value that belongs to the key
            value = self.read_object()
            assert value is not None

            # a freshly constructed Name is never None, so store unconditionally
            out_dict[name] = value

        return out_dict

Esempio n. 3

0

Mostra file

    def _test_document(self, file) -> bool:
        """
        Load a PDF, set its /Info /Producer entry and write the result out.

        The document is read from `file`; its trailer gets an /Info dictionary
        when it has none, /Producer is set to "pText", and the document is
        written to `<file stem>_out.pdf` inside `self.output_dir`.

        Returns False when the loaded document has no /XRef or the /XRef has
        no /Trailer, True once the output file has been written.
        """
        # create output directory if it does not exist yet
        if not self.output_dir.exists():
            self.output_dir.mkdir()

        # read the input document (a single open is enough)
        with open(file, "rb") as pdf_file_handle:
            doc = PDF.loads(pdf_file_handle)

        if "XRef" not in doc:
            return False
        if "Trailer" not in doc["XRef"]:
            return False

        if "Info" not in doc["XRef"]["Trailer"]:
            doc["XRef"]["Trailer"][Name("Info")] = Dictionary()

        # change producer
        doc["XRef"]["Trailer"]["Info"][Name("Producer")] = String("pText")

        # determine output location
        out_file = self.output_dir / (file.stem + "_out.pdf")
        with open(out_file, "wb") as pdf_file_handle:
            PDF.dumps(pdf_file_handle, doc)

        return True

Esempio n. 4

0

Mostra file

File: plaintext_xref.py Progetto: tieugene/ptext-release

    def _read_trailer(
        self,
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
    ) -> Dictionary:
        """
        Read the trailer dictionary at the tokenizer's current position.

        Returns an empty Dictionary when the next token is not the keyword
        "trailer". Otherwise returns the dictionary that follows the keyword,
        after asserting that it is in turn followed by "startxref".
        """
        # without the "trailer" keyword there is nothing to read
        keyword = tok.next_non_comment_token()
        assert keyword is not None
        if keyword.text != "trailer":
            return Dictionary()

        # the keyword must be followed by the start of a dictionary
        opening = tok.next_non_comment_token()
        assert opening is not None
        assert opening.token_type == TokenType.START_DICT

        # step back over the 2 chars "<<" so read_dictionary() reads them again
        src.seek(-2, io.SEEK_CUR)
        trailer_dict = tok.read_dictionary()

        # the dictionary must be followed by the "startxref" keyword
        closing = tok.next_non_comment_token()
        assert closing is not None
        assert closing.token_type == TokenType.OTHER
        assert closing.text == "startxref"

        return trailer_dict

Esempio n. 5

0

Mostra file

File: paragraph.py Progetto: gbtami/ptext-release

    def _get_font_resource_name(self, font: Font, page: Page):
        """
        Return the name under which `font` is stored in the page's /Resources /Font.

        Missing /Resources and /Font dictionaries are created. When an equal
        font is already stored its existing name is returned; otherwise the
        font is stored under "F<n>", with n one more than the number of
        stored fonts, and that name is returned.
        """
        # make sure /Resources and /Resources/Font exist
        if "Resources" not in page:
            page[Name("Resources")] = Dictionary().set_parent(page)  # type: ignore [attr-defined]
        resources = page["Resources"]
        if "Font" not in resources:
            resources[Name("Font")] = Dictionary()
        fonts = resources["Font"]

        # reuse the name of an equal font that is already stored
        known_names = [key for key, stored in fonts.items() if stored == font]
        if known_names:
            return known_names[0]

        # store the font under a new name
        label = "F%d" % (len(fonts) + 1)
        fonts[Name(label)] = font
        return Name(label)

Esempio n. 6

0

Mostra file

File: image.py Progetto: tieugene/ptext-release

    def _get_image_resource_name(self, image: PILImage, page: Page):
        """
        Return the name under which `image` is stored in the page's /Resources /XObject.

        Missing /Resources and /XObject dictionaries are created. When an
        equal image is already stored its existing name is returned;
        otherwise the image is stored under "Im<n>", with n one more than the
        number of stored XObjects, and that name is returned.
        """
        # make sure /Resources and /Resources/XObject exist
        if "Resources" not in page:
            page[Name("Resources")] = Dictionary().set_parent(
                page)  # type: ignore [attr-defined]
        resources = page["Resources"]
        if "XObject" not in resources:
            resources[Name("XObject")] = Dictionary()
        xobjects = resources["XObject"]

        # reuse the name of an equal image that is already stored
        known_names = [key for key, stored in xobjects.items() if stored == image]
        if known_names:
            return known_names[0]

        # store the image under a new name
        label = "Im%d" % (len(xobjects) + 1)
        xobjects[Name(label)] = image
        return Name(label)

Esempio n. 7

0

Mostra file

File: write_pdf_transformer.py Progetto: tieugene/ptext-release

    def transform(
        self,
        object_to_transform: Any,
        context: Optional[WriteTransformerContext] = None,
    ):
        """
        Write a Document object to the byte stream held by `context`.

        Writes the file header, refreshes the trailer's /Info and /ID
        entries, and then hands the /XRef object to the root transformer.
        """
        assert context is not None
        assert context.destination is not None

        # header: version line, then a "%" line carrying four bytes above 127
        context.destination.write(b"%PDF-1.7\n")
        context.destination.write(b"%")
        context.destination.write(bytes([226, 227, 207, 211]))
        context.destination.write(b"\n")

        # invalidate all references
        WritePDFTransformer._invalidate_all_references(object_to_transform)

        trailer = object_to_transform["XRef"]["Trailer"]

        # /Info is created on demand
        if "Info" not in trailer:
            trailer[Name("Info")] = Dictionary()

        # /ID: both entries are new for a new array, only the second otherwise
        random_id = HexadecimalString("%032x" % random.randrange(16**32))
        if "ID" in trailer:
            trailer["ID"][1] = random_id
        else:
            trailer[Name("ID")] = List().set_can_be_referenced(  # type: ignore [attr-defined]
                False)
            trailer["ID"].append(random_id)
            trailer["ID"].append(random_id)
        trailer["ID"].set_can_be_referenced(False)

        # /CreationDate is only set when absent, /ModDate always
        timestamp = WritePDFTransformer._timestamp_to_str()
        info = trailer["Info"]
        if "CreationDate" not in info:
            info[Name("CreationDate")] = String(timestamp)
        info[Name("ModDate")] = String(timestamp)

        # /Producer
        info[Name("Producer")] = String("pText")

        # the XREF transformer writes everything else
        self.get_root_transformer().transform(object_to_transform["XRef"],
                                              context)

Esempio n. 8

0

Mostra file

File: test_hash_types.py Progetto: tieugene/ptext-release

    def test_hash_types(self):
        """
        Build a Dictionary and a List of PDF objects and print the hash of the List.
        """
        # a dictionary with a reference and a boolean (built, but not hashed)
        sample_dict = Dictionary()
        sample_dict[Name("Root")] = Reference(object_number=10)
        sample_dict[Name("Marked")] = Boolean(True)

        # a list with a name and a number
        sample_list = List()
        for element in (Name("Red"), Decimal(0.5)):
            sample_list.append(element)

        print(hash(sample_list))

Esempio n. 9

0

Mostra file

File: write_dictionary_transformer.py Progetto: gbtami/ptext-release

    def transform(
        self,
        object_to_transform: AnyPDFType,
        context: Optional[WriteTransformerContext] = None,
    ):
        """
        Write a Dictionary to the byte stream held by `context`.

        Values that are Dictionary, List, Stream, Image or Element objects and
        report that they can be referenced are replaced by references in the
        written dictionary; those values are written after the dictionary
        itself. Returns the dictionary as written, or None when the object's
        reference was already resolved.
        """
        assert isinstance(object_to_transform, Dictionary)
        assert context is not None
        assert context.destination is not None
        assert context.destination

        # an object whose reference is already resolved is not written again
        object_ref: typing.Optional[Reference] = (
            object_to_transform.get_reference()  # type: ignore [attr-defined]
        )
        if object_ref is not None and object_ref in context.resolved_references:
            assert object_ref is not None
            assert object_ref.object_number is not None
            logger.debug(
                "skip writing object %d %d R (already resolved)" %
                (object_ref.object_number, object_ref.generation_number or 0))
            return

        # build the dictionary to write, swapping referable values for references
        referable_types = (Dictionary, List, Stream, Image, Element)
        out_value = Dictionary()
        queue: typing.List[AnyPDFType] = []
        for key, value in object_to_transform.items():
            if isinstance(value, referable_types) and value.can_be_referenced():  # type: ignore [union-attr]
                out_value[key] = self.get_reference(value, context)
                queue.append(value)
            else:
                out_value[key] = value

        # open an indirect object when this dictionary has a reference
        # that does not have a byte offset yet
        started_object = False
        if object_ref is not None:
            assert object_ref.object_number is not None
            if object_ref.object_number is not None and object_ref.byte_offset is None:
                started_object = True
                self.start_object(object_to_transform, context)
            context.resolved_references.append(object_ref)

        # write "<<key value key value>>" with single spaces as separators
        context.destination.write(b"<<")
        last_position = len(out_value.items()) - 1
        for position, (key, value) in enumerate(out_value.items()):
            self.get_root_transformer().transform(key, context)
            context.destination.write(b" ")
            self.get_root_transformer().transform(value, context)
            if position != last_position:
                context.destination.write(b" ")
        context.destination.write(b">>\n")

        # close the indirect object if one was opened
        if started_object:
            self.end_object(object_to_transform, context)

        # now write the values that were replaced by references
        for pending in queue:
            self.get_root_transformer().transform(pending, context)

        return out_value

Esempio n. 10

0

Mostra file

File: document.py Progetto: gbtami/ptext-release

    def add_outline(
        self,
        text: str,
        level: int,
        destination_type: DestinationType,
        page_nr: int,
        top: typing.Optional[Decimal] = None,
        right: typing.Optional[Decimal] = None,
        bottom: typing.Optional[Decimal] = None,
        left: typing.Optional[Decimal] = None,
        zoom: typing.Optional[Decimal] = None,
    ) -> "Document":
        """
        A PDF document may contain a document outline that the conforming reader may display on the screen,
        allowing the user to navigate interactively from one part of the document to another. The outline consists of a
        tree-structured hierarchy of outline items (sometimes called bookmarks), which serve as a visual table of
        contents to display the document’s structure to the user.
        This function adds an outline to this Document.

        The destination array is built from `page_nr`, `destination_type` and
        the coordinates that type uses; coordinates it does not use must be
        None (checked with assertions). When the outline tree is still empty
        the new item becomes its only entry, whatever `level` is. Otherwise it
        becomes the last child of the last item found at depth `level - 1`
        (the /Outlines dictionary itself has depth -1), and /Count is
        increased by one on that parent and on each of its ancestors.

        Raises IndexError when the tree has no item at depth `level - 1`.
        Returns this Document.
        """
        destination = List().set_can_be_referenced(
            False)  # type: ignore [attr-defined]
        destination.append(Decimal(page_nr))
        destination.append(destination_type.value)
        if destination_type == DestinationType.X_Y_Z:
            assert (left is not None and bottom is None and right is None
                    and top is not None and zoom is not None)
            destination.append(Decimal(left))
            destination.append(Decimal(top))
            destination.append(Decimal(zoom))
        if destination_type == DestinationType.FIT:
            assert (left is None and bottom is None and right is None
                    and top is None and zoom is None)
        if destination_type == DestinationType.FIT_H:
            assert (left is None and bottom is None and right is None
                    and top is not None and zoom is None)
            destination.append(Decimal(top))
        if destination_type == DestinationType.FIT_V:
            assert (left is not None and bottom is None and right is None
                    and top is None and zoom is None)
            destination.append(Decimal(left))
        if destination_type == DestinationType.FIT_R:
            assert (left is not None and bottom is not None
                    and right is not None and top is not None and zoom is None)
            destination.append(Decimal(left))
            destination.append(Decimal(bottom))
            destination.append(Decimal(right))
            destination.append(Decimal(top))
        if destination_type == DestinationType.FIT_B_H:
            assert (left is None and bottom is None and right is None
                    and top is not None and zoom is None)
            destination.append(Decimal(top))
        if destination_type == DestinationType.FIT_B_V:
            assert (left is not None and bottom is None and right is None
                    and top is None and zoom is None)
            destination.append(Decimal(left))

        # add \Outlines entry in \Root
        root = self["XRef"]["Trailer"]["Root"]
        if "Outlines" not in root:
            new_outlines: Dictionary = Dictionary()
            root[Name("Outlines")] = new_outlines
            # \Root is the parent of \Outlines (not the dictionary itself)
            new_outlines.set_parent(root)  # type: ignore [attr-defined]
            new_outlines[Name("Type")] = Name("Outlines")
            new_outlines[Name("Count")] = Decimal(0)

        # create entry
        outline = Dictionary()
        outline[Name("Dest")] = destination
        outline[Name("Parent")] = None
        outline[Name("Title")] = String(text)

        # get \Outlines
        outline_dictionary = root["Outlines"]

        # if everything is empty, add the new entry as the only entry
        if "First" not in outline_dictionary or "Last" not in outline_dictionary:
            outline_dictionary[Name("First")] = outline
            outline_dictionary[Name("Last")] = outline
            outline_dictionary[Name("Count")] = Decimal(1)
            outline[Name("Parent")] = outline_dictionary
            return self

        # helper function: the children of x, following First -> Next ... -> Last
        def _children(x: Dictionary):
            if "First" not in x:
                return []
            children = [x["First"]]
            while children[-1] != x["Last"]:
                children.append(children[-1]["Next"])
            return children

        # breadth-first walk over the outline tree, recording each node's depth;
        # a cursor is used instead of popping from the front of the list
        visited: typing.List[typing.Tuple[int, Dictionary]] = [
            (-1, outline_dictionary)
        ]
        cursor = 0
        while cursor < len(visited):
            depth, node = visited[cursor]
            cursor += 1
            for child in _children(node):
                visited.append((depth + 1, child))

        # find parent: the last node found one level above the new entry
        parent = [node for depth, node in visited if depth == level - 1][-1]

        # update sibling-linking
        if "Last" in parent:
            sibling = parent["Last"]
            sibling[Name("Next")] = outline

        # update parent-linking
        outline[Name("Parent")] = parent
        if "First" not in parent:
            parent[Name("First")] = outline
        if "Count" not in parent:
            parent[Name("Count")] = Decimal(0)
        parent[Name("Last")] = outline

        # update count on the parent and on every ancestor above it
        outline_to_update_count = parent
        while outline_to_update_count:
            outline_to_update_count[Name("Count")] = Decimal(
                outline_to_update_count["Count"] + Decimal(1))
            if "Parent" in outline_to_update_count:
                outline_to_update_count = outline_to_update_count["Parent"]
            else:
                break

        return self

Esempio n. 11

0

Mostra file

File: write_xref_transformer.py Progetto: tieugene/ptext-release

    def transform(
        self,
        object_to_transform: AnyPDFType,
        context: Optional[WriteTransformerContext] = None,
    ):
        """
        Write an XREF, and the objects its trailer points to, to the byte stream held by `context`.

        Order of output: the /Root object, the /Info object (when present),
        the cross-reference table, the trailer dictionary, the "startxref"
        keyword with the byte offset of the table, and the end-of-file marker.
        """
        assert isinstance(object_to_transform, XREF)
        assert "Trailer" in object_to_transform
        assert isinstance(object_to_transform["Trailer"], Dictionary)
        assert context is not None
        assert context.destination is not None

        source_trailer = object_to_transform["Trailer"]

        # The trailer to write is assembled up front, with objects replaced by
        # references. The regular dictionary transformer writes a dictionary
        # first and its referenced objects afterwards, which would leave the
        # trailer dictionary somewhere other than last.
        trailer_out = Dictionary()
        trailer_out[Name("Root")] = self.get_reference(source_trailer["Root"],
                                                       context)
        if "Info" in source_trailer:
            trailer_out[Name("Info")] = self.get_reference(
                source_trailer["Info"], context)
        # /Size is only a placeholder here, it is recalculated further down
        trailer_out[Name("Size")] = (source_trailer["Size"]
                                     if "Size" in source_trailer else Decimal(0))
        if "ID" in source_trailer:
            trailer_out[Name("ID")] = source_trailer["ID"]

        # write the /Root object, then the /Info object
        self.get_root_transformer().transform(source_trailer["Root"], context)
        if "Info" in source_trailer:
            self.get_root_transformer().transform(source_trailer["Info"],
                                                  context)

        # write the cross-reference table, one header line per section
        start_of_xref = context.destination.tell()
        context.destination.write(b"xref\n")
        for section in self._section_xref(context):
            header = "%d %d\n" % (section[0].object_number, len(section))
            context.destination.write(bytes(header, "latin1"))
            for entry in section:
                template = ("{0:010d} 00000 n\r\n"
                            if entry.is_in_use else "{0:010d} 00000 f\r\n")
                context.destination.write(
                    bytes(template.format(entry.byte_offset), "latin1"))

        # /Size: number of indirect objects known to the context, plus one
        object_count = sum(
            len(group)
            for _key, group in context.indirect_objects_by_hash.items())
        trailer_out[Name("Size")] = Decimal(object_count + 1)

        # write the trailer, the offset of the table and the end-of-file marker
        context.destination.write(b"trailer\n")
        self.get_root_transformer().transform(trailer_out, context)
        context.destination.write(b"startxref\n")
        context.destination.write(bytes(str(start_of_xref) + "\n", "latin1"))
        context.destination.write(b"%%EOF")

Esempio n. 12

0

Mostra file

File: document.py Progetto: gbtami/ptext-release

    def append_embedded_file(self,
                             file_name: str,
                             file_bytes: bytes,
                             apply_compression: bool = True) -> "Document":
        """
        If a PDF file contains file specifications that refer to an external file and the PDF file is archived or transmitted,
        some provision should be made to ensure that the external references will remain valid. One way to do this is to
        arrange for copies of the external files to accompany the PDF file. Embedded file streams (PDF 1.3) address
        this problem by allowing the contents of referenced files to be embedded directly within the body of the PDF
        file. This makes the PDF file a self-contained unit that can be stored or transmitted as a single entity. (The
        embedded files are included purely for convenience and need not be directly processed by any conforming reader.)
        This method embeds a file (specified by its name and bytes) into this Document.

        The file bytes are stored zlib-compressed (level 9) unless
        `apply_compression` is False. When an entry whose limits both equal
        `file_name` already exists, nothing is changed. Returns this Document.
        """
        assert "XRef" in self
        assert "Trailer" in self["XRef"]
        assert "Root" in self["XRef"]["Trailer"]
        root = self["XRef"]["Trailer"]["Root"]

        # set up /Names dictionary
        if "Names" not in root:
            root[Name("Names")] = Dictionary()
        names = root["Names"]

        # set up /EmbeddedFiles
        if "EmbeddedFiles" not in names:
            names[Name("EmbeddedFiles")] = Dictionary()
            names["EmbeddedFiles"][Name("Kids")] = List()

        # find parent: the first kid whose limits differ and strictly enclose
        # file_name, otherwise /EmbeddedFiles itself (one level is inspected)
        parent = names["EmbeddedFiles"]
        if "Kids" in parent:
            for candidate in parent["Kids"]:
                lower_limit = str(candidate["Limits"][0])
                upper_limit = str(candidate["Limits"][1])
                if lower_limit == upper_limit:
                    continue
                if lower_limit < file_name < upper_limit:
                    parent = candidate
                    break

        # an entry for this file name already exists
        same_name = [
            x for x in parent["Kids"]
            if x["Limits"][0] == x["Limits"][1] == file_name
        ]
        if same_name:
            kid = same_name[0]
            # TODO
            return self

        # add new child
        kid = Dictionary()
        kid[Name("F")] = String(file_name)
        kid[Name("Type")] = Name("Filespec")
        kid[Name("Limits")] = List()
        kid["Limits"].append(String(file_name))
        kid["Limits"].append(String(file_name))

        # build leaf \Names array: the file name, later followed by its \Filespec
        leaf_names = List()
        leaf_names.append(String(file_name))
        kid[Name("Names")] = leaf_names

        # build actual file stream
        stream = Stream()
        stream[Name("Type")] = Name("EmbeddedFile")
        stream[Name("DecodedBytes")] = file_bytes
        if apply_compression:
            stream[Name("Bytes")] = zlib.compress(
                stream[Name("DecodedBytes")], 9)
            stream[Name("Filter")] = Name("FlateDecode")
        else:
            stream[Name("Bytes")] = file_bytes
        stream[Name("Length")] = Decimal(len(stream[Name("Bytes")]))

        # build leaf \Filespec dictionary
        file_spec = Dictionary()
        file_spec[Name("EF")] = Dictionary()
        file_spec["EF"][Name("F")] = stream
        file_spec[Name("F")] = String(file_name)
        file_spec[Name("Type")] = Name("Filespec")
        leaf_names.append(file_spec)

        # append
        parent["Kids"].append(kid)

        return self

Esempio n. 13

0

Mostra file

    def add_outline(
        self,
        text: str,
        level: int,
        destination_type: DestinationType,
        page_nr: int,
        top: typing.Optional[Decimal] = None,
        right: typing.Optional[Decimal] = None,
        bottom: typing.Optional[Decimal] = None,
        left: typing.Optional[Decimal] = None,
        zoom: typing.Optional[Decimal] = None,
    ) -> "Document":
        """
        Add an outline item (bookmark) with title `text` to this Document.

        The destination array is built from `page_nr`, `destination_type` and
        the coordinates that type uses; coordinates it does not use must be
        None (checked with assertions). When the outline tree is still empty
        the new item becomes its only entry, whatever `level` is. Otherwise it
        becomes the last child of the last item found at depth `level - 1`
        (the /Outlines dictionary itself has depth -1), and /Count is
        increased by one on that parent and on each of its ancestors.

        Raises IndexError when the tree has no item at depth `level - 1`.
        Returns this Document.
        """
        destination = List().set_can_be_referenced(
            False)  # type: ignore [attr-defined]
        destination.append(Decimal(page_nr))
        destination.append(destination_type.value)
        if destination_type == DestinationType.X_Y_Z:
            assert (left is not None and bottom is None and right is None
                    and top is not None and zoom is not None)
            destination.append(Decimal(left))
            destination.append(Decimal(top))
            destination.append(Decimal(zoom))
        if destination_type == DestinationType.FIT:
            assert (left is None and bottom is None and right is None
                    and top is None and zoom is None)
        if destination_type == DestinationType.FIT_H:
            assert (left is None and bottom is None and right is None
                    and top is not None and zoom is None)
            destination.append(Decimal(top))
        if destination_type == DestinationType.FIT_V:
            assert (left is not None and bottom is None and right is None
                    and top is None and zoom is None)
            destination.append(Decimal(left))
        if destination_type == DestinationType.FIT_R:
            assert (left is not None and bottom is not None
                    and right is not None and top is not None and zoom is None)
            destination.append(Decimal(left))
            destination.append(Decimal(bottom))
            destination.append(Decimal(right))
            destination.append(Decimal(top))
        if destination_type == DestinationType.FIT_B_H:
            assert (left is None and bottom is None and right is None
                    and top is not None and zoom is None)
            destination.append(Decimal(top))
        if destination_type == DestinationType.FIT_B_V:
            assert (left is not None and bottom is None and right is None
                    and top is None and zoom is None)
            destination.append(Decimal(left))

        # add \Outlines entry in \Root
        root = self["XRef"]["Trailer"]["Root"]
        if "Outlines" not in root:
            new_outlines: Dictionary = Dictionary()
            root[Name("Outlines")] = new_outlines
            # \Root is the parent of \Outlines (not the dictionary itself)
            new_outlines.set_parent(root)  # type: ignore [attr-defined]
            new_outlines[Name("Type")] = Name("Outlines")
            new_outlines[Name("Count")] = Decimal(0)

        # create entry
        outline = Dictionary()
        outline[Name("Dest")] = destination
        outline[Name("Parent")] = None
        outline[Name("Title")] = String(text)

        # get \Outlines
        outline_dictionary = root["Outlines"]

        # if everything is empty, add the new entry as the only entry
        if "First" not in outline_dictionary or "Last" not in outline_dictionary:
            outline_dictionary[Name("First")] = outline
            outline_dictionary[Name("Last")] = outline
            outline_dictionary[Name("Count")] = Decimal(1)
            outline[Name("Parent")] = outline_dictionary
            return self

        # helper function: the children of x, following First -> Next ... -> Last
        def _children(x: Dictionary):
            if "First" not in x:
                return []
            children = [x["First"]]
            while children[-1] != x["Last"]:
                children.append(children[-1]["Next"])
            return children

        # breadth-first walk over the outline tree, recording each node's depth;
        # a cursor is used instead of popping from the front of the list
        visited: typing.List[typing.Tuple[int, Dictionary]] = [
            (-1, outline_dictionary)
        ]
        cursor = 0
        while cursor < len(visited):
            depth, node = visited[cursor]
            cursor += 1
            for child in _children(node):
                visited.append((depth + 1, child))

        # find parent: the last node found one level above the new entry
        parent = [node for depth, node in visited if depth == level - 1][-1]

        # update sibling-linking
        if "Last" in parent:
            sibling = parent["Last"]
            sibling[Name("Next")] = outline

        # update parent-linking
        outline[Name("Parent")] = parent
        if "First" not in parent:
            parent[Name("First")] = outline
        if "Count" not in parent:
            parent[Name("Count")] = Decimal(0)
        parent[Name("Last")] = outline

        # update count on the parent and on every ancestor above it
        outline_to_update_count = parent
        while outline_to_update_count:
            outline_to_update_count[Name("Count")] = Decimal(
                outline_to_update_count["Count"] + Decimal(1))
            if "Parent" in outline_to_update_count:
                outline_to_update_count = outline_to_update_count["Parent"]
            else:
                break

        return self

Esempio n. 14

0

Mostra file

File: write_stream_transformer.py Progetto: gbtami/ptext-release

    def transform(
        self,
        object_to_transform: AnyPDFType,
        context: Optional[WriteTransformerContext] = None,
    ):
        """
        Write a Stream (its dictionary followed by its payload) to
        ``context.destination``.

        Nested Dictionary / List / Stream values that report
        ``can_be_referenced()`` are replaced by references in the written
        dictionary and are themselves written after this stream.

        :param object_to_transform: the object to write; must be a Stream
        :param context:             the write context; must not be None and
                                    must have a destination
        :return:                    None
        """
        assert context is not None
        assert context.destination is not None
        assert isinstance(object_to_transform, Stream)

        # avoid resolving objects twice
        object_ref: typing.Optional[Reference] = (
            object_to_transform.get_reference()  # type: ignore [attr-defined]
        )
        if object_ref is not None and object_ref in context.resolved_references:
            assert object_ref.object_number is not None
            logger.debug(
                "skip writing object %d %d R (already resolved)",
                object_ref.object_number,
                object_ref.generation_number or 0,
            )
            return

        # start object if needed
        started_object = False
        if object_ref is not None:
            assert object_ref.object_number is not None
            if object_ref.object_number is not None and object_ref.byte_offset is None:
                started_object = True
                self.start_object(object_to_transform, context)
            context.resolved_references.append(object_ref)

        # build stream dictionary
        stream_dictionary = Dictionary()

        # objects to turn into reference (written after this stream)
        queue: typing.List[AnyPDFType] = []
        for k, v in object_to_transform.items():
            # the payload entries are not part of the written dictionary
            if k in ["Bytes", "DecodedBytes"]:
                continue
            if isinstance(v, (Dictionary, List, Stream)) and v.can_be_referenced():  # type: ignore [union-attr]
                stream_dictionary[k] = self.get_reference(v, context)
                queue.append(v)
            else:
                stream_dictionary[k] = v

        # handle compression
        if "DecodedBytes" in object_to_transform:
            if context.compression_level == 0:
                bts = object_to_transform["DecodedBytes"]
                # The payload is written unencoded, so /Filter must not be
                # advertised. This is only done here: when just the encoded
                # "Bytes" are available (branch below) they are written
                # verbatim and still need their /Filter to be decodable.
                if Name("Filter") in stream_dictionary:
                    stream_dictionary.pop(Name("Filter"))
            else:
                # NOTE(review): the existing /Filter entry is kept as-is here;
                # this presumably relies on it already naming FlateDecode.
                bts = zlib.compress(object_to_transform["DecodedBytes"],
                                    context.compression_level)
            stream_dictionary[Name("Length")] = pDecimal(len(bts))
        else:
            assert "Bytes" in object_to_transform
            bts = object_to_transform["Bytes"]

        # write stream dictionary
        self.get_root_transformer().transform(stream_dictionary, context)

        # write "stream"
        context.destination.write(bytes("stream\n", "latin1"))

        # write bytes
        context.destination.write(bts)

        # write "endstream"
        context.destination.write(bytes("\nendstream\n", "latin1"))

        # end object if needed
        if started_object:
            self.end_object(object_to_transform, context)

        # write the objects that were replaced by references above
        for e in queue:
            self.get_root_transformer().transform(e, context)

Esempio n. 15

0

Mostra file

    def true_type_font_from_file(path_to_font_file: Path) -> "TrueTypeFont":
        """
        Build the PDF TrueTypeFont object for a given TTF file.

        The font file is embedded (Flate-compressed) as /FontFile2 of the
        font descriptor. Widths and the /Differences encoding are derived
        from the glyphs that the font's best cmap maps a character to.

        :param path_to_font_file: path to an existing file whose name ends in ".ttf"
        :return:                  the populated TrueTypeFont
        """
        assert path_to_font_file.exists()
        assert path_to_font_file.name.endswith(".ttf")

        with open(path_to_font_file, "rb") as ffh:
            font_file_bytes: bytes = ffh.read()
        assert font_file_bytes

        # read file
        ttf_font_file = TTFont(path_to_font_file)

        # build font
        font: TrueTypeFont = TrueTypeFont()
        font_name: str = str(
            [
                x for x in ttf_font_file["name"].names
                if x.platformID == 3 and x.nameID == 1
            ][0].string,
            "latin1",
        )
        # keep only the ASCII letters of the name
        font_name = "".join([
            x for x in font_name if x.lower() in "abcdefghijklmnopqrstuvwxyz"
        ])

        font[Name("Name")] = Name(font_name)
        font[Name("BaseFont")] = Name(font_name)

        # glyph name -> lowest character code mapped to it
        cmap: typing.Optional[typing.Dict[int,
                                          str]] = ttf_font_file.getBestCmap()
        cmap_reverse: typing.Dict[str, int] = {}
        # cmap is Optional: treat a missing cmap as "no mapped glyphs" instead
        # of failing on None.items()
        for k, v in (cmap or {}).items():
            cmap_reverse[v] = min(cmap_reverse.get(v, k), k)
        glyph_order: typing.List[str] = [
            x for x in ttf_font_file.glyphOrder if x in cmap_reverse
        ]

        # build widths
        units_per_em: pDecimal = pDecimal(ttf_font_file["head"].unitsPerEm)
        if cmap is not None:
            glyph_set = ttf_font_file.getGlyphSet()  # loop-invariant
            font[Name("FirstChar")] = pDecimal(0)
            # /Widths gets one entry per glyph in glyph_order starting at
            # /FirstChar 0, so the last code is len - 1 (clamped at 0 for an
            # empty glyph list)
            font[Name("LastChar")] = pDecimal(max(len(glyph_order) - 1, 0))
            font[Name("Widths")] = List()
            for glyph_name in glyph_order:
                # glyph width scaled from font units to 1/1000 em
                w: pDecimal = (pDecimal(glyph_set[glyph_name].width) /
                               units_per_em) * pDecimal(1000)
                w = pDecimal(round(w, 2))
                font["Widths"].append(w)

        font[Name("FontDescriptor")] = Dictionary()
        font["FontDescriptor"][Name("Type")] = Name("FontDescriptor")
        font["FontDescriptor"][Name("FontName")] = String(font_name)
        font["FontDescriptor"][Name("FontStretch")] = Name("Normal")  # TODO
        font["FontDescriptor"][Name("FontWeight")] = pDecimal(400)  # TODO
        font["FontDescriptor"][Name("Flags")] = pDecimal(4)  # TODO
        font["FontDescriptor"][Name("FontBBox")] = List(
        ).set_can_be_referenced(  # type: ignore [attr-defined]
            False)  # TODO
        for _ in range(0, 4):
            font["FontDescriptor"]["FontBBox"].append(pDecimal(0))

        # fmt: off
        font["FontDescriptor"][Name("ItalicAngle")] = pDecimal(
            ttf_font_file["post"].italicAngle)
        font["FontDescriptor"][Name("Ascent")] = pDecimal(
            pDecimal(ttf_font_file["hhea"].ascent) / units_per_em *
            Decimal(1000))
        font["FontDescriptor"][Name("Descent")] = pDecimal(
            pDecimal(ttf_font_file["hhea"].descent) / units_per_em *
            Decimal(1000))
        font["FontDescriptor"][Name("CapHeight")] = pDecimal(0)  # TODO
        font["FontDescriptor"][Name("StemV")] = pDecimal(0)  # TODO
        # fmt: on

        # encoding: code i -> i-th mapped glyph name
        font[Name("Encoding")] = Dictionary()
        font["Encoding"][Name("BaseEncoding")] = Name("WinAnsiEncoding")
        font["Encoding"][Name("Differences")] = List()
        for i, glyph_name in enumerate(glyph_order):
            font["Encoding"]["Differences"].append(pDecimal(i))
            font["Encoding"]["Differences"].append(Name(glyph_name))

        # embed font file
        compressed_font_bytes: bytes = zlib.compress(font_file_bytes, 9)
        font_stream: Stream = Stream()
        font_stream[Name("Type")] = Name("Font")
        font_stream[Name("Subtype")] = Name("TrueType")
        # /Length is the size of the encoded payload ("Bytes");
        # /Length1 is the size of the decoded font program
        font_stream[Name("Length")] = pDecimal(len(compressed_font_bytes))
        font_stream[Name("Length1")] = pDecimal(len(font_file_bytes))
        font_stream[Name("Filter")] = Name("FlateDecode")
        font_stream[Name("DecodedBytes")] = font_file_bytes
        font_stream[Name("Bytes")] = compressed_font_bytes

        font["FontDescriptor"][Name("FontFile2")] = font_stream

        # return
        return font

Esempio n. 16

0

Mostra file

def decode_stream(s: Stream) -> Stream:
    """
    Decode a Stream by applying, in order, the filters named in its /Filter
    entry to its "Bytes" entry, and store the result under "DecodedBytes".

    :param s: the Stream to decode; must contain a "Bytes" entry
    :return:  the same Stream object, with "DecodedBytes" set
    :raises AssertionError: if ``s`` is not a Stream, has no "Bytes" entry,
                            or names a filter this function does not handle
    """
    assert isinstance(s, Stream)
    assert "Bytes" in s

    # determine filter(s) to apply
    filters: typing.List[str] = []
    if "Filter" in s:
        if isinstance(s["Filter"], List):
            filters = s["Filter"]
        else:
            filters = [s["Filter"]]

    # determine the decode parameters as given in the stream dictionary
    raw_params = []
    if "DecodeParms" in s:
        if isinstance(s["DecodeParms"], List):
            raw_params = s["DecodeParms"]
        else:
            assert s["DecodeParms"] is not None
            assert isinstance(s["DecodeParms"], Dictionary)
            raw_params = [s["DecodeParms"]]

    # Normalize to exactly one parameter dictionary per filter, so the lookup
    # by filter index below cannot run past the end when /DecodeParms is
    # shorter than /Filter. A missing or None entry means "defaults".
    # NOTE(review): assumes an absent entry surfaces as Python None — confirm
    # how the parser represents the PDF null object.
    decode_params: typing.List[Dictionary] = []
    for filter_index in range(0, len(filters)):
        params = raw_params[filter_index] if filter_index < len(raw_params) else None
        decode_params.append(params if params is not None else Dictionary())

    # apply filter(s)
    transformed_bytes = s["Bytes"]
    for filter_index, filter_name in enumerate(filters):
        # FLATE
        if filter_name in ["FlateDecode", "Fl"]:
            params = decode_params[filter_index]
            transformed_bytes = FlateDecode.decode(
                bytes_in=transformed_bytes,
                columns=int(params.get("Columns", Decimal(1))),
                predictor=int(params.get("Predictor", Decimal(1))),
                bits_per_component=int(params.get("BitsPerComponent", Decimal(8))),
            )
            continue

        # ASCII85
        if filter_name in ["ASCII85Decode", "A85"]:
            transformed_bytes = ASCII85Decode.decode(transformed_bytes)
            continue

        # LZW
        if filter_name in ["LZWDecode", "LZW"]:
            transformed_bytes = LZWDecode.decode(transformed_bytes)
            continue

        # RunLengthDecode
        if filter_name in ["RunLengthDecode", "RL"]:
            transformed_bytes = RunLengthDecode.decode(transformed_bytes)
            continue

        # unknown filter: raised explicitly (rather than `assert False`) so
        # the check is not stripped when Python runs with -O
        raise AssertionError("Unknown /Filter %s" % filter_name)

    # set DecodedBytes
    s[Name("DecodedBytes")] = transformed_bytes

    # return
    return s

Esempio n. 17

0

Mostra file

File: stream_xref.py Progetto: lzg440/ptext-release

    def read(
        self,
        io_source: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tokenizer: HighLevelTokenizer,
        initial_offset: Optional[int] = None,
    ) -> "XREF":
        """
        Read a cross-reference stream from ``io_source`` into this XREF.

        The stream object is read through ``tokenizer``, decoded with
        ``decode_stream``, and every entry of every /Index subsection is
        turned into a Reference that is appended to this XREF. The stream
        dictionary is copied into this XREF's "Trailer" entry.

        :param io_source:      the source being read
        :param tokenizer:      tokenizer used to read the stream object
        :param initial_offset: position to seek to before reading; when None,
                               ``self._seek_to_xref_token`` positions the source
        :return:               this XREF
        :raises AssertionError: if the object read is not a Stream or its
                                /W, /Size or /Index entries are malformed
        """

        if initial_offset is not None:
            io_source.seek(initial_offset)
        else:
            self._seek_to_xref_token(io_source, tokenizer)

        xref_stream = tokenizer.read_object()
        assert isinstance(xref_stream, Stream)

        # check widths
        assert "W" in xref_stream
        assert all([
            isinstance(xref_stream["W"][x], Decimal)
            for x in range(0, len(xref_stream["W"]))
        ])
        # decode widths (number of bytes of each of the three entry fields)
        widths = [
            int(xref_stream["W"][x]) for x in range(0, len(xref_stream["W"]))
        ]

        # parent
        document = self.get_root()  # type: ignore [attr-defined]

        # list of references, seeded with the entry for object 0
        indirect_references = [
            Reference(
                object_number=0,
                generation_number=65535,
                is_in_use=False,
                document=document,
            )
        ]
        # object number -> first Reference collected for it; replaces a
        # linear scan of indirect_references for every entry
        first_ref_by_number = {0: indirect_references[0]}

        # check size
        assert "Size" in xref_stream
        assert isinstance(xref_stream["Size"], Decimal)

        # get size
        number_of_objects = int(xref_stream["Size"])

        # index: (first object number, entry count) pairs
        if "Index" in xref_stream:
            index = xref_stream["Index"]
            assert isinstance(index, List)
            assert len(index) % 2 == 0
            assert all([
                isinstance(index[x], Decimal) for x in range(0, len(index))
            ])
        else:
            index = [Decimal(0), Decimal(number_of_objects)]

        # apply filters
        xref_stream = decode_stream(xref_stream)

        # read every range specified in \Index
        xref_stream_decoded_bytes = xref_stream["DecodedBytes"]
        # The entries of all subsections are stored back to back in the
        # stream data, so the read position carries over from one subsection
        # to the next and must not be reset per subsection.
        bptr = 0
        for idx in range(0, len(index), 2):
            start = int(index[idx])
            length = int(index[idx + 1])

            for i in range(0, length):

                # object number
                object_number = start + i

                # read type (defaults to 1 when the first field has width 0)
                entry_type = 1
                if widths[0] > 0:
                    entry_type = 0
                    for _ in range(0, widths[0]):
                        entry_type = (entry_type << 8) + (
                            xref_stream_decoded_bytes[bptr] & 0xFF)
                        bptr += 1

                # read field 2
                field2 = 0
                for _ in range(0, widths[1]):
                    field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                    bptr += 1

                # read field 3
                field3 = 0
                for _ in range(0, widths[2]):
                    field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                    bptr += 1

                if entry_type == 0:
                    # type      :The type of this entry, which shall be 0. Type 0 entries define
                    # the linked list of free objects (corresponding to f entries in a
                    # cross-reference table).
                    # field2    : The object number of the next free object
                    # field3    : The generation number to use if this object number is used again
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                        is_in_use=False,
                    )
                elif entry_type == 1:
                    # Type      : The type of this entry, which shall be 1. Type 1 entries define
                    # objects that are in use but are not compressed (corresponding
                    # to n entries in a cross-reference table).
                    # field2    : The byte offset of the object, starting from the beginning of the
                    # file.
                    # field3    : The generation number of the object. Default value: 0.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                    )
                elif entry_type == 2:
                    # Type      : The type of this entry, which shall be 2. Type 2 entries define
                    # compressed objects.
                    # field2    : The object number of the object stream in which this object is
                    # stored. (The generation number of the object stream shall be
                    # implicitly 0.)
                    # field3    : The index of this object within the object stream.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        generation_number=0,
                        parent_stream_object_number=field2,
                        index_in_parent_stream=field3,
                    )
                else:
                    # check type
                    raise AssertionError(
                        "unsupported cross-reference entry type %d" % entry_type)

                # append
                existing_indirect_ref = first_ref_by_number.get(object_number)
                ref_is_in_reading_state = (
                    existing_indirect_ref is not None
                    and existing_indirect_ref.is_in_use
                    and existing_indirect_ref.generation_number
                    == pdf_indirect_reference.generation_number)
                ref_is_first_encountered = existing_indirect_ref is None or (
                    not ref_is_in_reading_state
                    and existing_indirect_ref.document is None)

                if ref_is_first_encountered:
                    indirect_references.append(pdf_indirect_reference)
                    # keep the FIRST reference seen for an object number
                    first_ref_by_number.setdefault(object_number,
                                                   pdf_indirect_reference)
                elif ref_is_in_reading_state:
                    assert existing_indirect_ref is not None
                    existing_indirect_ref.index_in_parent_stream = (
                        pdf_indirect_reference.index_in_parent_stream)
                    existing_indirect_ref.parent_stream_object_number = (
                        pdf_indirect_reference.parent_stream_object_number)

        # add section
        for r in indirect_references:
            self.append(r)

        # initialize trailer
        self[Name("Trailer")] = Dictionary()
        for k, v in xref_stream.items():
            self[Name("Trailer")][k] = v
        # the trailer's parent is this XREF (not the trailer itself)
        self[Name("Trailer")].set_parent(self)

        # return
        return self

Esempio n. 18

0

Mostra file

 def __deepcopy__(self, memodict=None) -> "Font":
     """
     Return a deep copy of this Font.

     The well-known font entries are copied entry by entry into a fresh
     object obtained from ``self._empty_copy()``; every remaining entry is
     copied with ``copy.deepcopy``.

     :param memodict: memo dictionary of the ``copy.deepcopy`` protocol;
                      a new one is created when omitted
     :return:         the copied Font
     """
     # a mutable default argument would be shared between calls
     if memodict is None:
         memodict = {}
     out: Font = self._empty_copy()
     # register the copy up front so that nested deep copies that reach this
     # font again reuse `out` instead of copying it a second time
     memodict[id(self)] = out
     # Type
     out[Name("Type")] = Name("Font")
     # BaseFont
     out[Name("BaseFont")] = Name(str(self["BaseFont"]))
     # FirstChar
     if "FirstChar" in self:
         out[Name("FirstChar")] = self["FirstChar"]
     # LastChar
     if "LastChar" in self:
         out[Name("LastChar")] = self["LastChar"]
     # Widths
     if "Widths" in self:
         out[Name("Widths")] = List()
         for k in self["Widths"]:
             out[Name("Widths")].append(k)
     # FontDescriptor
     if "FontDescriptor" in self:
         out[Name("FontDescriptor")] = self._copy_font_descriptor(self["FontDescriptor"])
     # Encoding
     if "Encoding" in self:
         # Name
         if isinstance(self["Encoding"], Name):
             out[Name("Encoding")] = Name(str(self["Encoding"]))
         # Dictionary
         if isinstance(self["Encoding"], Dictionary):
             out[Name("Encoding")] = Dictionary()
             out["Encoding"][Name("Type")] = Name("Encoding")
             if "BaseEncoding" in self["Encoding"]:
                 out["Encoding"][Name("BaseEncoding")] = Name(
                     str(self["Encoding"]["BaseEncoding"])
                 )
             if "Differences" in self["Encoding"]:
                 differences = List()
                 for x in self["Encoding"]["Differences"]:
                     differences.append(x)
                 out["Encoding"][Name("Differences")] = differences
     # ToUnicode
     if "ToUnicode" in self:
         out[Name("ToUnicode")] = copy.deepcopy(self["ToUnicode"], memodict)
     # FontBBox
     if "FontBBox" in self:
         out[Name("FontBBox")] = List()
         for x in self["FontBBox"]:
             out["FontBBox"].append(x)
     # FontMatrix
     if "FontMatrix" in self:
         out[Name("FontMatrix")] = List()
         for x in self["FontMatrix"]:
             out["FontMatrix"].append(x)
     # CharProcs
     # Resources
     # CIDSystemInfo
     if "CIDSystemInfo" in self:
         out[Name("CIDSystemInfo")] = Dictionary()
         out["CIDSystemInfo"][Name("Registry")] = self["CIDSystemInfo"]["Registry"]
         out["CIDSystemInfo"][Name("Ordering")] = self["CIDSystemInfo"]["Ordering"]
         out["CIDSystemInfo"][Name("Supplement")] = self["CIDSystemInfo"][
             "Supplement"
         ]
     # DW
     if "DW" in self:
         out[Name("DW")] = self["DW"]
     # W (entries are either numbers or nested lists of numbers)
     if "W" in self:
         out[Name("W")] = List()
         for x in self["W"]:
             if isinstance(x, pDecimal):
                 out["W"].append(x)
             if isinstance(x, List):
                 nested_widths = List()
                 for y in x:
                     nested_widths.append(y)
                 out["W"].append(nested_widths)
     # DescendantFonts (only the first descendant is copied)
     if "DescendantFonts" in self:
         out[Name("DescendantFonts")] = List()
         out["DescendantFonts"].append(
             self["DescendantFonts"][0].__deepcopy__(memodict)
         )
     # DW2
     if "DW2" in self:
         out[Name("DW2")] = List()
         for x in self["DW2"]:
             out["DW2"].append(x)
     # W2
     # CIDToGIDMap
     # default: deep-copy every entry not handled explicitly above
     for k, v in self.items():
         if k not in out:
             out[k] = copy.deepcopy(v, memodict)
     return out