Example #1
0
 def insert_page(
         self,
         page: Page,
         index: int = -1) -> "Document":  # type: ignore [name-defined]
     # build XRef
     if "XRef" not in self:
         self["XRef"] = PlainTextXREF()
     # build Trailer
     if "Trailer" not in self["XRef"]:
         self["XRef"]["Trailer"] = Dictionary()
         self["XRef"][Name("Size")] = Decimal(0)
     # build Root
     if "Root" not in self["XRef"]["Trailer"]:
         self["XRef"]["Trailer"][Name("Root")] = Dictionary()
     # build Pages
     if "Pages" not in self["XRef"]["Trailer"]["Root"]:
         self["XRef"]["Trailer"][Name("Root")][Name("Pages")] = Dictionary()
         self["XRef"]["Trailer"][Name("Root")][Name("Pages")][Name(
             "Count")] = Decimal(0)
         self["XRef"]["Trailer"][Name("Root")][Name("Pages")][Name(
             "Kids")] = List()
     # update /Kids
     kids = self["XRef"]["Trailer"]["Root"]["Pages"]["Kids"]
     assert kids is not None
     assert isinstance(kids, List)
     kids.insert(index, page)
     # update /Count
     prev_count = self["XRef"]["Trailer"]["Root"]["Pages"]["Count"]
     self["XRef"]["Trailer"]["Root"]["Pages"]["Count"] = Decimal(
         prev_count + 1)
     # return
     return self
Example #2
0
    def transform(
        self,
        object_to_transform: AnyPDFType,
        context: Optional[WriteTransformerContext] = None,
    ):
        assert isinstance(object_to_transform, Dictionary)
        assert context is not None
        assert context.destination is not None
        assert context.destination

        # output value
        out_value = Dictionary()

        # objects to turn into reference
        queue: typing.List[AnyPDFType] = []
        for k, v in object_to_transform.items():
            if (isinstance(v, Dictionary) or isinstance(v, List)
                    or isinstance(v, Stream) or isinstance(v, Image)
                ) and v.can_be_referenced():  # type: ignore [union-attr]
                out_value[k] = self.get_reference(v, context)
                queue.append(v)
            else:
                out_value[k] = v

        # start object if needed
        started_object = False
        ref = object_to_transform.get_reference(
        )  # type: ignore [attr-defined]
        if ref is not None:
            assert isinstance(ref, Reference)
            assert ref.object_number is not None
            if ref in context.duplicate_references:
                logger.debug("skip writing object %d %d R (duplicate)" %
                             (ref.object_number, ref.generation_number or 0))
                return
            if ref.object_number is not None and ref.byte_offset is None:
                started_object = True
                self.start_object(object_to_transform, context)
            context.duplicate_references.append(ref)

        # write dictionary at current location
        context.destination.write(bytes("<<", "latin1"))
        N = len(out_value.items())
        for i, (k, v) in enumerate(out_value.items()):
            self.get_root_transformer().transform(k, context)
            context.destination.write(bytes(" ", "latin1"))
            self.get_root_transformer().transform(v, context)
            if i != N - 1:
                context.destination.write(bytes(" ", "latin1"))
        context.destination.write(bytes(">>\n", "latin1"))

        # end object if needed
        if started_object:
            self.end_object(object_to_transform, context)

        for e in queue:
            self.get_root_transformer().transform(e, context)

        # return
        return out_value
    def test_document(self, file) -> bool:

        # create output directory if it does not exist yet
        if not self.output_dir.exists():
            self.output_dir.mkdir()

        doc = None
        with open(file, "rb") as pdf_file_handle:
            doc = PDF.loads(pdf_file_handle)

        if "XRef" not in doc:
            return False
        if "Trailer" not in doc["XRef"]:
            return False

        if "Info" not in doc["XRef"]["Trailer"]:
            doc["XRef"]["Trailer"][Name("Info")] = Dictionary()

        # change author
        doc["XRef"]["Trailer"]["Info"]["Author"] = String("Joris Schellekens")

        # determine output location
        out_file = self.output_dir / (file.stem + "_out.pdf")
        with open(out_file, "wb") as pdf_file_handle:
            PDF.dumps(pdf_file_handle, doc)

        return True
Example #4
0
    def test_document(self, file) -> bool:

        doc = None
        with open(file, "rb") as pdf_file_handle:
            doc = None
            with open(file, "rb") as pdf_file_handle:
                doc = PDF.loads(pdf_file_handle)

        if "XRef" not in doc:
            return False
        if "Trailer" not in doc["XRef"]:
            return False

        if "Info" not in doc["XRef"]["Trailer"]:
            doc["XRef"]["Trailer"][Name("Info")] = Dictionary()

        # change producer
        doc["XRef"]["Trailer"]["Info"]["Producer"] = String("pText")

        # determine output location
        out_file = self.output_dir / (file.stem + "_out.pdf")
        with open(out_file, "wb") as pdf_file_handle:
            PDF.dumps(out_file, doc)

        return True
Example #5
0
    def _read_trailer(self, src: io.IOBase,
                      tok: HighLevelTokenizer) -> Dictionary:

        # return None if there is no trailer
        token = tok.next_non_comment_token()
        assert token is not None
        if token.text != "trailer":
            return Dictionary()

        # if there is a keyword "trailer" the next token should be TokenType.START_DICT
        token = tok.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.START_DICT:
            raise PDFSyntaxError(
                byte_offset=tok.tell(),
                message="invalid XREF trailer",
            )

        # go back 2 chars "<<"
        src.seek(-2, io.SEEK_CUR)

        # read dictionary as trailer
        trailer_dict = tok.read_dictionary()

        # process startxref
        token = tok.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.OTHER or token.text != "startxref":
            raise PDFSyntaxError(
                byte_offset=token.byte_offset,
                message="start of XREF not found",
            )

        # return
        return trailer_dict
Example #6
0
    def transform(
        self,
        object_to_transform: AnyPDFType,
        context: Optional[WriteTransformerContext] = None,
    ):
        assert context is not None
        assert context.destination is not None
        assert isinstance(object_to_transform, Stream)

        # start object if needed
        started_object = False
        ref = object_to_transform.get_reference(
        )  # type: ignore [attr-defined]
        if ref is not None:
            assert isinstance(ref, Reference)
            assert ref.object_number is not None
            if ref in context.duplicate_references:
                logger.debug("skip writing object %d %d R (duplicate)" %
                             (ref.object_number, ref.generation_number or 0))
                return
            if ref.object_number is not None and ref.byte_offset is None:
                started_object = True
                self.start_object(object_to_transform, context)
            context.duplicate_references.append(ref)

        # build stream dictionary
        stream_dictionary = Dictionary()

        # objects to turn into reference
        queue: typing.List[AnyPDFType] = []
        for k, v in object_to_transform.items():
            if k in ["Bytes", "DecodedBytes"]:
                continue
            if (isinstance(v, Dictionary) or isinstance(v, List)
                    or isinstance(v, Stream)
                ) and v.can_be_referenced():  # type: ignore [union-attr]
                stream_dictionary[k] = self.get_reference(v, context)
                queue.append(v)
            else:
                stream_dictionary[k] = v

        # write stream dictionary
        self.get_root_transformer().transform(stream_dictionary, context)

        # write "stream"
        context.destination.write(bytes("stream\n", "latin1"))

        # write bytes
        context.destination.write(object_to_transform["Bytes"])

        # write "endstream"
        context.destination.write(bytes("\nendstream\n", "latin1"))

        # end object if needed
        if started_object:
            self.end_object(object_to_transform, context)

        for e in queue:
            self.get_root_transformer().transform(e, context)
Example #7
0
    def test_hash_types(self):

        obj0 = Dictionary()
        obj0[Name("Root")] = Reference(object_number=10)
        obj0[Name("Marked")] = Boolean(True)

        obj1 = List()
        obj1.append(Name("Red"))
        obj1.append(Decimal(0.5))

        print(hash(obj1))
Example #8
0
    def transform(
        self,
        object_to_transform: Any,
        context: Optional[WriteTransformerContext] = None,
    ):
        # write header
        assert context is not None
        assert context.destination is not None

        context.destination.write(b"%PDF-1.7\n")
        context.destination.write(b"%")
        context.destination.write(bytes([226, 227, 207, 211]))
        context.destination.write(b"\n")

        # invalidate all references
        WritePDFTransformer._invalidate_all_references(object_to_transform)

        # create Info dictionary if needed
        if "Info" not in object_to_transform["XRef"]["Trailer"]:
            object_to_transform["XRef"]["Trailer"][Name("Info")] = Dictionary()

        # set /ID
        random_id = HexadecimalString("%032x" % random.randrange(16**32))
        if "ID" not in object_to_transform["XRef"]["Trailer"]:
            object_to_transform["XRef"]["Trailer"][Name("ID")] = List(
            ).set_can_be_referenced(  # type: ignore [attr-defined]
                False)
            object_to_transform["XRef"]["Trailer"]["ID"].append(random_id)
            object_to_transform["XRef"]["Trailer"]["ID"].append(random_id)
        else:
            object_to_transform["XRef"]["Trailer"]["ID"][1] = random_id

        # set CreationDate
        modification_date = WritePDFTransformer._timestamp_to_str()
        if "CreationDate" not in object_to_transform["XRef"]["Trailer"][Name(
                "Info")]:
            object_to_transform["XRef"]["Trailer"][Name("Info")][Name(
                "CreationDate")] = String(modification_date)

        # set ModDate
        object_to_transform["XRef"]["Trailer"]["Info"][Name(
            "ModDate")] = String(modification_date)

        # set Producer
        object_to_transform["XRef"]["Trailer"]["Info"][Name(
            "Producer")] = String("pText")

        # transform XREF
        self.get_root_transformer().transform(object_to_transform["XRef"],
                                              context)
Example #9
0
    def read_dictionary(self) -> Dictionary:
        """
        This method processes the next tokens and returns a PDFDictionary.
        It fails and throws various errors if the next tokens do not represent a PDFDictionary.
        """
        token = self.next_non_comment_token()
        if token is None:
            raise PDFEOFError()
        if token.token_type != TokenType.START_DICT:
            raise PDFSyntaxError(message="invalid dictionary",
                                 byte_offset=token.byte_offset)

        out_dict = Dictionary()
        while True:

            # attempt to read name token
            token = self.next_non_comment_token()
            if token is None:
                raise PDFEOFError()
            if token.token_type == TokenType.END_DICT:
                break
            if token.token_type != TokenType.NAME:
                raise PDFSyntaxError(
                    message="dictionary key must be a name",
                    byte_offset=token.byte_offset,
                )

            # store name
            name = Name(token.text[1:])

            # attempt to read value
            value = self.read_object()
            if value is None:
                raise PDFSyntaxError(
                    message="unexpected end of dictionary",
                    byte_offset=token.byte_offset,
                )

            # store in dict object
            if name is not None:
                out_dict[name] = value

        return out_dict
Example #10
0
    def read(
        self,
        io_source: Union[io.BufferedIOBase, io.RawIOBase],
        tokenizer: HighLevelTokenizer,
        initial_offset: Optional[int] = None,
    ) -> "XREF":

        if initial_offset is not None:
            io_source.seek(initial_offset)
        else:
            self._seek_to_xref_token(io_source, tokenizer)

        xref_stream = tokenizer.read_object()
        assert isinstance(xref_stream, Stream)

        # check widths
        assert "W" in xref_stream
        assert all([
            isinstance(xref_stream["W"][x], Decimal)
            for x in range(0, len(xref_stream["W"]))
        ])
        # decode widths
        widths = [
            int(xref_stream["W"][x]) for x in range(0, len(xref_stream["W"]))
        ]
        total_entry_width = sum(widths)

        # parent
        document = self.get_root()  # type: ignore [attr-defined]

        # list of references
        indirect_references = [
            Reference(
                object_number=0,
                generation_number=65535,
                is_in_use=False,
                document=document,
            )
        ]

        # check size
        assert "Size" in xref_stream
        assert isinstance(xref_stream["Size"], Decimal)

        # get size
        number_of_objects = int(xref_stream["Size"])

        # index
        index = []
        if "Index" in xref_stream:
            index = xref_stream["Index"]
            assert isinstance(index, List)
            assert len(index) % 2 == 0
            assert isinstance(index[0], Decimal)
            assert isinstance(index[1], Decimal)
        else:
            index = [Decimal(0), Decimal(number_of_objects)]

        # apply filters
        xref_stream = decode_stream(xref_stream)

        # read every range specified in \Index
        xref_stream_decoded_bytes = xref_stream["DecodedBytes"]
        for idx in range(0, len(index), 2):
            start = int(index[idx])
            length = int(index[idx + 1])

            bptr = 0
            for i in range(0, length):

                # object number
                object_number = start + i

                # read type
                type = 1
                if widths[0] > 0:
                    type = 0
                    for j in range(0, widths[0]):
                        type = (type << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                        bptr += 1

                # read field 2
                field2 = 0
                for j in range(0, widths[1]):
                    field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                    bptr += 1

                # read field 3
                field3 = 0
                for j in range(0, widths[2]):
                    field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                    bptr += 1

                # check type
                assert type in [0, 1, 2]

                pdf_indirect_reference = None
                if type == 0:
                    # type      :The type of this entry, which shall be 0. Type 0 entries define
                    # the linked list of free objects (corresponding to f entries in a
                    # cross-reference table).
                    # field2    : The object number of the next free object
                    # field3    : The generation number to use if this object number is used again
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                        is_in_use=False,
                    )

                if type == 1:
                    # Type      : The type of this entry, which shall be 1. Type 1 entries define
                    # objects that are in use but are not compressed (corresponding
                    # to n entries in a cross-reference table).
                    # field2    : The byte offset of the object, starting from the beginning of the
                    # file.
                    # field3    : The generation number of the object. Default value: 0.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                    )

                if type == 2:
                    # Type      : The type of this entry, which shall be 2. Type 2 entries define
                    # compressed objects.
                    # field2    : The object number of the object stream in which this object is
                    # stored. (The generation number of the object stream shall be
                    # implicitly 0.)
                    # field3    : The index of this object within the object stream.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        generation_number=0,
                        parent_stream_object_number=field2,
                        index_in_parent_stream=field3,
                    )

                assert pdf_indirect_reference is not None

                # append
                existing_indirect_ref = next(
                    iter([
                        x for x in indirect_references
                        if x.object_number is not None
                        and x.object_number == Decimal(object_number)
                    ]),
                    None,
                )
                ref_is_in_reading_state = (
                    existing_indirect_ref is not None
                    and existing_indirect_ref.is_in_use
                    and existing_indirect_ref.generation_number
                    == pdf_indirect_reference.generation_number)
                ref_is_first_encountered = existing_indirect_ref is None or (
                    not ref_is_in_reading_state
                    and existing_indirect_ref.document is None)

                if ref_is_first_encountered:
                    assert pdf_indirect_reference is not None
                    indirect_references.append(pdf_indirect_reference)
                elif ref_is_in_reading_state:
                    assert existing_indirect_ref is not None
                    assert pdf_indirect_reference is not None
                    existing_indirect_ref.index_in_parent_stream = (
                        pdf_indirect_reference.index_in_parent_stream)
                    existing_indirect_ref.parent_stream_object_number = (
                        pdf_indirect_reference.parent_stream_object_number)

        # add section
        for r in indirect_references:
            self.append(r)

        # initialize trailer
        self["Trailer"] = Dictionary(xref_stream)

        # return
        return self
Example #11
0
    def _create_annotation(
        self,
        rectangle: Tuple[Decimal, Decimal, Decimal, Decimal],
        contents: Optional[str] = None,
        color: Optional[Color] = None,
        border_horizontal_corner_radius: Optional[Decimal] = None,
        border_vertical_corner_radius: Optional[Decimal] = None,
        border_width: Optional[Decimal] = None,
    ):
        annot = Dictionary()

        # (Optional) The type of PDF object that this dictionary describes; if
        # present, shall be Annot for an annotation dictionary.
        annot[Name("Type")] = Name("Annot")

        # (Required) The annotation rectangle, defining the location of the
        # annotation on the page in default user space units.
        annot[Name("Rect")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
        annot["Rect"].append(pDecimal(rectangle[0]))
        annot["Rect"].append(pDecimal(rectangle[1]))
        annot["Rect"].append(pDecimal(rectangle[2]))
        annot["Rect"].append(pDecimal(rectangle[3]))

        # (Optional) Text that shall be displayed for the annotation or, if this type of
        # annotation does not display text, an alternate description of the
        # annotation’s contents in human-readable form. In either case, this text is
        # useful when extracting the document’s contents in support of
        # accessibility to users with disabilities or for other purposes (see 14.9.3,
        # “Alternate Descriptions”). See 12.5.6, “Annotation Types” for more
        # details on the meaning of this entry for each annotation type.
        if contents is not None:
            annot[Name("Contents")] = String(contents)

        # (Optional except as noted below; PDF 1.3; not used in FDF files) An
        # indirect reference to the page object with which this annotation is
        # associated.
        # This entry shall be present in screen annotations associated with
        # rendition actions (PDF 1.5; see 12.5.6.18, “Screen Annotations” and
        # 12.6.4.13, “Rendition Actions”).
        annot[Name("P")] = self

        # (Optional; PDF 1.4) The annotation name, a text string uniquely
        # identifying it among all the annotations on its page.
        len_annots = len(self["Annots"]) if "Annots" in self else 0
        annot[Name("NM")] = String("annotation-{0:03d}".format(len_annots))

        # (Optional; PDF 1.1) The date and time when the annotation was most
        # recently modified. The format should be a date string as described in
        # 7.9.4, “Dates,” but conforming readers shall accept and display a string
        # in any format.
        annot[Name("M")] = String(self._timestamp_to_str())

        # (Optional; PDF 1.1) A set of flags specifying various characteristics of
        # the annotation (see 12.5.3, “Annotation Flags”). Default value: 0.
        annot[Name("F")] = pDecimal(4)

        # (Optional; PDF 1.2) An appearance dictionary specifying how the
        # annotation shall be presented visually on the page (see 12.5.5,
        # “Appearance Streams”). Individual annotation handlers may ignore this
        # entry and provide their own appearances.
        # annot[Name("AP")] = None

        # (Required if the appearance dictionary AP contains one or more
        # subdictionaries; PDF 1.2) The annotation’s appearance state, which
        # selects the applicable appearance stream from an appearance
        # subdictionary (see Section 12.5.5, “Appearance Streams”).
        # annot[Name("AS")] = None

        # Optional) An array specifying the characteristics of the annotation’s
        # border, which shall be drawn as a rounded rectangle.
        # (PDF 1.0) The array consists of three numbers defining the horizontal
        # corner radius, vertical corner radius, and border width, all in default user
        # space units. If the corner radii are 0, the border has square (not rounded)
        # corners; if the border width is 0, no border is drawn.
        # (PDF 1.1) The array may have a fourth element, an optional dash array
        # defining a pattern of dashes and gaps that shall be used in drawing the
        # border. The dash array shall be specified in the same format as in the
        # line dash pattern parameter of the graphics state (see 8.4.3.6, “Line
        # Dash Pattern”).
        if (
            border_horizontal_corner_radius is not None
            and border_vertical_corner_radius is not None
            and border_width is not None
        ):
            annot[Name("Border")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
            annot["Border"].append(pDecimal(border_horizontal_corner_radius))
            annot["Border"].append(pDecimal(border_vertical_corner_radius))
            annot["Border"].append(pDecimal(border_width))

        # (Optional; PDF 1.1) An array of numbers in the range 0.0 to 1.0,
        # representing a colour used for the following purposes:
        # The background of the annotation’s icon when closed
        # The title bar of the annotation’s pop-up window
        # The border of a link annotation
        # The number of array elements determines the colour space in which the
        # colour shall be defined
        if color is not None:
            color_max = pDecimal(256)
            annot[Name("C")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
            annot["C"].append(pDecimal(color.to_rgb().red / color_max))
            annot["C"].append(pDecimal(color.to_rgb().green / color_max))
            annot["C"].append(pDecimal(color.to_rgb().blue / color_max))

        # (Required if the annotation is a structural content item; PDF 1.3) The
        # integer key of the annotation’s entry in the structural parent tree (see
        # 14.7.4.4, “Finding Structure Elements from Content Items”)
        # annot[Name("StructParent")] = None

        # (Optional; PDF 1.5) An optional content group or optional content
        # membership dictionary (see 8.11, “Optional Content”) specifying the
        # optional content properties for the annotation. Before the annotation is
        # drawn, its visibility shall be determined based on this entry as well as the
        # annotation flags specified in the F entry (see 12.5.3, “Annotation Flags”).
        # If it is determined to be invisible, the annotation shall be skipped, as if it
        # were not in the document.
        # annot[Name("OC")] = None

        # return
        return annot
Example #12
0
    def read(self, io_source: io.IOBase) -> "Canvas":

        io_source.seek(0, os.SEEK_END)
        length = io_source.tell()
        io_source.seek(0)

        canvas_tokenizer = HighLevelTokenizer(io_source)

        # process content
        operand_stk = []
        while canvas_tokenizer.tell() != length:

            # attempt to read object
            obj = canvas_tokenizer.read_object()
            if obj is None:
                break

            # push argument onto stack
            if not isinstance(obj, CanvasOperatorName):
                operand_stk.append(obj)
                continue

            # process operator
            candidate_ops = [
                x for x in self.canvas_operators if x.get_text() == str(obj)
            ]
            if len(candidate_ops) == 1:
                operator = candidate_ops[0]
                if len(operand_stk) < operator.get_number_of_operands():
                    # if we are in a compatibility section ignore any possible mistake
                    if self.in_compatibility_section:
                        continue
                    raise IllegalGraphicsStateError(
                        message=
                        "Unable to execute operator %s. Expected %d arguments, received %d."
                        % (
                            operator.text,
                            operator.get_number_of_operands(),
                            len(operand_stk),
                        ))
                operands: typing.List["CanvasOperator"] = [
                ]  # type: ignore [name-defined]
                for _ in range(0, operator.get_number_of_operands()):
                    operands.insert(0, operand_stk.pop(-1))

                # append
                if "Instructions" not in self:
                    self["Instructions"] = List().set_parent(
                        self)  # type: ignore [attr-defined]

                instruction_number = len(self["Instructions"])
                instruction_dictionary = Dictionary()
                instruction_dictionary["Name"] = operator.get_text()
                instruction_dictionary["Args"] = List(
                ).set_parent(  # type: ignore [attr-defined]
                    instruction_dictionary)

                if len(operands) > 0:
                    for i in range(0, len(operands)):
                        instruction_dictionary["Args"].append(operands[i])
                self["Instructions"].append(instruction_dictionary)

                # debug
                logger.debug("%d %s %s" % (
                    instruction_number,
                    operator.text,
                    str([str(x) for x in operands]),
                ))

                # invoke
                try:
                    operator.invoke(self, operands)
                except Exception as e:
                    if not self.in_compatibility_section:
                        raise e

            # unknown operator
            if len(candidate_ops) == 0:
                # print("Missing OPERATOR %s" % obj)
                pass

        # return
        return self
Example #13
0
    def transform(
        self,
        object_to_transform: AnyPDFType,
        context: Optional[WriteTransformerContext] = None,
    ):
        assert isinstance(object_to_transform, XREF)
        assert "Trailer" in object_to_transform
        assert isinstance(object_to_transform["Trailer"], Dictionary)
        assert context is not None
        assert context.destination is not None

        # transform Trailer dictionary (replacing objects by references)
        trailer_out = Dictionary()
        # /Root
        trailer_out[Name("Root")] = self.get_reference(
            object_to_transform["Trailer"]["Root"], context)
        # /Info
        if "Info" in object_to_transform["Trailer"]:
            trailer_out[Name("Info")] = self.get_reference(
                object_to_transform["Trailer"]["Info"], context)
        # /Size
        if ("Trailer" in object_to_transform
                and "Size" in object_to_transform["Trailer"]):
            trailer_out[Name("Size")] = object_to_transform["Trailer"]["Size"]
        else:
            trailer_out[Name("Size")] = Decimal(0)
        # /ID
        if "ID" in object_to_transform["Trailer"]:
            trailer_out[Name("ID")] = self.get_reference(
                object_to_transform["Trailer"]["ID"], context)

        # write Root object
        self.get_root_transformer().transform(
            object_to_transform["Trailer"]["Root"], context)

        # write Info object
        if "Info" in object_to_transform["Trailer"]:
            self.get_root_transformer().transform(
                object_to_transform["Trailer"]["Info"], context)

        # write ID object
        if "ID" in object_to_transform["Trailer"]:
            self.get_root_transformer().transform(
                object_to_transform["Trailer"]["ID"], context)

        # write XREF
        start_of_xref = context.destination.tell()
        context.destination.write(bytes("xref\n", "latin1"))
        for section in self._section_xref(context):
            context.destination.write(
                bytes("%d %d\n" % (section[0].object_number, len(section)),
                      "latin1"))
            for r in section:
                if r.is_in_use:
                    context.destination.write(
                        bytes("{0:010d} 00000 n\n".format(r.byte_offset),
                              "latin1"))
                else:
                    context.destination.write(
                        bytes("{0:010d} 00000 f\n".format(r.byte_offset),
                              "latin1"))

        # update Size
        trailer_out[Name("Size")] = Decimal(
            sum([len(v) for k, v in context.indirect_objects.items()]))

        # write Trailer
        context.destination.write(bytes("trailer\n", "latin1"))
        self.get_root_transformer().transform(trailer_out, context)
        context.destination.write(bytes("startxref\n", "latin1"))

        # write byte offset of last cross-reference section
        context.destination.write(bytes(str(start_of_xref) + "\n", "latin1"))

        # write EOF
        context.destination.write(bytes("%%EOF", "latin1"))
    def read(self, io_source: io.IOBase) -> "Canvas":

        io_source.seek(0, os.SEEK_END)
        length = io_source.tell()
        io_source.seek(0)

        canvas_tokenizer = HighLevelTokenizer(io_source)

        # process content
        operand_stk = []
        while canvas_tokenizer.tell() != length:

            # print("<canvas pos='%d' length='%d' percentage='%d'/>" % ( canvas_tokenizer.tell(), length, int(canvas_tokenizer.tell() * 100 / length)))

            # attempt to read object
            obj = canvas_tokenizer.read_object()
            if obj is None:
                break

            # push argument onto stack
            if not isinstance(obj, CanvasOperatorName):
                operand_stk.append(obj)
                continue

            # process operator
            operator = self.canvas_operators.get(obj, None)
            if operator is None:
                logger.debug("Missing operator %s" % obj)
                continue

            if not self.in_compatibility_section:
                assert len(operand_stk) >= operator.get_number_of_operands()
            operands: typing.List["CanvasOperator"] = []  # type: ignore [name-defined]
            for _ in range(0, operator.get_number_of_operands()):
                operands.insert(0, operand_stk.pop(-1))

            # append
            if "Instructions" not in self:
                self["Instructions"] = List().set_parent(self)  # type: ignore [attr-defined]

            instruction_number = len(self["Instructions"])
            instruction_dictionary = Dictionary()
            instruction_dictionary["Name"] = operator.get_text()
            instruction_dictionary["Args"] = List().set_parent(  # type: ignore [attr-defined]
                instruction_dictionary
            )

            if len(operands) > 0:
                for i in range(0, len(operands)):
                    instruction_dictionary["Args"].append(operands[i])
            self["Instructions"].append(instruction_dictionary)

            # debug
            logger.debug(
                "%d %s %s"
                % (
                    instruction_number,
                    operator.text,
                    str([str(x) for x in operands]),
                )
            )

            # invoke
            try:
                operator.invoke(self, operands)
            except Exception as e:
                if not self.in_compatibility_section:
                    raise e

        # return
        return self
Example #15
0
def decode_stream(s: Stream) -> Stream:

    assert isinstance(s, Stream)
    assert "Bytes" in s

    # determine filter(s) to apply
    filters: typing.List[str] = []
    if "Filter" in s:
        if isinstance(s["Filter"], List):
            filters = s["Filter"]
        else:
            filters = [s["Filter"]]

    decode_params: typing.List[Dictionary] = []
    if "DecodeParms" in s:
        if isinstance(s["DecodeParms"], List):
            decode_params = s["DecodeParms"]
        else:
            assert s["DecodeParms"] is not None
            assert isinstance(s["DecodeParms"], Dictionary)
            decode_params = [s["DecodeParms"]]
    else:
        decode_params = [Dictionary() for x in range(0, len(filters))]

    # apply filter(s)
    transformed_bytes = s["Bytes"]
    for filter_index, filter_name in enumerate(filters):
        # FLATE
        if filter_name in ["FlateDecode", "Fl"]:
            transformed_bytes = FlateDecode.decode(
                bytes_in=transformed_bytes,
                columns=int(decode_params[filter_index].get(
                    "Columns", Decimal(1))),
                predictor=int(decode_params[filter_index].get(
                    "Predictor", Decimal(1))),
                bits_per_component=int(decode_params[filter_index].get(
                    "BitsPerComponent", Decimal(8))),
            )
            continue

        # ASCII85
        if filter_name in ["ASCII85Decode"]:
            transformed_bytes = ASCII85Decode.decode(transformed_bytes)
            continue

        # LZW
        if filter_name in ["LZWDecode"]:
            transformed_bytes = LZWDecode.decode(transformed_bytes)
            continue

        # RunLengthDecode
        if filter_name in ["RunLengthDecode"]:
            transformed_bytes = RunLengthDecode.decode(transformed_bytes)
            continue

        # unknown filter
        raise PDFValueError(
            expected_value_description=
            "[/ASCII85Decode, /FlateDecode, /Fl, /LZWDecode, /RunLengthDecode]",
            received_value_description=str(filter_name),
        )

    # set DecodedBytes
    s[Name("DecodedBytes")] = transformed_bytes

    # set Type if not yet set
    if "Type" not in s:
        s[Name("Type")] = Name("Stream")

    # return
    return s