def get_reference(self, object: AnyPDFType,
                      context: WriteTransformerContext) -> Reference:

        # look through existing indirect objects
        obj_hash: int = hash(object)
        if obj_hash in context.indirect_objects:
            for obj in context.indirect_objects[obj_hash]:
                if obj == object:
                    ref = obj.get_reference()  # type: ignore [union-attr]
                    assert ref is not None
                    assert isinstance(ref, Reference)
                    return ref

        # generate new object number
        existing_obj_numbers = set([
            item.get_reference().object_number  # type: ignore [union-attr]
            for sublist in [v for k, v in context.indirect_objects.items()]
            for item in sublist
        ])
        obj_number = len(existing_obj_numbers) + 1
        while obj_number in existing_obj_numbers:  # type: ignore [union-attr]
            obj_number += 1

        # build reference
        ref = Reference(object_number=obj_number)
        object.set_reference(ref)  # type: ignore [union-attr]

        # insert into context
        if obj_hash in context.indirect_objects:
            context.indirect_objects[obj_hash].append(object)
        else:
            context.indirect_objects[obj_hash] = [object]

        # return
        return ref
Beispiel #2
0
    def _read_section(self, src: io.IOBase,
                      tok: HighLevelTokenizer) -> List[Reference]:

        tokens = [tok.next_non_comment_token() for _ in range(0, 2)]
        assert tokens[0] is not None
        assert tokens[1] is not None
        if tokens[0].text in ["trailer", "startxref"]:
            src.seek(tokens[0].byte_offset)
            return []
        if tokens[0].token_type != TokenType.NUMBER:
            raise PDFValueError(
                byte_offset=tokens[0].byte_offset,
                expected_value_description="number",
                received_value_description=tokens[0].text,
            )
        if tokens[1].token_type != TokenType.NUMBER:
            raise PDFValueError(
                byte_offset=tokens[1].byte_offset,
                expected_value_description="number",
                received_value_description=tokens[1].text,
            )

        start_object_number = int(tokens[0].text)
        number_of_objects = int(tokens[1].text)
        indirect_references = []

        # read subsection
        for i in range(0, number_of_objects):
            tokens = [tok.next_non_comment_token() for _ in range(0, 3)]
            assert tokens[0] is not None
            assert tokens[1] is not None
            assert tokens[2] is not None
            if tokens[0].text in ["trailer", "startxref"]:
                raise PDFSyntaxError(
                    byte_offset=tokens[0].byte_offset,
                    message="unexpected EOF while processing XREF",
                )
            if (tokens[0].token_type != TokenType.NUMBER
                    or tokens[1].token_type != TokenType.NUMBER
                    or tokens[2].token_type != TokenType.OTHER
                    or tokens[2].text not in ["f", "n"]):
                raise PDFSyntaxError(
                    byte_offset=tokens[0].byte_offset,
                    message="invalid XREF line",
                )

            indirect_references.append(
                Reference(
                    object_number=start_object_number + i,
                    byte_offset=int(tokens[0].text),
                    generation_number=int(tokens[1].text),
                    is_in_use=(tokens[2].text == "n"),
                ))

        # return
        return indirect_references
Beispiel #3
0
    def test_hash_types(self):

        obj0 = Dictionary()
        obj0[Name("Root")] = Reference(object_number=10)
        obj0[Name("Marked")] = Boolean(True)

        obj1 = List()
        obj1.append(Name("Red"))
        obj1.append(Decimal(0.5))

        print(hash(obj1))
Beispiel #4
0
    def _section_xref(self, context: Optional[WriteTransformerContext] = None):
        assert context is not None

        # get all references
        indirect_objects: typing.List[AnyPDFType] = [
            item
            for sublist in [v for k, v in context.indirect_objects.items()]
            for item in sublist
        ]
        references: typing.List[Reference] = []
        for obj in indirect_objects:
            ref = obj.get_reference()  # type: ignore [union-attr]
            if ref is not None:
                references.append(ref)
        # sort
        references.sort(key=lambda x: x.object_number)

        # insert magic entry if needed
        if len(references) == 0 or references[0].generation_number != 65535:
            references.insert(
                0,
                Reference(
                    generation_number=65535,
                    object_number=0,
                    byte_offset=0,
                    is_in_use=False,
                ),
            )

        # divide into sections
        sections = [[references[0]]]
        for i in range(1, len(references)):
            ref = references[i]
            prev_object_number = sections[-1][-1].object_number
            assert prev_object_number is not None
            if ref.object_number == prev_object_number + 1:
                sections[-1].append(ref)
            else:
                sections.append([ref])

        # return
        return sections
Beispiel #5
0
    def read_indirect_object(self) -> Optional[AnyPDFType]:
        """
        This method processes the next tokens and returns an indirect PDFObject.
        It fails and throws various errors if the next tokens do not represent an indirect PDFObject.
        """

        # read object number
        token = self.next_non_comment_token()
        assert token is not None
        byte_offset = token.byte_offset
        if token.token_type != TokenType.NUMBER or not re.match(
                "^[0-9]+$", token.text):
            self.seek(byte_offset)
            return None
        object_number = int(token.text)

        # read generation number
        token = self.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.NUMBER or not re.match(
                "^[0-9]+$", token.text):
            self.seek(byte_offset)
            return None
        generation_number = int(token.text)

        # read 'obj'
        token = self.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.OTHER or token.text != "obj":
            self.seek(byte_offset)
            return None

        # read obj
        value = self.read_object()
        if value is not None:
            value.set_reference(  # type: ignore[union-attr]
                Reference(object_number=object_number,
                          generation_number=generation_number))

        # return
        return value
Beispiel #6
0
    def read_indirect_reference(self) -> Optional[Reference]:
        """
        This method processes the next tokens and returns an indirect reference.
        It fails and throws various errors if the next tokens do not represent an indirect reference.
        """

        # read object number
        token = self.next_non_comment_token()
        assert token is not None
        byte_offset = token.byte_offset
        if token.token_type != TokenType.NUMBER or not re.match(
                "^[0-9]+$", token.text):
            self.seek(byte_offset)
            return None
        object_number = int(token.text)

        # read generation number
        token = self.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.NUMBER or not re.match(
                "^[0-9]+$", token.text):
            self.seek(byte_offset)
            return None
        generation_number = int(token.text)

        # read 'R'
        token = self.next_non_comment_token()
        assert token is not None
        if token.token_type != TokenType.OTHER or token.text != "R":
            self.seek(byte_offset)
            return None

        # return
        return Reference(
            object_number=object_number,
            generation_number=generation_number,
        )
Beispiel #7
0
    def read(
        self,
        io_source: Union[io.BufferedIOBase, io.RawIOBase],
        tokenizer: HighLevelTokenizer,
        initial_offset: Optional[int] = None,
    ) -> "XREF":

        if initial_offset is not None:
            io_source.seek(initial_offset)
        else:
            self._seek_to_xref_token(io_source, tokenizer)

        xref_stream = tokenizer.read_object()
        assert isinstance(xref_stream, Stream)

        # check widths
        assert "W" in xref_stream
        assert all([
            isinstance(xref_stream["W"][x], Decimal)
            for x in range(0, len(xref_stream["W"]))
        ])
        # decode widths
        widths = [
            int(xref_stream["W"][x]) for x in range(0, len(xref_stream["W"]))
        ]
        total_entry_width = sum(widths)

        # parent
        document = self.get_root()  # type: ignore [attr-defined]

        # list of references
        indirect_references = [
            Reference(
                object_number=0,
                generation_number=65535,
                is_in_use=False,
                document=document,
            )
        ]

        # check size
        assert "Size" in xref_stream
        assert isinstance(xref_stream["Size"], Decimal)

        # get size
        number_of_objects = int(xref_stream["Size"])

        # index
        index = []
        if "Index" in xref_stream:
            index = xref_stream["Index"]
            assert isinstance(index, List)
            assert len(index) % 2 == 0
            assert isinstance(index[0], Decimal)
            assert isinstance(index[1], Decimal)
        else:
            index = [Decimal(0), Decimal(number_of_objects)]

        # apply filters
        xref_stream = decode_stream(xref_stream)

        # read every range specified in \Index
        xref_stream_decoded_bytes = xref_stream["DecodedBytes"]
        for idx in range(0, len(index), 2):
            start = int(index[idx])
            length = int(index[idx + 1])

            bptr = 0
            for i in range(0, length):

                # object number
                object_number = start + i

                # read type
                type = 1
                if widths[0] > 0:
                    type = 0
                    for j in range(0, widths[0]):
                        type = (type << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                        bptr += 1

                # read field 2
                field2 = 0
                for j in range(0, widths[1]):
                    field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                    bptr += 1

                # read field 3
                field3 = 0
                for j in range(0, widths[2]):
                    field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr]
                                              & 0xFF)
                    bptr += 1

                # check type
                assert type in [0, 1, 2]

                pdf_indirect_reference = None
                if type == 0:
                    # type      :The type of this entry, which shall be 0. Type 0 entries define
                    # the linked list of free objects (corresponding to f entries in a
                    # cross-reference table).
                    # field2    : The object number of the next free object
                    # field3    : The generation number to use if this object number is used again
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                        is_in_use=False,
                    )

                if type == 1:
                    # Type      : The type of this entry, which shall be 1. Type 1 entries define
                    # objects that are in use but are not compressed (corresponding
                    # to n entries in a cross-reference table).
                    # field2    : The byte offset of the object, starting from the beginning of the
                    # file.
                    # field3    : The generation number of the object. Default value: 0.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        byte_offset=field2,
                        generation_number=field3,
                    )

                if type == 2:
                    # Type      : The type of this entry, which shall be 2. Type 2 entries define
                    # compressed objects.
                    # field2    : The object number of the object stream in which this object is
                    # stored. (The generation number of the object stream shall be
                    # implicitly 0.)
                    # field3    : The index of this object within the object stream.
                    pdf_indirect_reference = Reference(
                        document=document,
                        object_number=object_number,
                        generation_number=0,
                        parent_stream_object_number=field2,
                        index_in_parent_stream=field3,
                    )

                assert pdf_indirect_reference is not None

                # append
                existing_indirect_ref = next(
                    iter([
                        x for x in indirect_references
                        if x.object_number is not None
                        and x.object_number == Decimal(object_number)
                    ]),
                    None,
                )
                ref_is_in_reading_state = (
                    existing_indirect_ref is not None
                    and existing_indirect_ref.is_in_use
                    and existing_indirect_ref.generation_number
                    == pdf_indirect_reference.generation_number)
                ref_is_first_encountered = existing_indirect_ref is None or (
                    not ref_is_in_reading_state
                    and existing_indirect_ref.document is None)

                if ref_is_first_encountered:
                    assert pdf_indirect_reference is not None
                    indirect_references.append(pdf_indirect_reference)
                elif ref_is_in_reading_state:
                    assert existing_indirect_ref is not None
                    assert pdf_indirect_reference is not None
                    existing_indirect_ref.index_in_parent_stream = (
                        pdf_indirect_reference.index_in_parent_stream)
                    existing_indirect_ref.parent_stream_object_number = (
                        pdf_indirect_reference.parent_stream_object_number)

        # add section
        for r in indirect_references:
            self.append(r)

        # initialize trailer
        self["Trailer"] = Dictionary(xref_stream)

        # return
        return self