def _read_trailer(self, src: io.IOBase, tok: HighLevelTokenizer) -> Dictionary:
    """
    Read the trailer dictionary that follows a classic XREF table.

    :param src: the byte source containing the PDF, positioned just after the XREF section
    :param tok: tokenizer wrapping src
    :return: the trailer Dictionary, or an empty Dictionary if no "trailer" keyword is present
    :raises PDFSyntaxError: if "trailer" is not followed by a dictionary,
                            or the "startxref" keyword is missing afterwards
    """
    # return an empty Dictionary if there is no "trailer" keyword
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text != "trailer":
        return Dictionary()

    # if there is a keyword "trailer" the next token should be TokenType.START_DICT
    token = tok.next_non_comment_token()
    assert token is not None
    if token.token_type != TokenType.START_DICT:
        raise PDFSyntaxError(
            byte_offset=tok.tell(),
            message="invalid XREF trailer",
        )

    # go back 2 chars "<<" so read_dictionary sees the full dictionary opener
    src.seek(-2, io.SEEK_CUR)

    # read dictionary as trailer
    trailer_dict = tok.read_dictionary()

    # the trailer must be followed by the "startxref" keyword
    token = tok.next_non_comment_token()
    assert token is not None
    if token.token_type != TokenType.OTHER or token.text != "startxref":
        raise PDFSyntaxError(
            byte_offset=token.byte_offset,
            message="start of XREF not found",
        )

    # return
    return trailer_dict
def _seek_to_xref_token(self, src: io.IOBase, tok: HighLevelTokenizer):
    """
    Position src at the start of the cross-reference data.

    Locates the trailing "startxref" keyword by scanning backwards from EOF.
    If the keyword found there is "xref", the source is rewound to it; if it
    is "startxref", the numeric offset that follows is read and the source is
    positioned at that offset.

    :param src: the byte source containing the PDF; its position is modified
    :param tok: tokenizer wrapping src
    :raises StartXREFTokenNotFoundError: if "startxref" does not occur in the file
    :raises PDFSyntaxError: if "startxref" is not followed by a number
    """
    # locate the "startxref" keyword near the end of the file
    keyword_offset = self._find_backwards(src, tok, "startxref")
    assert keyword_offset is not None
    if keyword_offset == -1:
        raise StartXREFTokenNotFoundError()

    # re-read the token at that offset
    src.seek(keyword_offset)
    keyword = tok.next_non_comment_token()
    assert keyword is not None

    # found "xref" directly: rewind so the caller can consume it
    if keyword.text == "xref":
        src.seek(keyword_offset)
        return

    # found "startxref": the next token holds the byte offset of the XREF,
    # so jump there
    if keyword.text == "startxref":
        offset_token = tok.next_non_comment_token()
        assert offset_token is not None
        if offset_token.token_type != TokenType.NUMBER:
            raise PDFSyntaxError(
                byte_offset=offset_token.byte_offset,
                message="invalid XREF",
            )
        src.seek(int(offset_token.text))
def _read_section(self, src: io.IOBase,
                  tok: HighLevelTokenizer) -> List[Reference]:
    """
    Read a single subsection of a classic XREF table.

    A subsection starts with "<first-object-number> <object-count>" and is
    followed by one 3-token line per object. When "trailer" or "startxref"
    is encountered instead of a header, the source is rewound and an empty
    list is returned to signal the end of the table.

    :param src: the byte source containing the PDF
    :param tok: tokenizer wrapping src
    :return: the References read (possibly empty)
    :raises PDFValueError: if the subsection header is not two numbers
    :raises PDFSyntaxError: if an entry line is malformed or the table ends early
    """
    header = [tok.next_non_comment_token() for _ in range(2)]
    assert header[0] is not None
    assert header[1] is not None

    # end of the table: rewind so the caller can consume the keyword
    if header[0].text in ("trailer", "startxref"):
        src.seek(header[0].byte_offset)
        return []

    # both header tokens must be numbers
    for header_token in header:
        if header_token.token_type != TokenType.NUMBER:
            raise PDFValueError(
                byte_offset=header_token.byte_offset,
                expected_value_description="number",
                received_value_description=header_token.text,
            )

    first_object_number = int(header[0].text)
    object_count = int(header[1].text)

    # read one entry per object in the subsection
    references = []
    for n in range(object_count):
        entry = [tok.next_non_comment_token() for _ in range(3)]
        assert entry[0] is not None
        assert entry[1] is not None
        assert entry[2] is not None

        # running into a keyword here means the table was cut short
        if entry[0].text in ("trailer", "startxref"):
            raise PDFSyntaxError(
                byte_offset=entry[0].byte_offset,
                message="unexpected EOF while processing XREF",
            )

        # an entry is "<offset> <generation> f|n"
        entry_is_valid = (entry[0].token_type == TokenType.NUMBER
                          and entry[1].token_type == TokenType.NUMBER
                          and entry[2].token_type == TokenType.OTHER
                          and entry[2].text in ("f", "n"))
        if not entry_is_valid:
            raise PDFSyntaxError(
                byte_offset=entry[0].byte_offset,
                message="invalid XREF line",
            )

        references.append(
            Reference(
                object_number=first_object_number + n,
                byte_offset=int(entry[0].text),
                generation_number=int(entry[1].text),
                is_in_use=(entry[2].text == "n"),
            ))

    # return
    return references
def _find_backwards( self, src: io.IOBase, tok: HighLevelTokenizer, text_to_find: str, ) -> int: # length of str to check str_len = 1024 # go to end of file src.seek(0, io.SEEK_END) file_length = src.tell() pos = file_length - str_len if pos < 1: pos = 1 while pos > 0: src.seek(pos) bytes_near_eof = "".join( [tok._next_char() for _ in range(0, str_len)]) idx = bytes_near_eof.find(text_to_find) if idx >= 0: return pos + idx pos = pos - str_len + len(text_to_find) # raise error return -1
def read(
    self,
    src: io.IOBase,
    tok: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Read a classic cross-reference table (and its trailer) into this XREF.

    :param src: the byte source containing the PDF
    :param tok: tokenizer wrapping src
    :param initial_offset: byte offset of the "xref" keyword; when None,
                           the offset is located via the trailing "startxref"
    :return: self
    :raises XREFTokenNotFoundError: if the "xref" keyword is not found
    """
    # position the source at the start of the XREF table
    if initial_offset is None:
        self._seek_to_xref_token(src, tok)
    else:
        src.seek(initial_offset)

    # the first token must be the "xref" keyword
    keyword = tok.next_non_comment_token()
    assert keyword is not None
    if keyword.text != "xref":
        raise XREFTokenNotFoundError()

    # consume subsections until an empty one marks the end of the table
    while True:
        section = self._read_section(src, tok)
        if not section:
            break
        for reference in section:
            self.append(reference)

    # the trailer dictionary follows the table
    self["Trailer"] = self._read_trailer(src, tok)

    # return self
    return self
def read(self, cmap_bytes: str) -> "CMap":
    """
    Parse a CMap program and populate this CMap's symbol table.

    Handles "beginbfchar" (single code -> unicode mappings) and
    "beginbfrange" (a range of codes mapped either by incrementing a single
    destination, or via an explicit list of destinations).

    :param cmap_bytes: the CMap program text
                       (despite the name, this is a str; it is encoded latin-1 for tokenizing)
    :return: self
    """
    N = len(cmap_bytes)
    tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))
    prev_token = None
    while tok.tell() < N:
        token = tok.next_non_comment_token()
        if token is None:
            break
        # beginbfchar: prev_token holds the number of single-char mappings.
        # NOTE(review): assumes a numeric token always precedes the keyword —
        # prev_token would be None for malformed input; confirm inputs are well-formed
        if token.text == "beginbfchar":
            n = int(prev_token.text)
            for j in range(0, n):
                c = self._hex_string_to_int_or_tuple(tok.read_object())
                uc = self._hex_string_to_int_or_tuple(tok.read_object())
                self._add_symbol(c, uc)
            continue
        # beginbfrange: prev_token holds the number of range entries
        if token.text == "beginbfrange":
            n = int(prev_token.text)
            for j in range(0, n):
                c_start_token = tok.read_object()
                c_start = int(c_start_token, 16)
                c_end_token = tok.read_object()
                c_end = int(c_end_token, 16)
                tmp = tok.read_object()
                # a single hex string maps the whole range by incrementing
                # the destination alongside the source code
                if isinstance(tmp, HexadecimalString):
                    uc = self._hex_string_to_int_or_tuple(tmp)
                    for k in range(0, c_end - c_start + 1):
                        if isinstance(uc, int):
                            self._add_symbol(c_start + k, uc + k)
                        elif isinstance(uc, tuple):
                            self._add_symbol(c_start + k, (uc[0], uc[1] + k))
                # a list supplies one explicit destination per code in the range
                # NOTE(review): assumes len(tmp) covers the whole range — confirm
                elif isinstance(tmp, list):
                    for k in range(0, c_end - c_start + 1):
                        uc = self._hex_string_to_int_or_tuple(tmp[k])
                        self._add_symbol(c_start + k, uc)
        # default: remember this token (it may be the count for the next keyword)
        prev_token = token
    return self
def transform(
    self,
    object_to_transform: Union[io.BufferedIOBase, io.RawIOBase, AnyPDFType],
    parent_object: Any,
    context: Optional[TransformerContext] = None,
    event_listeners: typing.Optional[typing.List[EventListener]] = None,
) -> Any:
    """
    Transform a raw PDF byte stream into a Document object.

    :param object_to_transform: the PDF byte stream (a buffered or raw binary stream)
    :param parent_object: the parent object (unused here; part of the transformer interface)
    :param context: the transformer context; must not be None
    :param event_listeners: listeners to attach to the root Document
                            (default None means no listeners; previously this was a
                            mutable default [] — the shared-mutable-default pitfall)
    :return: the root Document
    :raises NotImplementedError: for encrypted (password-protected) documents
    """
    # normalize the listener list (avoids a shared mutable default argument)
    if event_listeners is None:
        event_listeners = []

    # update context
    assert context is not None
    assert isinstance(object_to_transform, io.BufferedIOBase) or isinstance(
        object_to_transform, io.RawIOBase)
    context.root_object = Document()
    context.source = object_to_transform
    context.tokenizer = HighLevelTokenizer(context.source)

    # add listener(s)
    for listener in event_listeners:
        context.root_object.add_event_listener(listener)

    # remove prefix (any bytes that precede the PDF header)
    self._remove_prefix(context)

    # check header
    self._check_header(context)

    # record file size on the root object
    context.source.seek(0, os.SEEK_END)
    file_length = context.source.tell()
    context.source.seek(0)
    context.root_object["FileSize"] = Decimal(file_length)

    # build XREF object
    self._read_xref(context)

    # transform trailer dictionary
    xref = context.root_object.get("XRef")
    if "Trailer" in xref and "Encrypt" in xref["Trailer"]:
        # TODO: add support for encrypted documents
        raise NotImplementedError(
            "password-protected PDFs are currently not supported")

    trailer = self.get_root_transformer().transform(
        context.root_object["XRef"]["Trailer"],
        context.root_object,
        context,
        [],
    )
    xref["Trailer"] = trailer
    # strip stream-decoding bookkeeping keys from the transformed trailer
    for k in ["DecodeParms", "Filter", "Index", "Length", "Prev", "W"]:
        if k in xref["Trailer"]:
            xref["Trailer"].pop(k)

    # return
    return context.root_object
def read(
    self,
    io_source: Union[io.BufferedIOBase, io.RawIOBase],
    tokenizer: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Read a cross-reference stream and append its entries to this XREF.

    The stream's /W array gives the byte width of each of the three fields per
    entry; /Index lists (start, count) pairs of object-number ranges; /Size is
    the total number of objects. Entry types: 0 = free object, 1 = in-use
    uncompressed object, 2 = object stored inside an object stream.

    :param io_source: the byte source containing the PDF
    :param tokenizer: tokenizer wrapping io_source
    :param initial_offset: byte offset of the XREF stream; when None, the
                           offset is located via the trailing "startxref"
    :return: self, with one Reference per entry and "Trailer" initialized
    """
    # position the source at the XREF stream
    if initial_offset is not None:
        io_source.seek(initial_offset)
    else:
        self._seek_to_xref_token(io_source, tokenizer)

    # the XREF is itself a stream object
    xref_stream = tokenizer.read_object()
    assert isinstance(xref_stream, Stream)

    # check widths: /W must be present and all-numeric
    assert "W" in xref_stream
    assert all([
        isinstance(xref_stream["W"][x], Decimal)
        for x in range(0, len(xref_stream["W"]))
    ])
    # decode widths
    widths = [
        int(xref_stream["W"][x]) for x in range(0, len(xref_stream["W"]))
    ]
    # NOTE(review): total_entry_width is computed but never used below
    total_entry_width = sum(widths)

    # parent document
    document = self.get_root()  # type: ignore [attr-defined]

    # list of references; object 0 is always the head of the free-object list
    indirect_references = [
        Reference(
            object_number=0,
            generation_number=65535,
            is_in_use=False,
            document=document,
        )
    ]

    # check size
    assert "Size" in xref_stream
    assert isinstance(xref_stream["Size"], Decimal)
    # get size
    number_of_objects = int(xref_stream["Size"])

    # /Index lists (start, count) pairs; default is one range covering all objects
    index = []
    if "Index" in xref_stream:
        index = xref_stream["Index"]
        assert isinstance(index, List)
        assert len(index) % 2 == 0
        assert isinstance(index[0], Decimal)
        assert isinstance(index[1], Decimal)
    else:
        index = [Decimal(0), Decimal(number_of_objects)]

    # apply filters to obtain the raw entry bytes
    xref_stream = decode_stream(xref_stream)

    # read every range specified in \Index
    xref_stream_decoded_bytes = xref_stream["DecodedBytes"]
    for idx in range(0, len(index), 2):
        start = int(index[idx])
        length = int(index[idx + 1])
        # NOTE(review): bptr restarts at 0 for every /Index range; with more
        # than one range this re-reads the first range's bytes — confirm
        # whether it should persist across ranges instead
        bptr = 0
        for i in range(0, length):
            # object number for this entry
            object_number = start + i
            # read type; when the first field has width 0, type defaults to 1
            type = 1
            if widths[0] > 0:
                type = 0
                for j in range(0, widths[0]):
                    type = (type << 8) + (xref_stream_decoded_bytes[bptr]
                                          & 0xFF)
                    bptr += 1
            # read field 2 (big-endian, widths[1] bytes)
            field2 = 0
            for j in range(0, widths[1]):
                field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr]
                                          & 0xFF)
                bptr += 1
            # read field 3 (big-endian, widths[2] bytes)
            field3 = 0
            for j in range(0, widths[2]):
                field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr]
                                          & 0xFF)
                bptr += 1
            # check type
            assert type in [0, 1, 2]

            pdf_indirect_reference = None
            if type == 0:
                # type 0: free object (an f entry in a classic table)
                # field2 : the object number of the next free object
                # field3 : the generation number to use if this object number is used again
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                    is_in_use=False,
                )
            if type == 1:
                # type 1: in-use, uncompressed object (an n entry)
                # field2 : the byte offset of the object from the start of the file
                # field3 : the generation number of the object (default 0)
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                )
            if type == 2:
                # type 2: compressed object stored inside an object stream
                # field2 : the object number of the containing object stream
                #          (whose generation number is implicitly 0)
                # field3 : the index of this object within that object stream
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    generation_number=0,
                    parent_stream_object_number=field2,
                    index_in_parent_stream=field3,
                )
            assert pdf_indirect_reference is not None

            # append, merging with an earlier entry for the same object number
            existing_indirect_ref = next(
                iter([
                    x for x in indirect_references
                    if x.object_number is not None
                    and x.object_number == Decimal(object_number)
                ]),
                None,
            )
            ref_is_in_reading_state = (
                existing_indirect_ref is not None
                and existing_indirect_ref.is_in_use
                and existing_indirect_ref.generation_number ==
                pdf_indirect_reference.generation_number)
            ref_is_first_encountered = existing_indirect_ref is None or (
                not ref_is_in_reading_state
                and existing_indirect_ref.document is None)

            if ref_is_first_encountered:
                assert pdf_indirect_reference is not None
                indirect_references.append(pdf_indirect_reference)
            elif ref_is_in_reading_state:
                # update the placeholder entry with its compressed-object location
                assert existing_indirect_ref is not None
                assert pdf_indirect_reference is not None
                existing_indirect_ref.index_in_parent_stream = (
                    pdf_indirect_reference.index_in_parent_stream)
                existing_indirect_ref.parent_stream_object_number = (
                    pdf_indirect_reference.parent_stream_object_number)

    # add section
    for r in indirect_references:
        self.append(r)

    # initialize trailer from the stream's own dictionary
    self["Trailer"] = Dictionary(xref_stream)

    # return
    return self
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Read a content stream and execute its operators against this Canvas.

    Objects read from the stream are pushed on an operand stack until an
    operator name is encountered; the operator then consumes its operands.
    Each executed operator is also recorded under self["Instructions"].

    :param io_source: the byte source containing the content stream
    :return: self
    :raises IllegalGraphicsStateError: if an operator lacks operands
                                       (outside a compatibility section)
    """
    # determine stream length, then rewind
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)

    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    while canvas_tokenizer.tell() != length:
        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break
        # push argument onto stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue
        # process operator: find the operator whose text matches
        candidate_ops = [
            x for x in self.canvas_operators if x.get_text() == str(obj)
        ]
        if len(candidate_ops) == 1:
            operator = candidate_ops[0]
            if len(operand_stk) < operator.get_number_of_operands():
                # if we are in a compatibility section ignore any possible mistake
                if self.in_compatibility_section:
                    continue
                raise IllegalGraphicsStateError(
                    message="Unable to execute operator %s. Expected %d arguments, received %d."
                    % (
                        operator.text,
                        operator.get_number_of_operands(),
                        len(operand_stk),
                    )
                )
            # pop the operands, preserving their original (stream) order
            operands = []
            for _ in range(0, operator.get_number_of_operands()):
                operands.insert(0, operand_stk.pop(-1))

            # append an instruction record
            if "Instructions" not in self:
                self["Instructions"] = List().set_parent(self)

            instruction_number = len(self["Instructions"])
            instruction_dictionary = Dictionary()
            instruction_dictionary["Name"] = operator.get_text()
            instruction_dictionary["Args"] = List().set_parent(
                instruction_dictionary
            )

            if len(operands) > 0:
                for i in range(0, len(operands)):
                    instruction_dictionary["Args"].append(operands[i])
            self["Instructions"].append(instruction_dictionary)

            # debug
            logger.debug(
                "%d %s %s"
                % (
                    instruction_number,
                    operator.text,
                    str([str(x) for x in operands]),
                )
            )

            # invoke; inside a compatibility section failures are ignored
            try:
                operator.invoke(self, operands)
            except Exception as e:
                if not self.in_compatibility_section:
                    raise e

        # unknown operator: silently skipped
        # NOTE(review): len(candidate_ops) > 1 is also silently skipped — confirm intended
        if len(candidate_ops) == 0:
            # print("Missing OPERATOR %s" % obj)
            pass

    # return
    return self
def get(
    self,
    indirect_reference: Union[Reference, int],
    src: io.IOBase,
    tok: HighLevelTokenizer,
) -> Optional[AnyPDFType]:
    """
    Resolve an indirect reference to the object it designates.

    Accepts either an object number or a Reference; both are first resolved
    against this XREF's entries. Objects with a byte offset are read directly
    from the file; objects stored in an object stream are read by (recursively)
    fetching the parent stream, decoding it, and tokenizing its contents.

    :param indirect_reference: the reference (or object number) to resolve
    :param src: the byte source containing the PDF
    :param tok: tokenizer wrapping src
    :return: the referenced object, or None if it cannot be resolved
    :raises PDFTypeError: if a parent object stream lacks /Length or /First
    """
    # cache
    obj = None

    # lookup Reference object for int
    if isinstance(indirect_reference, int) or isinstance(
            indirect_reference, Decimal):
        refs = [
            x for x in self.entries
            if x.object_number == int(indirect_reference)
        ]
        if len(refs) == 0:
            return None
        indirect_reference = refs[0]

    # lookup Reference (in self) for Reference
    elif isinstance(indirect_reference, Reference):
        refs = [
            x for x in self.entries
            if x.object_number == indirect_reference.object_number
        ]
        if len(refs) == 0:
            return None
        indirect_reference = refs[0]

    # reference points to an object that is not in use
    # NOTE(review): obj is set to None here but the byte_offset branch below
    # may still run and overwrite it — confirm intended
    assert isinstance(indirect_reference, Reference)
    if not indirect_reference.is_in_use:
        obj = None

    # the indirect reference may have a byte offset: read the object there,
    # restoring the tokenizer position afterwards
    if indirect_reference.byte_offset is not None:
        byte_offset = int(indirect_reference.byte_offset)
        tell_before = tok.tell()
        tok.seek(byte_offset)
        obj = tok.read_object(xref=self)
        tok.seek(tell_before)

    # entry specifies a parent object stream: resolve through it
    if indirect_reference.parent_stream_object_number is not None:
        stream_object = self.get(
            indirect_reference.parent_stream_object_number, src, tok)
        assert isinstance(stream_object, dict)
        if "Length" not in stream_object:
            raise PDFTypeError(expected_type=Union[Decimal, Reference],
                               received_type=None)
        if "First" not in stream_object:
            raise PDFTypeError(expected_type=Union[Decimal, Reference],
                               received_type=None)

        # Length may be Reference
        if isinstance(stream_object["Length"], Reference):
            stream_object["Length"] = self.get(stream_object["Length"],
                                               src=src,
                                               tok=tok)
        # First may be Reference
        if isinstance(stream_object["First"], Reference):
            stream_object["First"] = self.get(stream_object["First"],
                                              src=src,
                                              tok=tok)

        # /First is the offset of the first object's data within the stream
        first_byte = int(stream_object.get("First", 0))
        if "DecodedBytes" not in stream_object:
            try:
                stream_object = decode_stream(stream_object)
            except Exception as ex:
                logger.debug(
                    "unable to inflate stream for object %d" %
                    indirect_reference.parent_stream_object_number)
                raise ex
        stream_bytes = stream_object["DecodedBytes"][first_byte:]

        # tokenize parent stream: read objects up to (and including) the
        # requested index; the last one read is the target
        index = int(indirect_reference.index_in_parent_stream)
        length = int(stream_object["Length"])
        if index < length:
            tok = HighLevelTokenizer(io.BytesIO(stream_bytes))
            obj = [tok.read_object() for x in range(0, index + 1)]
            obj = obj[-1]
        else:
            obj = None

    # return
    return obj
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Read a content stream and execute its operators against this Canvas.

    Objects read from the stream are pushed on an operand stack until an
    operator name is encountered; the operator (looked up in the
    canvas_operators dict) then consumes its operands. Each executed operator
    is also recorded under self["Instructions"]. Unknown operators are logged
    and skipped.

    :param io_source: the byte source containing the content stream
    :return: self
    """
    # determine stream length, then rewind
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)

    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    while canvas_tokenizer.tell() != length:
        # print("<canvas pos='%d' length='%d' percentage='%d'/>" % ( canvas_tokenizer.tell(), length, int(canvas_tokenizer.tell() * 100 / length)))
        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break
        # push argument onto stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue
        # process operator: dict lookup by name
        operator = self.canvas_operators.get(obj, None)
        if operator is None:
            logger.debug("Missing operator %s" % obj)
            continue
        # outside a compatibility section, a shortage of operands is fatal
        if not self.in_compatibility_section:
            assert len(operand_stk) >= operator.get_number_of_operands()
        # pop the operands, preserving their original (stream) order
        # NOTE(review): the annotation says List["CanvasOperator"], but these
        # are operand values, not operators — confirm the intended type
        operands: typing.List["CanvasOperator"] = []  # type: ignore [name-defined]
        for _ in range(0, operator.get_number_of_operands()):
            operands.insert(0, operand_stk.pop(-1))

        # append an instruction record
        if "Instructions" not in self:
            self["Instructions"] = List().set_parent(self)  # type: ignore [attr-defined]

        instruction_number = len(self["Instructions"])
        instruction_dictionary = Dictionary()
        instruction_dictionary["Name"] = operator.get_text()
        instruction_dictionary["Args"] = List().set_parent(  # type: ignore [attr-defined]
            instruction_dictionary
        )

        if len(operands) > 0:
            for i in range(0, len(operands)):
                instruction_dictionary["Args"].append(operands[i])
        self["Instructions"].append(instruction_dictionary)

        # debug
        logger.debug(
            "%d %s %s"
            % (
                instruction_number,
                operator.text,
                str([str(x) for x in operands]),
            )
        )

        # invoke; inside a compatibility section failures are ignored
        try:
            operator.invoke(self, operands)
        except Exception as e:
            if not self.in_compatibility_section:
                raise e

    # return
    return self