Example #1
0
    def read(self, io_source: io.IOBase) -> "Canvas":
        """
        Read a byte stream of canvas operators and execute them on this Canvas.

        Every object produced by the tokenizer is pushed onto an operand stack
        until a CanvasOperatorName is read. The matching operator is looked up
        in self.canvas_operators and invoked with as many operands as its
        get_number_of_operands() reports, taken from the top of the stack in
        the order in which they were read.

        Operator names without an entry in self.canvas_operators are logged and
        skipped. While self.in_compatibility_section is truthy, an operator
        that lacks operands, or whose invocation raises, is skipped instead of
        aborting the read.

        :param io_source:   seekable stream holding the canvas operators
        :return:            this Canvas
        :raises AssertionError: outside a compatibility section, when an
                                operator finds fewer operands on the stack
                                than it requires
        """
        # measure the stream so the loop knows when everything was consumed
        io_source.seek(0, os.SEEK_END)
        length = io_source.tell()
        io_source.seek(0)

        canvas_tokenizer = HighLevelTokenizer(io_source)

        # process content
        operand_stk = []
        instruction_number: int = 0
        while canvas_tokenizer.tell() != length:

            # attempt to read object
            obj = canvas_tokenizer.read_object()
            if obj is None:
                break

            # push argument onto stack
            if not isinstance(obj, CanvasOperatorName):
                operand_stk.append(obj)
                continue

            # process operator
            instruction_number += 1
            operator = self.canvas_operators.get(obj, None)
            if operator is None:
                logger.debug("Missing operator %s" % obj)
                continue

            number_of_operands = operator.get_number_of_operands()
            if len(operand_stk) < number_of_operands:
                if not self.in_compatibility_section:
                    # raised explicitly (rather than via `assert`) so the check
                    # survives `python -O` while callers still see the same
                    # exception type as before
                    raise AssertionError(
                        "operator %s expects %d operands, found %d"
                        % (obj, number_of_operands, len(operand_stk))
                    )
                # inside a compatibility section a malformed operator must not
                # abort the read: drop what was supplied for it and move on
                logger.debug(
                    "Skipping operator %s: expected %d operands, found %d"
                    % (obj, number_of_operands, len(operand_stk))
                )
                operand_stk.clear()
                continue

            # take the last `number_of_operands` entries off the stack in one
            # go, keeping their original order (an explicit split index is
            # used because `operand_stk[-0:]` would copy the whole stack)
            split_at = len(operand_stk) - number_of_operands
            operands: typing.List["CanvasOperator"] = operand_stk[split_at:]  # type: ignore [name-defined]
            del operand_stk[split_at:]

            # debug
            operand_str = str([str(x) for x in operands])
            if len(operands) == 1 and isinstance(operands[0], list):
                operand_str = str([str(x) for x in operands[0]])

            logger.debug("%d %s %s" % (instruction_number, operator.text, operand_str))

            # invoke
            try:
                operator.invoke(self, operands)
            except Exception:
                # errors are only tolerated inside a compatibility section
                if not self.in_compatibility_section:
                    raise

        # return
        return self
Example #2
0
    def get_object(
        self,
        indirect_reference: Union[Reference, int],
        src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
        tok: HighLevelTokenizer,
    ) -> Optional[AnyPDFType]:
        """
        Look up an object in this XREF table, by Reference or by object number.

        The entry whose object_number matches is taken from self.entries and
        resolved in one of two ways:
        - an entry with a byte_offset is read from `tok` at that offset
          (the tokenizer position is restored afterwards, even on error);
        - an entry with a parent_stream_object_number and an
          index_in_parent_stream is read from the decoded bytes of that
          parent stream.
        Objects that do not live in a parent stream are stored in self.cache
        under their object number and served from there on later calls.

        :param indirect_reference:  Reference, or object number (int / Decimal)
        :param src:                 underlying byte source; only passed on to
                                    the recursive lookups
        :param tok:                 tokenizer used to read objects by offset
        :return:                    the object, or None when there is no
                                    matching entry, the entry is not in use,
                                    or the index lies outside the parent stream
        """
        # cache (consulted for plain object numbers as well, so that a parent
        # stream is not read and decoded again for every object it contains)
        cache_key = None
        if isinstance(indirect_reference, Reference):
            if indirect_reference.parent_stream_object_number is None:
                assert indirect_reference.object_number is not None
                cache_key = indirect_reference.object_number
        elif isinstance(indirect_reference, (int, Decimal)):
            cache_key = int(indirect_reference)
        if cache_key is not None:
            cached_obj = self.cache.get(cache_key, None)
            if cached_obj is not None:
                return cached_obj

        # determine the object number to look for
        if isinstance(indirect_reference, Reference):
            wanted_number = indirect_reference.object_number
        else:
            assert isinstance(indirect_reference, (int, Decimal))
            wanted_number = int(indirect_reference)

        # lookup Reference (in self); stop at the first matching entry
        entry = next(
            (x for x in self.entries if x.object_number == wanted_number),
            None,
        )
        if entry is None:
            return None
        assert isinstance(entry, Reference)

        # reference points to an object that is not in use: there is nothing
        # to read, and its byte offset must not be followed
        if not entry.is_in_use:
            return None

        obj = None

        # the indirect reference may have a byte offset
        if entry.byte_offset is not None:
            byte_offset = int(entry.byte_offset)
            tell_before = tok.tell()
            try:
                tok.seek(byte_offset)
                obj = tok.read_object(xref=self)
            finally:
                # the caller keeps using `tok`; never leave it repositioned
                tok.seek(tell_before)

        # entry specifies a parent object
        if (entry.parent_stream_object_number is not None
                and entry.index_in_parent_stream is not None):

            stream_object = self.get_object(
                entry.parent_stream_object_number, src, tok)
            assert isinstance(stream_object, Stream)
            assert "Length" in stream_object
            assert "First" in stream_object

            # Length may be Reference
            if isinstance(stream_object["Length"], Reference):
                stream_object[Name("Length")] = self.get_object(
                    stream_object["Length"], src=src, tok=tok)

            # First may be Reference
            if isinstance(stream_object["First"], Reference):
                stream_object[Name("First")] = self.get_object(
                    stream_object["First"], src=src, tok=tok)

            first_byte = int(stream_object.get("First", 0))
            if "DecodedBytes" not in stream_object:
                try:
                    stream_object = decode_stream(stream_object)
                except Exception:
                    logger.debug(
                        "unable to inflate stream for object %d" %
                        entry.parent_stream_object_number)
                    raise
            # objects are tokenized starting at byte "First" of the decoded data
            stream_bytes = stream_object["DecodedBytes"][first_byte:]

            # tokenize parent stream
            # NOTE(review): the index is bounded by "Length" here; the number
            # of objects in an object stream is normally given by its "N"
            # entry - confirm that this bound is the intended one.
            index = int(entry.index_in_parent_stream)
            length = int(stream_object["Length"])
            obj = None
            if index < length:
                # a dedicated tokenizer, so the caller's `tok` stays untouched
                stream_tok = HighLevelTokenizer(io.BytesIO(stream_bytes))
                # objects are stored back to back: read up to and including
                # the wanted one, keeping only the last
                for _ in range(index + 1):
                    obj = stream_tok.read_object()

        # update cache
        if entry.parent_stream_object_number is None:
            assert entry.object_number is not None
            self.cache[entry.object_number] = obj

        # return
        return obj
Example #3
0
    def read(self, cmap_bytes: str) -> "CMap":
        """
        Parse the text of a CMap and register the mappings it declares.

        Two sections are processed; in both, the token that precedes the
        section keyword is read as the number of entries that follow:
        - beginbfchar:  pairs <code> <destination>
        - beginbfrange: triples <first code> <last code> followed by either
          one <destination> (incremented for each consecutive code) or a list
          holding one destination per code
        Every mapping is handed to self._add_symbol. All other tokens are
        ignored.

        :param cmap_bytes:  the CMap as a string (it is encoded as latin-1
                            before being tokenized)
        :return:            this CMap
        """
        N = len(cmap_bytes)
        tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))

        prev_token: Optional[Token] = None
        while tok.tell() < N:

            token = tok.next_non_comment_token()
            if token is None:
                break

            # beginbfchar
            if token.text == "beginbfchar":
                assert prev_token is not None
                n = int(prev_token.text)
                for _ in range(n):
                    obj = tok.read_object()
                    assert isinstance(obj, HexadecimalString)
                    c = self._hex_string_to_int_or_tuple(obj)
                    assert isinstance(c, int)

                    obj = tok.read_object()
                    assert isinstance(obj, HexadecimalString)
                    uc = self._hex_string_to_int_or_tuple(obj)

                    self._add_symbol(c, uc)
                continue

            # beginbfrange
            if token.text == "beginbfrange":
                assert prev_token is not None
                n = int(prev_token.text)
                for _ in range(n):

                    c_start_token = tok.read_object()
                    assert isinstance(c_start_token, HexadecimalString)
                    c_start = int(str(c_start_token), 16)

                    c_end_token = tok.read_object()
                    assert isinstance(c_end_token, HexadecimalString)
                    c_end = int(str(c_end_token), 16)

                    # number of codes in the (inclusive) range
                    span = c_end - c_start + 1

                    tmp = tok.read_object()
                    if isinstance(tmp, HexadecimalString):
                        uc = self._hex_string_to_int_or_tuple(tmp)
                        if isinstance(uc, int):
                            for k in range(span):
                                self._add_symbol(c_start + k, uc + k)
                        elif isinstance(uc, tuple):
                            # only the last element is incremented; the
                            # leading elements are kept whatever the length
                            # of the tuple
                            for k in range(span):
                                self._add_symbol(c_start + k,
                                                 uc[:-1] + (uc[-1] + k,))

                    elif isinstance(tmp, list):
                        # one destination per code; a list shorter than the
                        # range maps only the codes it provides
                        for k, item in zip(range(span), tmp):
                            uc = self._hex_string_to_int_or_tuple(item)
                            self._add_symbol(c_start + k, uc)

            # default
            prev_token = token

        return self