def _seek_to_xref_token(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
):
    """
    Position *src* at the start of the cross-reference table.

    Searches backwards from the end of the file for the "startxref"
    keyword, then seeks *src* either to the "xref" keyword itself or to
    the byte offset announced right after "startxref".
    """
    # locate the "startxref" keyword near the end of the file
    keyword_offset = self._find_backwards(src, tok, "startxref")
    assert keyword_offset is not None
    assert keyword_offset != -1

    # position the tokenizer on that keyword and read it
    src.seek(keyword_offset)
    keyword = tok.next_non_comment_token()
    assert keyword is not None

    # already at "xref": rewind onto it and stop
    if keyword.text == "xref":
        src.seek(keyword_offset)
        return

    # at "startxref": the following NUMBER token is the byte offset of
    # the XREF, so jump there
    if keyword.text == "startxref":
        offset_token = tok.next_non_comment_token()
        assert offset_token is not None
        assert offset_token.token_type == TokenType.NUMBER
        src.seek(int(offset_token.text))
def _read_trailer(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> Dictionary:
    """
    Read the trailer dictionary that follows a plaintext XREF section.

    Returns an empty Dictionary when the "trailer" keyword is absent;
    otherwise returns the parsed trailer dictionary, leaving the
    tokenizer positioned just past the "startxref" keyword.
    """
    # without the "trailer" keyword there is nothing to read
    keyword = tok.next_non_comment_token()
    assert keyword is not None
    if keyword.text != "trailer":
        return Dictionary()

    # "trailer" must be immediately followed by a dictionary ("<<")
    dict_open = tok.next_non_comment_token()
    assert dict_open is not None
    assert dict_open.token_type == TokenType.START_DICT

    # rewind over the 2 chars of "<<" so read_dictionary sees the whole dict
    src.seek(-2, io.SEEK_CUR)
    trailer_dict = tok.read_dictionary()

    # the trailer is terminated by the "startxref" keyword
    end_keyword = tok.next_non_comment_token()
    assert end_keyword is not None
    assert end_keyword.token_type == TokenType.OTHER
    assert end_keyword.text == "startxref"

    return trailer_dict
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    This method reads a byte stream of canvas operators, and processes them,
    returning this Canvas afterwards

    :param io_source:   seekable stream of PDF content-stream bytes
    :return:            this Canvas (fluent interface)
    """
    # determine total length so the loop knows when the stream is exhausted
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)
    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    instruction_number: int = 0
    while canvas_tokenizer.tell() != length:

        # attempt to read object; None signals end of parsable content
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break

        # non-operator objects are operands: push onto the stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue

        # process operator
        instruction_number += 1
        operator = self.canvas_operators.get(obj, None)
        if operator is None:
            # unknown operators are skipped (their operands stay on the stack)
            logger.debug("Missing operator %s" % obj)
            continue

        # outside a BX/EX compatibility section an operand underflow is fatal
        if not self.in_compatibility_section:
            assert len(operand_stk) >= operator.get_number_of_operands()

        # pop operands in reverse so they end up in source order
        operands: typing.List["CanvasOperator"] = []  # type: ignore [name-defined]
        for _ in range(0, operator.get_number_of_operands()):
            operands.insert(0, operand_stk.pop(-1))

        # debug
        operand_str = str([str(x) for x in operands])
        if len(operands) == 1 and isinstance(operands[0], list):
            operand_str = str([str(x) for x in operands[0]])
        logger.debug("%d %s %s" % (instruction_number, operator.text, operand_str))

        # invoke; inside a compatibility section errors are swallowed,
        # otherwise they propagate to the caller
        try:
            operator.invoke(self, operands)
        except Exception as e:
            if not self.in_compatibility_section:
                raise e

    # return
    return self
def _find_backwards( self, src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO], tok: HighLevelTokenizer, text_to_find: str, ) -> int: # length of str to check str_len = 1024 # go to end of file src.seek(0, io.SEEK_END) file_length = src.tell() pos = file_length - str_len if pos < 1: pos = 1 while pos > 0: src.seek(pos) bytes_near_eof = "".join( [tok._next_char() for _ in range(0, str_len)]) idx = bytes_near_eof.find(text_to_find) if idx >= 0: return pos + idx pos = pos - str_len + len(text_to_find) # raise error return -1
def read(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    This method attempts to read a plaintext XREF from the given io_source.
    It will either throw an exception, or return this XREF

    :param src:             seekable byte source of the PDF
    :param tok:             tokenizer wrapping *src*
    :param initial_offset:  known byte offset of the XREF, or None to
                            locate it by scanning backwards for "startxref"
    :return:                this XREF (fluent interface)
    """
    # position the stream at the start of the XREF
    if initial_offset is None:
        self._seek_to_xref_token(src, tok)
    else:
        src.seek(initial_offset)

    # the first token must be the "xref" keyword
    keyword = tok.next_non_comment_token()
    assert keyword is not None
    assert keyword.text == "xref"

    # read subsections until an empty one signals the end of the table
    while True:
        section = self._read_section(src, tok)
        if not section:
            break
        for reference in section:
            self.append(reference)

    # the trailer dictionary follows the last subsection
    self[Name("Trailer")] = self._read_trailer(src, tok)

    # return self
    return self
def _read_section(
    self,
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> List[Reference]:
    """
    Read one XREF subsection: a "<start> <count>" header followed by
    <count> entries of the form "<offset> <generation> (f|n)".

    Returns [] (after rewinding the stream) when the next token starts
    the trailer instead of another subsection.
    """
    header = [tok.next_non_comment_token() for _ in range(0, 2)]
    assert header[0] is not None
    assert header[1] is not None

    # "trailer" / "startxref" marks the end of the table: rewind and stop
    if header[0].text in ["trailer", "startxref"]:
        src.seek(header[0].byte_offset)
        return []

    # subsection header: first object number + number of entries
    assert header[0].token_type == TokenType.NUMBER
    assert header[1].token_type == TokenType.NUMBER
    first_object_number = int(header[0].text)
    entry_count = int(header[1].text)

    # read subsection entries
    references = []
    for entry_index in range(0, entry_count):
        entry = [tok.next_non_comment_token() for _ in range(0, 3)]
        assert entry[0] is not None
        assert entry[0].text not in ["trailer", "startxref"]
        assert entry[0].token_type == TokenType.NUMBER
        assert entry[1] is not None
        assert entry[1].token_type == TokenType.NUMBER
        assert entry[2] is not None
        assert entry[2].token_type == TokenType.OTHER
        assert entry[2].text in ["f", "n"]
        references.append(
            Reference(
                object_number=first_object_number + entry_index,
                byte_offset=int(entry[0].text),
                generation_number=int(entry[1].text),
                is_in_use=(entry[2].text == "n"),
            )
        )

    # return
    return references
def transform(
    self,
    object_to_transform: Union[io.BufferedIOBase, io.RawIOBase, AnyPDFType],
    parent_object: Any,
    context: Optional[ReadTransformerContext] = None,
    event_listeners: typing.Optional[typing.List[EventListener]] = None,
) -> Any:
    """
    Read the XREF of a PDF byte stream and return the root Document object.

    :param object_to_transform: buffered/raw byte stream of the PDF
    :param parent_object:       unused here; present for transformer-interface symmetry
    :param context:             read context (required; asserted non-None)
    :param event_listeners:     optional listeners attached to the root Document
                                (default None; FIX: was a mutable [] default,
                                a shared-mutable-default pitfall)
    :return:                    the root Document object
    :raises NotImplementedError: when the PDF is password-protected
    """
    # update context
    assert context is not None
    assert isinstance(object_to_transform, io.BufferedIOBase) or isinstance(
        object_to_transform, io.RawIOBase)
    context.root_object = Document()
    context.source = object_to_transform
    context.tokenizer = HighLevelTokenizer(context.source)

    # add listener(s)
    if event_listeners is None:
        event_listeners = []
    for l in event_listeners:
        context.root_object.add_event_listener(
            l)  # type: ignore [attr-defined]

    # remove prefix (bytes before the "%PDF-" header)
    ReadXREFTransformer._remove_prefix(context)

    # check header
    ReadXREFTransformer._check_header(context)

    # file size (stored on the root object for later consumers)
    context.source.seek(0, os.SEEK_END)
    file_length = context.source.tell()
    context.source.seek(0)
    context.root_object[Name("FileSize")] = Decimal(file_length)

    # build XREF object
    self._read_xref(context)

    # transform trailer dictionary
    xref = context.root_object.get("XRef")
    assert xref is not None
    assert isinstance(xref, XREF)

    # check for password protected PDF
    if "Trailer" in xref and "Encrypt" in xref["Trailer"]:
        # TODO
        raise NotImplementedError(
            "password-protected PDFs are currently not supported")

    # transform \Trailer
    trailer = self.get_root_transformer().transform(
        context.root_object["XRef"]["Trailer"],
        context.root_object,
        context,
        [],
    )
    assert trailer is not None
    assert isinstance(trailer, Dictionary)
    xref[Name("Trailer")] = trailer

    # strip stream-XREF bookkeeping keys that have no meaning in the
    # transformed trailer
    for k in ["DecodeParms", "Filter", "Index", "Length", "Prev", "W"]:
        if k in xref["Trailer"]:
            xref["Trailer"].pop(k)

    # return
    return context.root_object
def get_object(
    self,
    indirect_reference: Union[Reference, int],
    src: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tok: HighLevelTokenizer,
) -> Optional[AnyPDFType]:
    """
    This function looks up an object in this XREF table.
    Objects can be looked up by Reference, or object number.

    :param indirect_reference:  a Reference, or a plain object number
    :param src:                 byte source of the PDF (used when resolving
                                objects nested in object streams)
    :param tok:                 tokenizer over *src*
    :return:                    the resolved object, or None if the reference
                                is unknown / not in use
    """
    # cache: only direct (non-object-stream) references are cached
    if (isinstance(indirect_reference, Reference)
            and indirect_reference.parent_stream_object_number is None):
        assert indirect_reference.object_number is not None
        cached_obj = self.cache.get(indirect_reference.object_number, None)
        if cached_obj is not None:
            return cached_obj

    # lookup Reference object for int
    obj = None
    if isinstance(indirect_reference, int) or isinstance(
            indirect_reference, Decimal):
        refs = [
            x for x in self.entries
            if x.object_number == int(indirect_reference)
        ]
        if len(refs) == 0:
            return None
        indirect_reference = refs[0]

    # lookup Reference (in self) for Reference
    # (re-resolves against this table's own entries)
    elif isinstance(indirect_reference, Reference):
        refs = [
            x for x in self.entries
            if x.object_number == indirect_reference.object_number
        ]
        if len(refs) == 0:
            return None
        indirect_reference = refs[0]

    # reference points to an object that is not in use
    assert isinstance(indirect_reference, Reference)
    if not indirect_reference.is_in_use:
        obj = None

    # the indirect reference may have a byte offset: read the object at
    # that position, restoring the tokenizer position afterwards
    if indirect_reference.byte_offset is not None:
        byte_offset = int(indirect_reference.byte_offset)
        tell_before = tok.tell()
        tok.seek(byte_offset)
        obj = tok.read_object(xref=self)
        tok.seek(tell_before)

    # entry specifies a parent object: the target lives inside a
    # compressed object stream, which must be resolved (recursively),
    # decoded, and tokenized
    if (indirect_reference.parent_stream_object_number is not None
            and indirect_reference.index_in_parent_stream is not None):
        stream_object = self.get_object(
            indirect_reference.parent_stream_object_number, src, tok)
        assert isinstance(stream_object, Stream)
        assert "Length" in stream_object
        assert "First" in stream_object

        # Length may be Reference
        if isinstance(stream_object["Length"], Reference):
            stream_object[Name("Length")] = self.get_object(
                stream_object["Length"], src=src, tok=tok)

        # First may be Reference
        if isinstance(stream_object["First"], Reference):
            stream_object[Name("First")] = self.get_object(
                stream_object["First"], src=src, tok=tok)

        # "First" is the byte offset (within the decoded stream) of the
        # first object; everything before it is the object-number index
        first_byte = int(stream_object.get("First", 0))
        if "DecodedBytes" not in stream_object:
            try:
                stream_object = decode_stream(stream_object)
            except Exception as ex:
                logger.debug(
                    "unable to inflate stream for object %d" %
                    indirect_reference.parent_stream_object_number)
                raise ex
        stream_bytes = stream_object["DecodedBytes"][first_byte:]

        # tokenize parent stream: read objects up to (and including) the
        # wanted index; the last one read is the target
        index = int(indirect_reference.index_in_parent_stream)
        length = int(stream_object["Length"])
        # NOTE(review): "Length" is the stream's byte length, not the number
        # of contained objects ("N") — this bound looks looser than intended;
        # confirm against the object-stream spec before changing.
        if index < length:
            tok = HighLevelTokenizer(io.BytesIO(stream_bytes))
            list_of_objs = [tok.read_object() for x in range(0, index + 1)]
            obj = list_of_objs[-1]
        else:
            obj = None

    # update cache (again, only for direct references)
    if indirect_reference.parent_stream_object_number is None:
        assert indirect_reference.object_number is not None
        self.cache[indirect_reference.object_number] = obj

    # return
    return obj
def read(self, cmap_bytes: str) -> "CMap":
    """
    Read a CMap character stream, populating this CMap with the
    character-code -> unicode mappings found in its "beginbfchar" and
    "beginbfrange" sections.

    :param cmap_bytes:  the CMap program as a latin-1-encodable string
    :return:            this CMap (fluent interface)
    """
    N = len(cmap_bytes)
    tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))
    # prev_token holds the entry count that precedes each section keyword
    prev_token: Optional[Token] = None
    while tok.tell() < N:
        token = tok.next_non_comment_token()
        if token is None:
            break

        # beginbfchar: <char_code> <unicode> pairs
        if token.text == "beginbfchar":
            assert prev_token is not None
            n = int(prev_token.text)
            for j in range(0, n):
                obj = tok.read_object()
                assert isinstance(obj, HexadecimalString)
                c = self._hex_string_to_int_or_tuple(obj)
                assert isinstance(c, int)
                obj = tok.read_object()
                assert isinstance(obj, HexadecimalString)
                uc = self._hex_string_to_int_or_tuple(obj)
                self._add_symbol(c, uc)
            continue

        # beginbfrange: <start> <end> followed by either a base value
        # (applied incrementally over the range) or an explicit array
        if token.text == "beginbfrange":
            assert prev_token is not None
            n = int(prev_token.text)
            for j in range(0, n):
                c_start_token = tok.read_object()
                assert c_start_token is not None
                assert isinstance(c_start_token, HexadecimalString)
                c_start = int(str(c_start_token), 16)
                c_end_token = tok.read_object()
                assert c_end_token is not None
                assert isinstance(c_end_token, HexadecimalString)
                c_end = int(str(c_end_token), 16)
                tmp = tok.read_object()
                # base value: each code in the range maps to base + offset
                if isinstance(tmp, HexadecimalString):
                    uc = self._hex_string_to_int_or_tuple(tmp)
                    for k in range(0, c_end - c_start + 1):
                        if isinstance(uc, int):
                            self._add_symbol(c_start + k, uc + k)
                        elif isinstance(uc, tuple):
                            # multi-part value: only the last part increments
                            self._add_symbol(c_start + k, (uc[0], uc[1] + k))
                # explicit array: one destination per code in the range
                elif isinstance(tmp, list):
                    for k in range(0, c_end - c_start + 1):
                        uc = self._hex_string_to_int_or_tuple(tmp[k])
                        self._add_symbol(c_start + k, uc)

        # default
        prev_token = token
    return self
def _read_cmap(cmap_bytes: bytes) -> typing.Dict[int, str]:
    """
    Parse a CMap byte stream and return a map of character code -> unicode string.

    Supports the "beginbfchar", "begincidrange" and "beginbfrange" sections;
    every other token is only remembered as a potential entry-count prefix.

    :param cmap_bytes:  raw bytes of the CMap program
    :return:            dict mapping character codes to unicode strings
    """
    out_map: typing.Dict[int, str] = {}
    cmap_tokenizer: HighLevelTokenizer = HighLevelTokenizer(io.BytesIO(cmap_bytes))

    # process stream
    prev_token: typing.Optional[Token] = None
    number_of_bytes = len(cmap_bytes)
    while cmap_tokenizer.tell() < number_of_bytes:
        token: typing.Optional[Token] = cmap_tokenizer.next_non_comment_token()
        assert token is not None

        # beginbfchar: <char_code> <unicode> pairs
        if token.text == "beginbfchar":
            assert prev_token is not None
            number_of_lines_001: int = int(prev_token.text)
            for _ in range(0, number_of_lines_001):
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                char_code: int = int(token.text[1:-1], 16)
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                unicode_str: str = str(token.text)[1:-1].replace(" ", "")
                # every 4 hex digits encode one 16-bit code unit
                unicode_str = "".join(
                    [
                        chr(int(unicode_str[j: j + 4], 16))
                        for j in range(0, len(unicode_str), 4)
                    ]
                )
                out_map[char_code] = unicode_str

        # begincidrange: <start> <stop> base
        if token.text == "begincidrange":
            assert prev_token is not None
            number_of_lines_002: int = int(prev_token.text)
            for _ in range(0, number_of_lines_002):
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                char_code_start_002: int = int(token.text[1:-1], 16)
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                char_code_stop_002: int = int(token.text[1:-1], 16)
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                # <FFFF> <FFFF> 0000
                if token.token_type == TokenType.NUMBER:
                    unicode_base: int = int(token.text)
                    for i in range(char_code_start_002, char_code_stop_002 + 1):
                        out_map[i] = chr(unicode_base + (i - char_code_start_002))

        # beginbfrange
        if token.text == "beginbfrange":
            assert prev_token is not None
            number_of_lines_003: int = int(prev_token.text)
            for _ in range(0, number_of_lines_003):
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                char_code_start_003: int = int(token.text[1:-1], 16)
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                char_code_stop_003: int = int(token.text[1:-1], 16)
                token = cmap_tokenizer.next_non_comment_token()
                assert token is not None
                # <FFFF> <FFFF> <FFFF>
                if token.token_type == TokenType.HEX_STRING:
                    unicode_base_str: str = str(token.text)[1:-1]
                    for i in range(char_code_start_003, char_code_stop_003 + 1):
                        unicode_hex: str = hex(int(unicode_base_str, 16) + (i - char_code_start_003))[2:]
                        # NOTE(review): when len(unicode_hex) is already a multiple
                        # of 4 this prepends a full "0000" group (a NUL code unit);
                        # kept as-is to preserve existing behavior — confirm.
                        unicode_hex = ("".join(["0" for _ in range(0, 4 - len(unicode_hex) % 4)]) + unicode_hex)
                        unicode_hex = "".join([chr(int(unicode_hex[j: j + 4], 16)) for j in range(0, len(unicode_hex), 4)])
                        out_map[i] = unicode_hex
                    continue
                # <FFFF> <FFFF> [ <FFFF>* ]
                if token.token_type == TokenType.START_ARRAY:
                    for i in range(char_code_start_003, char_code_stop_003 + 1):
                        token = cmap_tokenizer.next_non_comment_token()
                        assert token is not None
                        unicode_base_str_003: str = str(token.text)[1:-1]
                        # BUG FIX: the original concatenated unicode_base_str (a
                        # stale — or unbound — variable from the hex-string branch)
                        # instead of unicode_base_str_003, the array element that
                        # was just read for this code.
                        unicode_hex = ("".join(["0" for _ in range(0, 4 - len(unicode_base_str_003) % 4)]) + unicode_base_str_003)
                        unicode_hex = "".join([chr(int(unicode_hex[j: j + 4], 16)) for j in range(0, len(unicode_hex), 4)])
                        out_map[i] = unicode_hex
                    # read END_ARRAY
                    token = cmap_tokenizer.next_non_comment_token()
                    assert token is not None
                    assert token.token_type == TokenType.END_ARRAY
                    continue

        # set previous token
        prev_token = token

    # return
    return out_map
def read(
    self,
    io_source: Union[io.BufferedIOBase, io.RawIOBase, io.BytesIO],
    tokenizer: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    This method attempts to read a stream XREF from the given io_source.
    It will either throw an exception, or return this XREF

    :param io_source:       seekable byte source of the PDF
    :param tokenizer:       tokenizer wrapping *io_source*
    :param initial_offset:  known byte offset of the XREF stream, or None to
                            locate it by scanning backwards for "startxref"
    :return:                this XREF (fluent interface)
    """
    # position the stream at the XREF stream object
    if initial_offset is not None:
        io_source.seek(initial_offset)
    else:
        self._seek_to_xref_token(io_source, tokenizer)

    # the XREF is stored as a Stream object
    xref_stream = tokenizer.read_object()
    assert isinstance(xref_stream, Stream)

    # check widths (the /W array: byte widths of the 3 entry fields)
    assert "W" in xref_stream
    assert all([
        isinstance(xref_stream["W"][x], Decimal)
        for x in range(0, len(xref_stream["W"]))
    ])

    # decode widths
    widths = [
        int(xref_stream["W"][x]) for x in range(0, len(xref_stream["W"]))
    ]
    # NOTE(review): total_entry_width is computed but never used below
    total_entry_width = sum(widths)

    # parent
    document = self.get_root()  # type: ignore [attr-defined]

    # list of references; starts with the conventional free-list head
    # (object 0, generation 65535, not in use)
    indirect_references = [
        Reference(
            object_number=0,
            generation_number=65535,
            is_in_use=False,
            document=document,
        )
    ]

    # check size
    assert "Size" in xref_stream
    assert isinstance(xref_stream["Size"], Decimal)

    # get size
    number_of_objects = int(xref_stream["Size"])

    # index: pairs of (first object number, count); defaults to the
    # single range [0, Size)
    index = []
    if "Index" in xref_stream:
        index = xref_stream["Index"]
        assert isinstance(index, List)
        assert len(index) % 2 == 0
        assert isinstance(index[0], Decimal)
        assert isinstance(index[1], Decimal)
    else:
        index = [Decimal(0), Decimal(number_of_objects)]

    # apply filters
    xref_stream = decode_stream(xref_stream)

    # read every range specified in \Index
    xref_stream_decoded_bytes = xref_stream["DecodedBytes"]
    for idx in range(0, len(index), 2):
        start = int(index[idx])
        length = int(index[idx + 1])
        # NOTE(review): bptr resets to 0 for every /Index range, so a second
        # range would re-read the first range's bytes — confirm whether this
        # should persist across ranges.
        bptr = 0
        for i in range(0, length):

            # object number
            object_number = start + i

            # read type; when the type field has zero width the type
            # defaults to 1 (in-use, uncompressed)
            type = 1
            if widths[0] > 0:
                type = 0
                for j in range(0, widths[0]):
                    type = (type << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
                    bptr += 1

            # read field 2 (big-endian, widths[1] bytes)
            field2 = 0
            for j in range(0, widths[1]):
                field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
                bptr += 1

            # read field 3 (big-endian, widths[2] bytes)
            field3 = 0
            for j in range(0, widths[2]):
                field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
                bptr += 1

            # check type
            assert type in [0, 1, 2]

            pdf_indirect_reference = None
            if type == 0:
                # type :The type of this entry, which shall be 0. Type 0 entries define
                # the linked list of free objects (corresponding to f entries in a
                # cross-reference table).
                # field2 : The object number of the next free object
                # field3 : The generation number to use if this object number is used again
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                    is_in_use=False,
                )

            if type == 1:
                # Type : The type of this entry, which shall be 1. Type 1 entries define
                # objects that are in use but are not compressed (corresponding
                # to n entries in a cross-reference table).
                # field2 : The byte offset of the object, starting from the beginning of the
                # file.
                # field3 : The generation number of the object. Default value: 0.
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                )

            if type == 2:
                # Type : The type of this entry, which shall be 2. Type 2 entries define
                # compressed objects.
                # field2 : The object number of the object stream in which this object is
                # stored. (The generation number of the object stream shall be
                # implicitly 0.)
                # field3 : The index of this object within the object stream.
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    generation_number=0,
                    parent_stream_object_number=field2,
                    index_in_parent_stream=field3,
                )

            assert pdf_indirect_reference is not None

            # append: merge with a previously-seen reference for the same
            # object number, or add as a new entry
            existing_indirect_ref = next(
                iter([
                    x for x in indirect_references
                    if x.object_number is not None
                    and x.object_number == Decimal(object_number)
                ]),
                None,
            )
            ref_is_in_reading_state = (
                existing_indirect_ref is not None
                and existing_indirect_ref.is_in_use
                and existing_indirect_ref.generation_number ==
                pdf_indirect_reference.generation_number)
            ref_is_first_encountered = existing_indirect_ref is None or (
                not ref_is_in_reading_state
                and existing_indirect_ref.document is None)

            if ref_is_first_encountered:
                assert pdf_indirect_reference is not None
                indirect_references.append(pdf_indirect_reference)
            elif ref_is_in_reading_state:
                # same object, same generation: adopt the compressed-object
                # location from the new entry
                assert existing_indirect_ref is not None
                assert pdf_indirect_reference is not None
                existing_indirect_ref.index_in_parent_stream = (
                    pdf_indirect_reference.index_in_parent_stream)
                existing_indirect_ref.parent_stream_object_number = (
                    pdf_indirect_reference.parent_stream_object_number)

    # add section
    for r in indirect_references:
        self.append(r)

    # initialize trailer: the XREF stream's own dictionary doubles as the
    # trailer dictionary
    self[Name("Trailer")] = Dictionary()
    for k, v in xref_stream.items():
        self[Name("Trailer")][k] = v
    # NOTE(review): the trailer's parent is set to the trailer itself —
    # looks like it may have been intended as set_parent(self); confirm.
    self[Name("Trailer")].set_parent(self[Name("Trailer")])

    # return
    return self