def _read_trailer(self, src: io.IOBase, tok: HighLevelTokenizer) -> Dictionary:
    """
    Read the trailer dictionary that follows a classic XREF table.

    :param src: the byte source containing the PDF, positioned just after the XREF section
    :param tok: tokenizer wrapping src
    :return: the trailer Dictionary, or an empty Dictionary if no "trailer" keyword is present
    :raises PDFSyntaxError: if "trailer" is not followed by a dictionary,
                            or the "startxref" keyword is missing afterwards
    """
    # return an empty Dictionary if there is no "trailer" keyword
    token = tok.next_non_comment_token()
    assert token is not None
    if token.text != "trailer":
        return Dictionary()

    # if there is a keyword "trailer" the next token should be TokenType.START_DICT
    token = tok.next_non_comment_token()
    assert token is not None
    if token.token_type != TokenType.START_DICT:
        raise PDFSyntaxError(
            byte_offset=tok.tell(),
            message="invalid XREF trailer",
        )

    # go back 2 chars "<<" so read_dictionary sees the full dictionary opener
    src.seek(-2, io.SEEK_CUR)

    # read dictionary as trailer
    trailer_dict = tok.read_dictionary()

    # the trailer must be followed by the "startxref" keyword
    token = tok.next_non_comment_token()
    assert token is not None
    if token.token_type != TokenType.OTHER or token.text != "startxref":
        raise PDFSyntaxError(
            byte_offset=token.byte_offset,
            message="start of XREF not found",
        )

    # return
    return trailer_dict
def _seek_to_xref_token(self, src: io.IOBase, tok: HighLevelTokenizer):
    """
    Position src at the start of the cross-reference data.

    Locates the trailing "startxref" keyword by scanning backwards from EOF.
    If the keyword found there is "xref", the source is rewound to it; if it
    is "startxref", the numeric offset that follows is read and the source is
    positioned at that offset.

    :param src: the byte source containing the PDF; its position is modified
    :param tok: tokenizer wrapping src
    :raises StartXREFTokenNotFoundError: if "startxref" does not occur in the file
    :raises PDFSyntaxError: if "startxref" is not followed by a number
    """
    # locate the "startxref" keyword near the end of the file
    keyword_offset = self._find_backwards(src, tok, "startxref")
    assert keyword_offset is not None
    if keyword_offset == -1:
        raise StartXREFTokenNotFoundError()

    # re-read the token at that offset
    src.seek(keyword_offset)
    keyword = tok.next_non_comment_token()
    assert keyword is not None

    # found "xref" directly: rewind so the caller can consume it
    if keyword.text == "xref":
        src.seek(keyword_offset)
        return

    # found "startxref": the next token holds the byte offset of the XREF,
    # so jump there
    if keyword.text == "startxref":
        offset_token = tok.next_non_comment_token()
        assert offset_token is not None
        if offset_token.token_type != TokenType.NUMBER:
            raise PDFSyntaxError(
                byte_offset=offset_token.byte_offset,
                message="invalid XREF",
            )
        src.seek(int(offset_token.text))
def _read_section(self, src: io.IOBase,
                  tok: HighLevelTokenizer) -> List[Reference]:
    """
    Read a single subsection of a classic XREF table.

    A subsection starts with "<first-object-number> <object-count>" and is
    followed by one 3-token line per object. When "trailer" or "startxref"
    is encountered instead of a header, the source is rewound and an empty
    list is returned to signal the end of the table.

    :param src: the byte source containing the PDF
    :param tok: tokenizer wrapping src
    :return: the References read (possibly empty)
    :raises PDFValueError: if the subsection header is not two numbers
    :raises PDFSyntaxError: if an entry line is malformed or the table ends early
    """
    header = [tok.next_non_comment_token() for _ in range(2)]
    assert header[0] is not None
    assert header[1] is not None

    # end of the table: rewind so the caller can consume the keyword
    if header[0].text in ("trailer", "startxref"):
        src.seek(header[0].byte_offset)
        return []

    # both header tokens must be numbers
    for header_token in header:
        if header_token.token_type != TokenType.NUMBER:
            raise PDFValueError(
                byte_offset=header_token.byte_offset,
                expected_value_description="number",
                received_value_description=header_token.text,
            )

    first_object_number = int(header[0].text)
    object_count = int(header[1].text)

    # read one entry per object in the subsection
    references = []
    for n in range(object_count):
        entry = [tok.next_non_comment_token() for _ in range(3)]
        assert entry[0] is not None
        assert entry[1] is not None
        assert entry[2] is not None

        # running into a keyword here means the table was cut short
        if entry[0].text in ("trailer", "startxref"):
            raise PDFSyntaxError(
                byte_offset=entry[0].byte_offset,
                message="unexpected EOF while processing XREF",
            )

        # an entry is "<offset> <generation> f|n"
        entry_is_valid = (entry[0].token_type == TokenType.NUMBER
                          and entry[1].token_type == TokenType.NUMBER
                          and entry[2].token_type == TokenType.OTHER
                          and entry[2].text in ("f", "n"))
        if not entry_is_valid:
            raise PDFSyntaxError(
                byte_offset=entry[0].byte_offset,
                message="invalid XREF line",
            )

        references.append(
            Reference(
                object_number=first_object_number + n,
                byte_offset=int(entry[0].text),
                generation_number=int(entry[1].text),
                is_in_use=(entry[2].text == "n"),
            ))

    # return
    return references
def _find_backwards( self, src: io.IOBase, tok: HighLevelTokenizer, text_to_find: str, ) -> int: # length of str to check str_len = 1024 # go to end of file src.seek(0, io.SEEK_END) file_length = src.tell() pos = file_length - str_len if pos < 1: pos = 1 while pos > 0: src.seek(pos) bytes_near_eof = "".join( [tok._next_char() for _ in range(0, str_len)]) idx = bytes_near_eof.find(text_to_find) if idx >= 0: return pos + idx pos = pos - str_len + len(text_to_find) # raise error return -1
def read(
    self,
    src: io.IOBase,
    tok: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Read a classic cross-reference table (and its trailer) into this XREF.

    :param src: the byte source containing the PDF
    :param tok: tokenizer wrapping src
    :param initial_offset: byte offset of the "xref" keyword; when None,
                           the offset is located via the trailing "startxref"
    :return: self
    :raises XREFTokenNotFoundError: if the "xref" keyword is not found
    """
    # position the source at the start of the XREF table
    if initial_offset is None:
        self._seek_to_xref_token(src, tok)
    else:
        src.seek(initial_offset)

    # the first token must be the "xref" keyword
    keyword = tok.next_non_comment_token()
    assert keyword is not None
    if keyword.text != "xref":
        raise XREFTokenNotFoundError()

    # consume subsections until an empty one marks the end of the table
    while True:
        section = self._read_section(src, tok)
        if not section:
            break
        for reference in section:
            self.append(reference)

    # the trailer dictionary follows the table
    self["Trailer"] = self._read_trailer(src, tok)

    # return self
    return self
def read(self, cmap_bytes: str) -> "CMap":
    """
    Parse a CMap program and populate this CMap's symbol table.

    Handles "beginbfchar" (single code -> unicode mappings) and
    "beginbfrange" (a range of codes mapped either by incrementing a single
    destination, or via an explicit list of destinations).

    :param cmap_bytes: the CMap program text
                       (despite the name, this is a str; it is encoded latin-1 for tokenizing)
    :return: self
    """
    N = len(cmap_bytes)
    tok = HighLevelTokenizer(io.BytesIO(cmap_bytes.encode("latin-1")))
    prev_token = None
    while tok.tell() < N:
        token = tok.next_non_comment_token()
        if token is None:
            break
        # beginbfchar: prev_token holds the number of single-char mappings.
        # NOTE(review): assumes a numeric token always precedes the keyword —
        # prev_token would be None for malformed input; confirm inputs are well-formed
        if token.text == "beginbfchar":
            n = int(prev_token.text)
            for j in range(0, n):
                c = self._hex_string_to_int_or_tuple(tok.read_object())
                uc = self._hex_string_to_int_or_tuple(tok.read_object())
                self._add_symbol(c, uc)
            continue
        # beginbfrange: prev_token holds the number of range entries
        if token.text == "beginbfrange":
            n = int(prev_token.text)
            for j in range(0, n):
                c_start_token = tok.read_object()
                c_start = int(c_start_token, 16)
                c_end_token = tok.read_object()
                c_end = int(c_end_token, 16)
                tmp = tok.read_object()
                # a single hex string maps the whole range by incrementing
                # the destination alongside the source code
                if isinstance(tmp, HexadecimalString):
                    uc = self._hex_string_to_int_or_tuple(tmp)
                    for k in range(0, c_end - c_start + 1):
                        if isinstance(uc, int):
                            self._add_symbol(c_start + k, uc + k)
                        elif isinstance(uc, tuple):
                            self._add_symbol(c_start + k, (uc[0], uc[1] + k))
                # a list supplies one explicit destination per code in the range
                # NOTE(review): assumes len(tmp) covers the whole range — confirm
                elif isinstance(tmp, list):
                    for k in range(0, c_end - c_start + 1):
                        uc = self._hex_string_to_int_or_tuple(tmp[k])
                        self._add_symbol(c_start + k, uc)
        # default: remember this token (it may be the count for the next keyword)
        prev_token = token
    return self
def transform(
    self,
    object_to_transform: Union[io.BufferedIOBase, io.RawIOBase, AnyPDFType],
    parent_object: Any,
    context: Optional[TransformerContext] = None,
    event_listeners: typing.Optional[typing.List[EventListener]] = None,
) -> Any:
    """
    Transform a raw PDF byte stream into a Document object.

    :param object_to_transform: the PDF byte stream (a buffered or raw binary stream)
    :param parent_object: the parent object (unused here; part of the transformer interface)
    :param context: the transformer context; must not be None
    :param event_listeners: listeners to attach to the root Document
                            (default None means no listeners; previously this was a
                            mutable default [] — the shared-mutable-default pitfall)
    :return: the root Document
    :raises NotImplementedError: for encrypted (password-protected) documents
    """
    # normalize the listener list (avoids a shared mutable default argument)
    if event_listeners is None:
        event_listeners = []

    # update context
    assert context is not None
    assert isinstance(object_to_transform, io.BufferedIOBase) or isinstance(
        object_to_transform, io.RawIOBase)
    context.root_object = Document()
    context.source = object_to_transform
    context.tokenizer = HighLevelTokenizer(context.source)

    # add listener(s)
    for listener in event_listeners:
        context.root_object.add_event_listener(listener)

    # remove prefix (any bytes that precede the PDF header)
    self._remove_prefix(context)

    # check header
    self._check_header(context)

    # record file size on the root object
    context.source.seek(0, os.SEEK_END)
    file_length = context.source.tell()
    context.source.seek(0)
    context.root_object["FileSize"] = Decimal(file_length)

    # build XREF object
    self._read_xref(context)

    # transform trailer dictionary
    xref = context.root_object.get("XRef")
    if "Trailer" in xref and "Encrypt" in xref["Trailer"]:
        # TODO: add support for encrypted documents
        raise NotImplementedError(
            "password-protected PDFs are currently not supported")

    trailer = self.get_root_transformer().transform(
        context.root_object["XRef"]["Trailer"],
        context.root_object,
        context,
        [],
    )
    xref["Trailer"] = trailer
    # strip stream-decoding bookkeeping keys from the transformed trailer
    for k in ["DecodeParms", "Filter", "Index", "Length", "Prev", "W"]:
        if k in xref["Trailer"]:
            xref["Trailer"].pop(k)

    # return
    return context.root_object
def read(
    self,
    io_source: Union[io.BufferedIOBase, io.RawIOBase],
    tokenizer: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Read a cross-reference stream and append its entries to this XREF.

    The stream's /W array gives the byte width of each of the three fields per
    entry; /Index lists (start, count) pairs of object-number ranges; /Size is
    the total number of objects. Entry types: 0 = free object, 1 = in-use
    uncompressed object, 2 = object stored inside an object stream.

    :param io_source: the byte source containing the PDF
    :param tokenizer: tokenizer wrapping io_source
    :param initial_offset: byte offset of the XREF stream; when None, the
                           offset is located via the trailing "startxref"
    :return: self, with one Reference per entry and "Trailer" initialized
    """
    # position the source at the XREF stream
    if initial_offset is not None:
        io_source.seek(initial_offset)
    else:
        self._seek_to_xref_token(io_source, tokenizer)

    # the XREF is itself a stream object
    xref_stream = tokenizer.read_object()
    assert isinstance(xref_stream, Stream)

    # check widths: /W must be present and all-numeric
    assert "W" in xref_stream
    assert all([
        isinstance(xref_stream["W"][x], Decimal)
        for x in range(0, len(xref_stream["W"]))
    ])
    # decode widths
    widths = [
        int(xref_stream["W"][x]) for x in range(0, len(xref_stream["W"]))
    ]
    # NOTE(review): total_entry_width is computed but never used below
    total_entry_width = sum(widths)

    # parent document
    document = self.get_root()  # type: ignore [attr-defined]

    # list of references; object 0 is always the head of the free-object list
    indirect_references = [
        Reference(
            object_number=0,
            generation_number=65535,
            is_in_use=False,
            document=document,
        )
    ]

    # check size
    assert "Size" in xref_stream
    assert isinstance(xref_stream["Size"], Decimal)
    # get size
    number_of_objects = int(xref_stream["Size"])

    # /Index lists (start, count) pairs; default is one range covering all objects
    index = []
    if "Index" in xref_stream:
        index = xref_stream["Index"]
        assert isinstance(index, List)
        assert len(index) % 2 == 0
        assert isinstance(index[0], Decimal)
        assert isinstance(index[1], Decimal)
    else:
        index = [Decimal(0), Decimal(number_of_objects)]

    # apply filters to obtain the raw entry bytes
    xref_stream = decode_stream(xref_stream)

    # read every range specified in \Index
    xref_stream_decoded_bytes = xref_stream["DecodedBytes"]
    for idx in range(0, len(index), 2):
        start = int(index[idx])
        length = int(index[idx + 1])
        # NOTE(review): bptr restarts at 0 for every /Index range; with more
        # than one range this re-reads the first range's bytes — confirm
        # whether it should persist across ranges instead
        bptr = 0
        for i in range(0, length):
            # object number for this entry
            object_number = start + i
            # read type; when the first field has width 0, type defaults to 1
            type = 1
            if widths[0] > 0:
                type = 0
                for j in range(0, widths[0]):
                    type = (type << 8) + (xref_stream_decoded_bytes[bptr]
                                          & 0xFF)
                    bptr += 1
            # read field 2 (big-endian, widths[1] bytes)
            field2 = 0
            for j in range(0, widths[1]):
                field2 = (field2 << 8) + (xref_stream_decoded_bytes[bptr]
                                          & 0xFF)
                bptr += 1
            # read field 3 (big-endian, widths[2] bytes)
            field3 = 0
            for j in range(0, widths[2]):
                field3 = (field3 << 8) + (xref_stream_decoded_bytes[bptr]
                                          & 0xFF)
                bptr += 1
            # check type
            assert type in [0, 1, 2]

            pdf_indirect_reference = None
            if type == 0:
                # type 0: free object (an f entry in a classic table)
                # field2 : the object number of the next free object
                # field3 : the generation number to use if this object number is used again
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                    is_in_use=False,
                )
            if type == 1:
                # type 1: in-use, uncompressed object (an n entry)
                # field2 : the byte offset of the object from the start of the file
                # field3 : the generation number of the object (default 0)
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                )
            if type == 2:
                # type 2: compressed object stored inside an object stream
                # field2 : the object number of the containing object stream
                #          (whose generation number is implicitly 0)
                # field3 : the index of this object within that object stream
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    generation_number=0,
                    parent_stream_object_number=field2,
                    index_in_parent_stream=field3,
                )
            assert pdf_indirect_reference is not None

            # append, merging with an earlier entry for the same object number
            existing_indirect_ref = next(
                iter([
                    x for x in indirect_references
                    if x.object_number is not None
                    and x.object_number == Decimal(object_number)
                ]),
                None,
            )
            ref_is_in_reading_state = (
                existing_indirect_ref is not None
                and existing_indirect_ref.is_in_use
                and existing_indirect_ref.generation_number ==
                pdf_indirect_reference.generation_number)
            ref_is_first_encountered = existing_indirect_ref is None or (
                not ref_is_in_reading_state
                and existing_indirect_ref.document is None)

            if ref_is_first_encountered:
                assert pdf_indirect_reference is not None
                indirect_references.append(pdf_indirect_reference)
            elif ref_is_in_reading_state:
                # update the placeholder entry with its compressed-object location
                assert existing_indirect_ref is not None
                assert pdf_indirect_reference is not None
                existing_indirect_ref.index_in_parent_stream = (
                    pdf_indirect_reference.index_in_parent_stream)
                existing_indirect_ref.parent_stream_object_number = (
                    pdf_indirect_reference.parent_stream_object_number)

    # add section
    for r in indirect_references:
        self.append(r)

    # initialize trailer from the stream's own dictionary
    self["Trailer"] = Dictionary(xref_stream)

    # return
    return self
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Read a content stream and execute its operators against this Canvas.

    Objects read from the stream are pushed on an operand stack until an
    operator name is encountered; the operator then consumes its operands.
    Each executed operator is also recorded under self["Instructions"].

    :param io_source: the byte source containing the content stream
    :return: self
    :raises IllegalGraphicsStateError: if an operator lacks operands
                                       (outside a compatibility section)
    """
    # determine stream length, then rewind
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)

    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    while canvas_tokenizer.tell() != length:
        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break
        # push argument onto stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue
        # process operator: find the operator whose text matches
        candidate_ops = [
            x for x in self.canvas_operators if x.get_text() == str(obj)
        ]
        if len(candidate_ops) == 1:
            operator = candidate_ops[0]
            if len(operand_stk) < operator.get_number_of_operands():
                # if we are in a compatibility section ignore any possible mistake
                if self.in_compatibility_section:
                    continue
                raise IllegalGraphicsStateError(
                    message="Unable to execute operator %s. Expected %d arguments, received %d."
                    % (
                        operator.text,
                        operator.get_number_of_operands(),
                        len(operand_stk),
                    )
                )
            # pop the operands, preserving their original (stream) order
            operands = []
            for _ in range(0, operator.get_number_of_operands()):
                operands.insert(0, operand_stk.pop(-1))

            # append an instruction record
            if "Instructions" not in self:
                self["Instructions"] = List().set_parent(self)

            instruction_number = len(self["Instructions"])
            instruction_dictionary = Dictionary()
            instruction_dictionary["Name"] = operator.get_text()
            instruction_dictionary["Args"] = List().set_parent(
                instruction_dictionary
            )

            if len(operands) > 0:
                for i in range(0, len(operands)):
                    instruction_dictionary["Args"].append(operands[i])
            self["Instructions"].append(instruction_dictionary)

            # debug
            logger.debug(
                "%d %s %s"
                % (
                    instruction_number,
                    operator.text,
                    str([str(x) for x in operands]),
                )
            )

            # invoke; inside a compatibility section failures are ignored
            try:
                operator.invoke(self, operands)
            except Exception as e:
                if not self.in_compatibility_section:
                    raise e

        # unknown operator: silently skipped
        # NOTE(review): len(candidate_ops) > 1 is also silently skipped — confirm intended
        if len(candidate_ops) == 0:
            # print("Missing OPERATOR %s" % obj)
            pass

    # return
    return self
def get(
    self,
    indirect_reference: Union[Reference, int],
    src: io.IOBase,
    tok: HighLevelTokenizer,
) -> Optional[AnyPDFType]:
    """
    Resolve an indirect reference to the object it designates.

    Accepts either an object number or a Reference; both are first resolved
    against this XREF's entries. Objects with a byte offset are read directly
    from the file; objects stored in an object stream are read by (recursively)
    fetching the parent stream, decoding it, and tokenizing its contents.

    :param indirect_reference: the reference (or object number) to resolve
    :param src: the byte source containing the PDF
    :param tok: tokenizer wrapping src
    :return: the referenced object, or None if it cannot be resolved
    :raises PDFTypeError: if a parent object stream lacks /Length or /First
    """
    # cache
    obj = None

    # lookup Reference object for int
    if isinstance(indirect_reference, int) or isinstance(
            indirect_reference, Decimal):
        refs = [
            x for x in self.entries
            if x.object_number == int(indirect_reference)
        ]
        if len(refs) == 0:
            return None
        indirect_reference = refs[0]

    # lookup Reference (in self) for Reference
    elif isinstance(indirect_reference, Reference):
        refs = [
            x for x in self.entries
            if x.object_number == indirect_reference.object_number
        ]
        if len(refs) == 0:
            return None
        indirect_reference = refs[0]

    # reference points to an object that is not in use
    # NOTE(review): obj is set to None here but the byte_offset branch below
    # may still run and overwrite it — confirm intended
    assert isinstance(indirect_reference, Reference)
    if not indirect_reference.is_in_use:
        obj = None

    # the indirect reference may have a byte offset: read the object there,
    # restoring the tokenizer position afterwards
    if indirect_reference.byte_offset is not None:
        byte_offset = int(indirect_reference.byte_offset)
        tell_before = tok.tell()
        tok.seek(byte_offset)
        obj = tok.read_object(xref=self)
        tok.seek(tell_before)

    # entry specifies a parent object stream: resolve through it
    if indirect_reference.parent_stream_object_number is not None:
        stream_object = self.get(
            indirect_reference.parent_stream_object_number, src, tok)
        assert isinstance(stream_object, dict)
        if "Length" not in stream_object:
            raise PDFTypeError(expected_type=Union[Decimal, Reference],
                               received_type=None)
        if "First" not in stream_object:
            raise PDFTypeError(expected_type=Union[Decimal, Reference],
                               received_type=None)

        # Length may be Reference
        if isinstance(stream_object["Length"], Reference):
            stream_object["Length"] = self.get(stream_object["Length"],
                                               src=src,
                                               tok=tok)
        # First may be Reference
        if isinstance(stream_object["First"], Reference):
            stream_object["First"] = self.get(stream_object["First"],
                                              src=src,
                                              tok=tok)

        # /First is the offset of the first object's data within the stream
        first_byte = int(stream_object.get("First", 0))
        if "DecodedBytes" not in stream_object:
            try:
                stream_object = decode_stream(stream_object)
            except Exception as ex:
                logger.debug(
                    "unable to inflate stream for object %d" %
                    indirect_reference.parent_stream_object_number)
                raise ex
        stream_bytes = stream_object["DecodedBytes"][first_byte:]

        # tokenize parent stream: read objects up to (and including) the
        # requested index; the last one read is the target
        index = int(indirect_reference.index_in_parent_stream)
        length = int(stream_object["Length"])
        if index < length:
            tok = HighLevelTokenizer(io.BytesIO(stream_bytes))
            obj = [tok.read_object() for x in range(0, index + 1)]
            obj = obj[-1]
        else:
            obj = None

    # return
    return obj
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Read a content stream and execute its operators against this Canvas.

    Objects read from the stream are pushed on an operand stack until an
    operator name is encountered; the operator (looked up in the
    canvas_operators dict) then consumes its operands. Each executed operator
    is also recorded under self["Instructions"]. Unknown operators are logged
    and skipped.

    :param io_source: the byte source containing the content stream
    :return: self
    """
    # determine stream length, then rewind
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)

    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    while canvas_tokenizer.tell() != length:
        # print("<canvas pos='%d' length='%d' percentage='%d'/>" % ( canvas_tokenizer.tell(), length, int(canvas_tokenizer.tell() * 100 / length)))
        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break
        # push argument onto stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue
        # process operator: dict lookup by name
        operator = self.canvas_operators.get(obj, None)
        if operator is None:
            logger.debug("Missing operator %s" % obj)
            continue
        # outside a compatibility section, a shortage of operands is fatal
        if not self.in_compatibility_section:
            assert len(operand_stk) >= operator.get_number_of_operands()
        # pop the operands, preserving their original (stream) order
        # NOTE(review): the annotation says List["CanvasOperator"], but these
        # are operand values, not operators — confirm the intended type
        operands: typing.List["CanvasOperator"] = []  # type: ignore [name-defined]
        for _ in range(0, operator.get_number_of_operands()):
            operands.insert(0, operand_stk.pop(-1))

        # append an instruction record
        if "Instructions" not in self:
            self["Instructions"] = List().set_parent(self)  # type: ignore [attr-defined]

        instruction_number = len(self["Instructions"])
        instruction_dictionary = Dictionary()
        instruction_dictionary["Name"] = operator.get_text()
        instruction_dictionary["Args"] = List().set_parent(  # type: ignore [attr-defined]
            instruction_dictionary
        )

        if len(operands) > 0:
            for i in range(0, len(operands)):
                instruction_dictionary["Args"].append(operands[i])
        self["Instructions"].append(instruction_dictionary)

        # debug
        logger.debug(
            "%d %s %s"
            % (
                instruction_number,
                operator.text,
                str([str(x) for x in operands]),
            )
        )

        # invoke; inside a compatibility section failures are ignored
        try:
            operator.invoke(self, operands)
        except Exception as e:
            if not self.in_compatibility_section:
                raise e

    # return
    return self