def test04(self):
    """Check that tell() and seek() round-trip the token position."""
    path = os.path.join(TokenStreamTest.path, 'obj_stream3.dat')
    with open(path, 'rb') as f:
        stream = TokenStream(path, f)
        # Read one token first: tell() after an initial next_token() works.
        token = stream.next_token()
        self.assertEqual(EToken.INTEGER, token.type)
        self.assertEqual(98, token.data)
        # Memorize the position after that first next_token().
        saved_pos = stream.tell()
        for expected in (73, 5):
            token = stream.next_token()
            self.assertEqual(EToken.INTEGER, token.type)
            self.assertEqual(expected, token.data)
        # Rewind: the token after the saved position must come back.
        stream.seek(saved_pos)
        token = stream.next_token()
        self.assertEqual(EToken.INTEGER, token.type)
        self.assertEqual(73, token.data)
def test05_flex(self):
    """Test error from flex & bison book.

    Renamed from test05: another test05 (the seek/tell test) appears in
    this file, and duplicate method names on one TestCase make the
    earlier definition silently unreachable under unittest discovery.
    """
    filepath = os.path.join(TokenStreamTest.path, 'flex0.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        # FIXME confirm any byte sequence survives 'unicode_escape' here.
        s = tok.data.decode('unicode_escape')
        self.assertEqual('Antenna', s[:7])
def test_literal(self):
    """Test literal strings."""
    filepath = os.path.join(TokenStreamTest.path, 'literal.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(17, len(b))
        # BUG FIX: tok.data is a bytes-like object, and in Python 3 a
        # str is never equal to a bytes slice, so the original
        # assertEqual('This', b[1:5]) could not possibly pass.
        # Compare bytes against bytes instead.
        self.assertEqual(b'This', b[1:5])
def test_literal02(self):
    """Test escape sequences in literal strings."""
    filepath = 't/literal02.dat'
    with open(filepath, 'rb') as f:
        stream = TokenStream(filepath, f)
        # The file holds a single literal string token.
        token = stream.next_token()
        self.assertEqual(EToken.LITERAL_STRING, token.type)
        data = token.data
        self.assertEqual(2, len(data))
        # 40 and 41 are the ASCII codes for '(' and ')'.
        self.assertEqual(40, data[0])
        self.assertEqual(41, data[1])
def test_literal03(self):
    """Test escape sequences in literal strings.

    FIX: removed leftover debug print() calls that cluttered the test
    output; the per-byte assertions are now table-driven so each
    expected byte carries its description.
    """
    filepath = 't/literal03.dat'
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(9, len(b))
        # Expected decoded escape sequences, in order of appearance.
        expected = [
            (13, r'\r CR'),
            (10, r'\n LF'),
            (8, r'\b BS'),
            (9, r'\t TAB'),
            (12, r'\f FF'),
            (40, '('),
            (41, ')'),
            (0x5c, 'backslash'),
            (83, 'S'),
        ]
        for i, (value, desc) in enumerate(expected):
            self.assertEqual(value, b[i], f'byte {i} should be {desc}')
def parse_tokens(filepath):
    """Tokenize a file, print an indented token trace, return the tokens.

    :param filepath: path of the file to tokenize.
    :return: list of the tokens read, in stream order (EOF excluded).
    """
    # Array for token storage
    tokens = []
    closers = (EToken.ARRAY_END, EToken.DICT_END, EToken.OBJECT_END)
    openers = (EToken.ARRAY_BEGIN, EToken.DICT_BEGIN, EToken.OBJECT_BEGIN)
    # Parse a character stream into a token stream
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        indent = 0
        while True:
            t = tk.next_token()
            if t.type == EToken.EOF:
                break
            # De-indent before printing a closing token, indent after
            # printing an opening one, so nesting reads correctly.
            if t.type in closers:
                indent -= 1
            t.print_indented(indent)
            if t.type in openers:
                indent += 1
            tokens.append(t)
    # BUG FIX: the token list was built but never returned (the function
    # implicitly returned None, discarding all the work).
    return tokens
def test01(self):
    """Test simple next_token() and peek_token() calls.

    FIX: the original repeated the fetch-then-assert pair dozens of
    times; the checks are factored into two nested helpers so each
    expectation is a single line. Assertion order is unchanged.
    """
    filepath = os.path.join(TokenStreamTest.path, 'token_stream.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        def check_next(etype, data=None):
            # Fetch the next token, assert its type (and data if given).
            tok = tk.next_token()
            self.assertEqual(etype, tok.type)
            if data is not None:
                self.assertEqual(data, tok.data)
            return tok

        def check_peek(etype, data=None):
            # Peek one token ahead, assert its type (and data if given).
            tok = tk.peek_token()
            self.assertEqual(etype, tok.type)
            if data is not None:
                self.assertEqual(data, tok.data)
            return tok

        # Retrieve a few tokens
        check_next(EToken.DICT_BEGIN)
        check_next(EToken.NAME, b'Contents')
        check_next(EToken.INTEGER, 6624)
        # Now peek once
        check_peek(EToken.INTEGER, 0)
        # Retrieve the peeked token
        check_next(EToken.INTEGER, 0)
        # Peek 3 tokens ahead
        check_peek(EToken.OBJ_REF)
        check_peek(EToken.NAME, b'CropBox')
        check_peek(EToken.ARRAY_BEGIN)
        # Retrieve 2 tokens
        check_next(EToken.OBJ_REF)
        check_next(EToken.NAME, b'CropBox')
        # I still have the ARRAY_BEGIN in 'peeked'
        # I'm not sure this is the right spec...
        # Peeking 5 more
        check_peek(EToken.INTEGER, 0)
        check_peek(EToken.INTEGER, 0)
        check_peek(EToken.REAL, 595.276)
        check_peek(EToken.REAL, 841.89)
        check_peek(EToken.ARRAY_END)
        # Retrieve 1 plus 5 plus 1
        check_next(EToken.ARRAY_BEGIN)
        check_next(EToken.INTEGER, 0)
        check_next(EToken.INTEGER, 0)
        check_next(EToken.REAL, 595.276)
        check_next(EToken.REAL, 841.89)
        check_next(EToken.ARRAY_END)
        check_next(EToken.NAME, b'MediaBox')
def test05(self):
    """Test token seek and tell."""
    filepath = os.path.join(TokenStreamTest.path, 'obj_stream3.dat')
    with open(filepath, 'rb') as f:
        stream = TokenStream(filepath, f)
        # Memorize the position at the very beginning, before any
        # token has been read (historically the buggy case).
        start_pos = stream.tell()

        def expect_int(value):
            # Read one token and assert it is the given integer.
            token = stream.next_token()
            self.assertEqual(EToken.INTEGER, token.type)
            self.assertEqual(value, token.data)

        expect_int(98)
        expect_int(73)
        expect_int(5)
        expect_int(19)
        mid_pos = stream.tell()
        # Go back to the start and re-read the first tokens
        stream.seek(start_pos)
        expect_int(98)
        expect_int(73)
        # Move forward again, past everything already read
        stream.seek(mid_pos)
        for value in (18, 33, 45, 66, 13, 2):
            expect_int(value)
        token = stream.next_token()
        self.assertEqual(EToken.OBJ_REF, token.type)
def test02(self):
    """Test simple next_token() calls."""
    # BUG FIX: the path was the raw Windows string r't\token.dat',
    # which does not resolve on POSIX systems; build it portably.
    filepath = os.path.join('t', 'token.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        # [[[
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)
        # <<>> >>
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)
        # ]
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_END, tok.type)
        # /// : three names, each with empty data
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'', tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'', tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'', tok.data)
        # Skip 6 tokens we don't check
        for i in range(6):
            tok = tk.next_token()
        # >>\r\n<<
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.CRLF, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_BEGIN, tok.type)
        # /a
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'a', tok.data)
        # /b
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'b', tok.data)
        # /c
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'c', tok.data)
        # /d
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'd', tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)
class ObjectStream:
    """Parse a PDF token stream into PdfObject instances.

    Wraps a TokenStream and assembles tokens into higher-level objects
    (arrays, dictionaries, streams, indirect object defs/refs, xref
    sections, trailers).

    Invariant throughout: self.tok always holds the next token, already
    read from the stream but not yet analyzed. Every method that
    consumes tokens must restore this invariant before returning.
    """

    # Initializer
    def __init__(self, filepath, f):
        """Bind a TokenStream to file f and prime the one-token lookahead."""
        self.tk = TokenStream(filepath, f)
        self.f = f
        self.tok = self.tk.next_token()
        # The xref table will be a property of the object stream ?
        # NOTE(review): self.xref_sec is only assigned in
        # get_xref_section(), yet deref_object() reads it — calling
        # deref_object() before an xref section has been parsed would
        # raise AttributeError. Confirm intended call order.

    def seek(self, offset):
        """Reposition the underlying token stream and re-prime self.tok."""
        self.tk.seek(offset)
        # Normal init
        self.tok = self.tk.next_token()

    #---------------------------------------------------------------------------
    # get_indirect_obj_def
    #---------------------------------------------------------------------------
    def get_indirect_obj_def(self):
        """Found the opening OBJECT_BEGIN token, now get the entire object."""
        # self.tok has an EToken.OBJECT_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.
        tok = self.tok
        # Get the defined (internal) object
        self.tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)
        # Get the defined (internal) object
        obj = self.next_object()
        if obj.type in [EObject.ERROR, EObject.EOF]:
            return obj
        # self.tok holds the next token, read but not yet analyzed
        tok = self.tok
        # Ignore any end-if-line marker
        if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
            tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)
        if tok.type == EToken.OBJECT_END:
            return obj
        # NOTE(review): if tok is none of EOF/ERROR/OBJECT_END, control
        # falls off the end and the method implicitly returns None —
        # callers test obj.type, so this would crash. Probably should
        # return PdfObject(EObject.ERROR) here; confirm before changing.

    #---------------------------------------------------------------------------
    # get_array
    #---------------------------------------------------------------------------
    def get_array(self):
        """Found the opening ARRAY_BEGIN token, now get the entire array."""
        # self.tok has an EToken.ARRAY_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.
        # Prepare an array object
        arr = []
        # FIXME shouldn't I ignore end-of-line characters ?
        tok = self.tk.next_token()
        while True:
            if tok.type == EToken.ARRAY_END:
                # It's a python array, but the elements are PdfObjects
                return PdfObject(EObject.ARRAY, arr)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            # Ignore end-if-line markers
            if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
                continue
            # Hand the current token to next_object() via self.tok,
            # honoring the read-but-not-analyzed invariant.
            self.tok = tok
            obj = self.next_object()
            # self.tok holds the next token, read but not yet analyzed
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            # self.tok holds the next token, read but not yet analyzed
            tok = self.tok
            arr.append(obj)

    #---------------------------------------------------------------------------
    # get_dictionary
    #---------------------------------------------------------------------------
    def get_dictionary(self):
        """Found the opening DICT_BEGIN token, now get the entire dictionary."""
        # self.tok has an EToken.DICT_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.
        # Prepare a dictionary object
        d = {}
        tok = self.tk.next_token()
        while True:
            if tok.type == EToken.DICT_END:
                self.tok = tok
                # It's a python dictionary, but the values are PdfObjects
                return PdfObject(EObject.DICTIONARY, d)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            # Ignore end-if-line markers
            if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
            elif tok.type == EToken.NAME:
                # A NAME key must be followed by a value object.
                tok2 = self.tk.next_token()
                self.tok = tok2
                obj = self.next_object()
                # FIXME: can any bytes object be decoded like this ?
                # FIXME: I've lost the keys' original bytes object
                d[tok.data.decode('unicode_escape')] = obj
                # The next token is already stored in self.tok, but it hasn't
                # been analyzed yet.
                tok = self.tok
            else:
                # Anything that is not a key, EOL or DICT_END is malformed.
                return PdfObject(EObject.ERROR)

    #---------------------------------------------------------------------------
    # get_stream
    #---------------------------------------------------------------------------
    # FIXME define a proper stream class, with the dictionary in it
    def get_stream(self, length):
        """Found the opening STREAM_BEGIN token, now get all the data.

        :param length: number of raw data bytes to read from the stream.
        """
        # self.tok has an EToken.STREAM_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.
        # FIXME I need to stop testing EOF and ERROR after every single
        # next_XXX() function call, use exceptions instead.
        # Get the token that follows 'stream' (CRLF or LF)
        tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        # "The keyword stream that follows the stream dictionary shall be
        # followed by an end-of-line marker consisting of either a CARRIAGE
        # RETURN and a LINE FEED or just a LINE FEED, and not by a CARRIAGE
        # RETURN alone". PDF spec, § 7.3.8.1, page 19
        if tok.type not in [EToken.LF, EToken.CRLF]:
            return PdfObject(EObject.ERROR)
        # Get the token with the stream data
        tok = self.tk.next_stream(length)
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        s = tok.data
        # "There should be an end-of-line marker after the data and before
        # endstream; this marker shall not be included in the stream length".
        # PDF spec, § 7.3.8.1, page 19
        tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
            return PdfObject(EObject.ERROR)
        # Get the closing STREAM_END
        tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if tok.type != EToken.STREAM_END:
            return PdfObject(EObject.ERROR)
        # Return the stream data object, with the closing _END token
        return PdfObject(EObject.STREAM, data=s)

    #---------------------------------------------------------------------------
    # deflate_stream
    #---------------------------------------------------------------------------
    def deflate_stream(self, s, columns=None, predictor=None, W=None):
        """Decode stream s, encoded with flate, with predictor and W params.

        :param s: original compressed data stream (stripped), bytes.
        :param columns: integer (currently unused — TODO confirm).
        :param predictor: integer with values in { 1, 2, 10-15 };
            only 12 (PNG Up) is actually supported here.
        :param W: python array of integers (field widths in bytes).
        :return: (True, list of (type, fld1, fld2) tuples) when the
            predictor was undone, or (False, raw inflated bytes) when
            it was not.
        """
        # First, deflate the string
        zd = zlib.decompress(s)
        if not predictor:
            # No DecodeParms, so we assume no predictor
            # False means we have not done the un-predicting, just return zd
            return False, zd
        if predictor != 12:
            print(f'Predictor value {predictor} not supported (currently only 12)')
            return False, zd
        # From https://forums.adobe.com/thread/664902: "Strip off the last 10
        # characters of the string. This is the CRC and is unnecessary to
        # extract the raw data". Not doing this, at this point.
        # Sum up the column widths. For the example above [1 2 1] would be
        # 4. This is one less than the number of bytes in each row.
        n = sum(W)  # n == 4
        width = n+1
        # Split the string into rows by the column width: sum+1, or in our
        # example, 5.
        # Is the uncompressed stream length a multiple of this width ?
        if len(zd)%(width) == 0:
            print(f'*** Uncompressed len(zd)={len(zd)}, width={width}'
                  + f', {len(zd)}={len(zd)//(width)}*{width}')
        else:
            print(f'*** Uncompressed len(zd)={len(zd)}, width={width}'
                  + ', not a multiple')
        # zd is a bytes object
        prev = [0]*width
        nrows = len(zd)//(width)  # 86
        arr = []
        for r in range(nrows):  # 0..85
            bs = ''
            rowdata = [x for x in zd[r*width:(r+1)*width]]  # array of ints
            # PNG "Up" predictor: each byte is stored as a delta from the
            # byte above it; add the previous row back, modulo 256.
            # (rowdata[0] is the per-row filter byte and is skipped.)
            for i in range(1, width):
                rowdata[i] = (rowdata[i] + prev[i]) % 256
                bs += format(rowdata[i], '08b')  # Convert to binary string
            prev = rowdata  # Update prev for next pass
            # Split the string according to W
            # print(f'{bs} len={len(bs)}')
            begin = 0
            end = 8*W[0]
            # NOTE(review): 'type' shadows the builtin; harmless locally.
            type = int(bs[begin:end], 2)
            begin = 8*W[0]
            end = 8*(W[0] + W[1])
            fld1 = int(bs[begin:end], 2)
            begin = 8*(W[0] + W[1])
            end = begin + 8*W[2]
            fld2 = int(bs[begin:end], 2)
            arr.append((type, fld1, fld2))
        # True means we have done the un-predicting, so what we return is an
        # array of 3-uples"
        return True, arr

    #---------------------------------------------------------------------------
    # get_xref_section
    #---------------------------------------------------------------------------
    def get_xref_section(self):
        """Parse a cross reference section into an object."""
        # self.tok has a EToken.XREF_SECTION, parse the following tokens.
        # "Each cross-reference section shall begin with a line containing the
        # keyword xref": this implies an end-of-line marker after 'xref'
        tok = self.tk.next_token()
        if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
            self.tok = tok
            # FIXME this way, self.tok will be analyzed again
            return PdfObject(EObject.ERROR)
        # Loop over cross-reference subsections
        self.xref_sec = XrefSection()
        while True:
            # Get a special token representing the sub-section header
            tok = self.tk.get_subsection_header()
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.UNEXPECTED:
                # Couldn't parse the line as a sub-section header, this means
                # that the sub-section is over. The xref is stored as a
                # property of this ObjectSTream, and it is also returned.
                # State has been rolled back, so prepare to continue
                self.tok = self.tk.next_token()
                return PdfObject(EObject.XREF_SECTION, self.xref_sec)
            # Sub-section header was successfully parsed
            first_objn, entry_cnt = tok.data
            # I'm assuming entry_cnt is not 0.
            subs = XrefSubSection(first_objn, entry_cnt)
            for i in range(entry_cnt):
                # Get a special token representing a sub-section entry
                tok = self.tk.get_subsection_entry()
                if tok.type == EToken.EOF:
                    return PdfObject(EObject.EOF)
                if tok.type == EToken.ERROR:
                    return PdfObject(EObject.ERROR)
                subs.entries.append(tok.data)
            # Finish off the this sub-section
            self.xref_sec.sub_sections.append(subs)

    #---------------------------------------------------------------------------
    # get_cross_reference
    #---------------------------------------------------------------------------
    def get_cross_reference(self):
        """Parse a cross reference section into an object."""
        # The current token from the stream should be either a XREF_SECTION
        # (for a traditional cross_reference table) or an INTEGER, introducing
        # an indirect object definition, for a cross-reference stream
        # (available in PDF 1.5 and later)
        tok = self.tok
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        # Traditional
        if tok.type == EToken.XREF_SECTION:
            return self.get_xref_section()
        # Available in PDF 1.5 and later
        if tok.type == EToken.INTEGER:
            obj = self.next_object()
            if obj.type == EObject.IND_OBJ_DEF:
                return obj
        # Any other case is an error, because we were expecting to find a
        # cross-reference table, modern or traditional.
        return PdfObject(EObject.ERROR)

    #---------------------------------------------------------------------------
    # next_object
    #---------------------------------------------------------------------------
    def next_object(self):
        """Get the next object as a PdfObject."""
        # Invariant: tok has been read from the stream, but not yet analyzed. It
        # is stored (persisted in between calls) in self.tok. This means that
        # every time control leaves this function (through return), it must
        # read, but not analyze, the next token, and store it in self.tok.
        tok = self.tok
        # Ignore CRLF (why do I parse the tokens then ?)
        while tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
            tok = self.tok = self.tk.next_token()
        # Have we reached EOF ?
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)
        elif tok.type == EToken.VERSION_MARKER:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.VERSION_MARKER, data=tok.data)
        # Now analyze tok: is it a boolean ?
        elif tok.type == EToken.TRUE:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.BOOLEAN, True)
        elif tok.type == EToken.FALSE:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.BOOLEAN, False)
        # Is it an integer number ?
        elif tok.type == EToken.INTEGER:
            # Attempt to find the longest match first. Object definitions and
            # references are two integers plus another token, they must be
            # parsed first, and if not found, then we'll settle for the simple
            # integer.
            # Lookahead 1 token. If we find another integer, keep looking.
            # If we find an OBJECT_BEGIN, then we have an indirect object
            # definition.
            # If we find an OBJ_REF, then we have an indirect reference.
            pos = self.tk.tell()
            tok2 = self.tk.next_token()
            if tok2.type == EToken.INTEGER:
                # Keep looking
                tok3 = self.tk.next_token()
                if tok3.type == EToken.OBJECT_BEGIN:
                    # Start creating the object with the object number (from
                    # tok) and generation number (from tok2)
                    # Get the defined (internal) object
                    self.tok = tok3
                    obj = self.get_indirect_obj_def()
                    if obj.type in [EObject.ERROR, EObject.EOF]:
                        return obj
                    self.tok = self.tk.next_token()
                    return PdfObject(EObject.IND_OBJ_DEF,
                                     data=dict(obj=obj, objn=tok.data,
                                               gen=tok2.data))
                elif tok3.type == EToken.OBJ_REF:
                    # self.tk.next_token()  # peeked tok2
                    # self.tk.next_token()  # peeked tok3
                    self.tok = self.tk.next_token()
                    return PdfObject(EObject.IND_OBJ_REF,
                                     data=dict(objn=tok.data, gen=tok2.data))
            # Ignore tok2, we re-read it anyway
            self.tk.seek(pos)
            x = tok.data
            self.tok = self.tk.next_token()
            return PdfObject(EObject.INTEGER, x)
        # Is it a real number ?
        elif tok.type == EToken.REAL:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.REAL, tok.data)
        # Is it a string ?
        elif tok.type in [EToken.LITERAL_STRING, EToken.HEX_STRING]:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.STRING, tok.data)  # bytearray
        # Is it a name ?
        elif tok.type == EToken.NAME:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.NAME, tok.data)  # bytearray
        # Is it an array ?
        elif tok.type == EToken.ARRAY_BEGIN:
            # self.tok already has the right value, tok was taken from there
            obj = self.get_array()
            # self.tok == ARRAY_END
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            self.tok = self.tk.next_token()
            return obj
        # Is it a dictionary ? or a (dictionary, stream) couple ?
        elif tok.type == EToken.DICT_BEGIN:
            # self.tok already has the right value, tok was taken from there
            obj = self.get_dictionary()
            # self.tok == DICT_END
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            # Skip end-of-line tokens to see whether a stream follows.
            while True:
                self.tok = self.tk.next_token()
                if self.tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                    break
            if self.tok.type != EToken.STREAM_BEGIN:
                return obj  # return the dict
            # We have found a STREAM_BEGIN token, so 'obj' is the stream
            # dictionary
            # FIXME this may not be right. Length is given as an indirect
            # object ref, we must have parsed all the xref tables at this point
            # if we want to parse this stream.
            o = obj.data['Length']
            if o.type == EObject.INTEGER:
                ln = o.data
            elif o.type == EObject.IND_OBJ_REF:
                ln = self.deref_object(o)
            else:
                return PdfObject(EObject.ERROR)
            obj2 = self.get_stream(ln)
            # FIXME use exceptions instead
            if obj2.type in [EObject.ERROR, EObject.EOF]:
                return obj2
            self.tok = self.tk.next_token()
            return PdfObject(EObject.COUPLE, data=(obj, obj2))
        # Is it a xref section ?
        elif tok.type == EToken.XREF_SECTION:
            obj = self.get_xref_section()
            # self.tok already holds the next token
            return obj
        # Is it a trailer ?
        elif tok.type == EToken.TRAILER:
            tok = self.tk.next_token()
            # Ignore CRLF (why do I parse the tokens then ?)
            while tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
            if tok.type != EToken.DICT_BEGIN:
                # FIXME specify once and for all which token I want to see when
                # an error has been detected. The question is "how do I recover
                # from this error ?"
                self.tok = self.tk.next_token()
                return PdfObject(EObject.ERROR)
            obj = self.get_dictionary()
            self.tok = self.tk.next_token()
            return PdfObject(EObject.TRAILER, data=obj)
        elif tok.type == EToken.STARTXREF:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.STARTXREF)
        elif tok.type == EToken.EOF_MARKER:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.EOF_MARKER)
        # Is it a stream ? Wrong. Streams are preceded by a dictionary.
        elif tok.type == EToken.STREAM_BEGIN:
            return PdfObject(EObject.ERROR)
        # Is it null ?
        elif tok.type == EToken.NULL:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.NULL)
        # Nothing that was expected here
        else:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.ERROR)

    #---------------------------------------------------------------------------
    # deref_object - read an indirect object from the file
    #---------------------------------------------------------------------------
    def deref_object(self, o):
        """Find an object's definition from a reference.

        :param o: a PdfObject expected to be an IND_OBJ_REF.
        :return: the referenced (internal) PdfObject, or None when the
            reference cannot be resolved.
        """
        if o.type != EObject.IND_OBJ_REF:
            print(f'Expecting an indirect object reference, got "{o.type}"'
                  + ' instead')
            return None
        # NOTE(review): self.xref_sec is only set by get_xref_section();
        # if no xref section was parsed yet this raises AttributeError
        # rather than returning None — confirm intended call order.
        if not self.xref_sec:
            return None
        # Now use objn to search the xref table for the file offset where
        # this catalog dictionary object can be found; seek the file to
        # that offset, and do another ob.next_object()
        # Catalog dictionary object is found at this offset, go there
        entry = self.xref_sec.get_object(o.data['objn'], o.data['gen'])
        if not entry:
            return None
        offset, _, _ = entry
        self.seek(offset)
        # Now read the next char, this will be the beginning of
        # "6082 0 obj^M<</Metadata 6125 0 R ..." where 6082 is the objn
        o = self.next_object()
        if o.type != EObject.IND_OBJ_DEF:
            print(f'Expecting an indirect object definition, got "{o.type}"'
                  + ' instead')
            return None
        # The indirect object definition surrounds the object we want
        return o.data['obj']
def test_literal01(self):
    """Test the set of example strings from the spec.

    FIX: the "skip over end of lines" while-loop was copy-pasted five
    times; it is factored into the nested next_non_eol() helper. The
    token sequence consumed is identical to the original.
    """
    filepath = 't/literal01.dat'
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        def next_non_eol():
            # Skip end-of-line tokens, return the first other token.
            while True:
                tok = tk.next_token()
                if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                    return tok

        # This is a string
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(16, len(b))
        self.assertEqual(b'This', b[0:4])
        # Strings may contain newlines\n and such
        tok = next_non_eol()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertTrue(b.startswith(b'Strings may'))
        self.assertTrue(b.endswith(b'such.'))
        # Strings may contain balanced parentheses...
        tok = next_non_eol()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(b'(x)', b[41:44])
        self.assertTrue(b.endswith(b'% and so on).'))
        # The following is an empty string.
        tok = next_non_eol()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(b'The following is an empty string.', b)
        # Empty string
        tok = next_non_eol()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(0, len(b))
        self.assertEqual(b'', b)
        # It has zero (0) length.
        tok = next_non_eol()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(23, len(b))
        self.assertEqual(b'It has zero (0) length.', b)