def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
                if isinstance(parms, PdfArray):
                    oldparms = parms
                    parms = PdfDict()
                    for x in oldparms:
                        parms.update(x)
                if parms:
                    predictor = int(parms.Predictor or 1)
                    columns = int(parms.Columns or 1)
                    colors = int(parms.Colors or 1)
                    bpc = int(parms.BitsPerComponent or 8)
                    if 10 <= predictor <= 15:
                        data, error = flate_png(data, predictor,
                                                columns, colors, bpc)
                    elif predictor != 1:
                        error = ('Unsupported flatedecode predictor %s' %
                                 repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
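# A minimal usage sketch for uncompress() above (an illustration, not part of
# the library).  It assumes the installed pdfrw package and a local
# 'example.pdf'.  Note that page.Contents may itself be an array of streams;
# streamobjects() simply skips entries that are not stream dictionaries.
def _demo_uncompress():
    from pdfrw import PdfReader
    pages = PdfReader('example.pdf').pages
    contents = [page.Contents for page in pages if page.Contents is not None]
    # Decode the FlateDecode-compressed streams in place; on success each
    # stream's /Filter entry is cleared, failures are logged.
    if not uncompress(contents):
        print('some streams were left compressed (see log warnings)')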
def readpages(self, node):
    pagename = PdfName.Page
    pagesname = PdfName.Pages
    catalogname = PdfName.Catalog
    typename = PdfName.Type
    kidname = PdfName.Kids

    # PDFs can have arbitrarily nested Pages/Page
    # dictionary structures.
    def readnode(node):
        nodetype = node[typename]
        if nodetype == pagename:
            yield node
        elif nodetype == pagesname:
            if type(node[kidname]) == PdfArray:
                for node in node[kidname]:
                    for node in readnode(node):
                        yield node
            # else it's one PdfDict
            else:
                for node in readnode(node[kidname]):
                    yield node
        elif nodetype == catalogname:
            for node in readnode(node[pagesname]):
                yield node
        else:
            log.error('Expected /Page or /Pages dictionary, got %s' %
                      repr(node))

    try:
        return list(readnode(node))
    except (AttributeError, TypeError) as s:
        log.error('Invalid page tree: %s' % s)
        return []
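# For reference, the page tree readpages() walks has this shape in PDF syntax
# (a /Catalog points at a /Pages node whose /Kids holds nested /Pages and
# /Page dictionaries):
#
#   1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
#   2 0 obj << /Type /Pages /Kids [3 0 R 4 0 R] /Count 2 >> endobj
#   3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
#
# The version above additionally tolerates a /Kids entry that is a single
# PdfDict rather than a PdfArray, which some malformed files produce.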
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance,
               list=list, len=len):
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = ('Not decompressing: cannot use filter %s '
                   'with parameters %s' % (repr(ftype), repr(parms)))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
            except Exception as s:
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
def readpages(self, node):
    pagename = PdfName.Page
    pagesname = PdfName.Pages
    catalogname = PdfName.Catalog
    typename = PdfName.Type
    kidname = PdfName.Kids

    # PDFs can have arbitrarily nested Pages/Page
    # dictionary structures.
    def readnode(node):
        nodetype = node[typename]
        if nodetype == pagename:
            yield node
        elif nodetype == pagesname:
            for node in node[kidname]:
                for node in readnode(node):
                    yield node
        elif nodetype == catalogname:
            for node in readnode(node[pagesname]):
                yield node
        else:
            log.error('Expected /Page or /Pages dictionary, got %s' %
                      repr(node))

    try:
        return list(readnode(node))
    except (AttributeError, TypeError) as s:
        log.error('Invalid page tree: %s' % s)
        return []
def parsexref(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s) '''
    fdata = source.fdata
    setdefault = source.obj_offsets.setdefault
    add_offset = source.all_offsets.append
    next = source.next
    tok = next()
    if tok != 'xref':
        source.exception('Expected "xref" keyword')
    start = source.floc
    try:
        while 1:
            tok = next()
            if tok == 'trailer':
                return
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(next())):
                offset = int(next())
                generation = int(next())
                inuse = next()
                if inuse == 'n':
                    if offset != 0:
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                elif inuse != 'f':
                    raise ValueError
    except:
        pass
    try:
        # Table formatted incorrectly.  See if
        # we can figure it out anyway.
        end = source.fdata.rindex('trailer', start)
        table = source.fdata[start:end].splitlines()
        for line in table:
            tokens = line.split()
            if len(tokens) == 2:
                objnum = int(tokens[0])
            elif len(tokens) == 3:
                offset, generation, inuse = (int(tokens[0]),
                                             int(tokens[1]), tokens[2])
                if offset != 0 and inuse == 'n':
                    setdefault((objnum, generation), offset)
                    add_offset(offset)
                objnum += 1
            elif tokens:
                log.error('Invalid line in xref table: %s' % repr(line))
                raise ValueError
        log.warning('Badly formatted xref table')
        source.floc = end
        source.next()
    except:
        source.floc = start
        source.exception('Invalid table format')
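# For reference, the classic cross-reference table parsed above looks like
# this: each subsection header is "<first objnum> <count>", and each entry is
# a 10-digit byte offset, a 5-digit generation number, and 'n' (in use) or
# 'f' (free):
#
#   xref
#   0 3
#   0000000000 65535 f
#   0000000017 00000 n
#   0000000081 00000 n
#   trailer
#   << /Size 3 /Root 1 0 R >>
#
# The second try block above handles tables whose tokens are merely
# whitespace-mangled, by re-splitting the raw bytes line by line.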
def __init__(self, pageinfo='', **kw):
    pageinfo = pageinfo.split('#', 1)
    if len(pageinfo) == 2:
        pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
    for key in 'page viewrect'.split():
        if pageinfo[0].startswith(key + '='):
            break
    else:
        self.docname = pageinfo.pop(0)
    for item in pageinfo:
        key, value = item.split('=')
        key = key.strip()
        value = value.replace(',', ' ').split()
        if key in ('page', 'rotate'):
            assert len(value) == 1
            setattr(self, key, int(value[0]))
        elif key == 'viewrect':
            assert len(value) == 4
            setattr(self, key, [float(x) for x in value])
        else:
            log.error('Unknown option: %s', key)
    for key, value in kw.iteritems():
        assert hasattr(self, key), key
        setattr(self, key, value)
def __init__(self, pageinfo='', **kw):
    pageinfo = pageinfo.split('#', 1)
    if len(pageinfo) == 2:
        pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
    for key in 'page viewrect'.split():
        if pageinfo[0].startswith(key + '='):
            break
    else:
        self.docname = pageinfo.pop(0)
    for item in pageinfo:
        key, value = item.split('=')
        key = key.strip()
        value = value.replace(',', ' ').split()
        if key in ('page', 'rotate'):
            assert len(value) == 1
            setattr(self, key, int(value[0]))
        elif key == 'viewrect':
            assert len(value) == 4
            setattr(self, key, [float(x) for x in value])
        else:
            log.error('Unknown option: %s', key)
    for key, value in iteritems(kw):
        assert hasattr(self, key), key
        setattr(self, key, value)
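# Examples of pageinfo strings accepted by the constructor above (derived
# from its parsing logic; 'brochure.pdf' is a made-up name):
#
#   'brochure.pdf'                        -> docname only
#   'brochure.pdf#page=3'                 -> page 3 of brochure.pdf
#   'brochure.pdf#page=3&rotate=90'       -> '&' is treated like '#'
#   'brochure.pdf#viewrect=10,20,200,300' -> viewrect = [10.0, 20.0, 200.0, 300.0]
#   'page=3'                              -> no docname (applies to current doc)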
class PdfReader(PdfDict):

    warned_bad_stream_start = False  # Used to keep from spewing warnings
    warned_bad_stream_end = False    # Used to keep from spewing warnings

    def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
        ''' Return a previously loaded indirect object, or create
            a placeholder for it.
        '''
        key = int(objnum), int(gennum)
        result = self.indirect_objects.get(key)
        if result is None:
            self.indirect_objects[key] = result = PdfIndirect(key)
            self.deferred_objects.add(key)
            result._loader = self.loadindirect
        return result

    def readarray(self, source, PdfArray=PdfArray):
        ''' Found a [ token.  Parse the tokens after that.
        '''
        specialget = self.special.get
        result = []
        pop = result.pop
        append = result.append

        for value in source:
            if value in ']R':
                if value == ']':
                    break
                generation = pop()
                value = self.findindirect(pop(), generation)
            else:
                func = specialget(value)
                if func is not None:
                    value = func(source)
            append(value)
        return PdfArray(result)

    def readdict(self, source, PdfDict=PdfDict):
        ''' Found a << token.  Parse the tokens after that.
        '''
        specialget = self.special.get
        result = PdfDict()
        next = source.next

        tok = next()
        while tok != '>>':
            if not tok.startswith('/'):
                # Just skip the incorrect /name object.
                source.warning('Expected PDF /name object')
                tok = next()
                continue
            key = tok
            value = next()
            func = specialget(value)
            if func is not None:
                # Just keep working when a bad token occurs.
                if func == self.badtoken:
                    tok = value
                    continue
                value = func(source)
                tok = next()
            else:
                tok = next()
                if value.isdigit() and tok.isdigit():
                    if next() != 'R':
                        source.exception('Expected "R" following two integers')
                    value = self.findindirect(value, tok)
                    tok = next()
            result[key] = value
        return result

    def empty_obj(self, source, PdfObject=PdfObject):
        ''' Some silly git put an empty object in the file.
            Back up so the caller sees the endobj.
        '''
        source.floc = source.tokstart
        return PdfObject()

    def badtoken(self, source):
        ''' Didn't see that coming.
        '''
        source.exception('Unexpected delimiter')

    def findstream(self, obj, tok, source, PdfDict=PdfDict,
                   isinstance=isinstance, len=len):
        ''' Figure out if there is a content stream following an
            object, and return the start pointer to the content
            stream if so.  (We can't read it yet, because we might
            not know how long it is, because Length might be an
            indirect object.)
        '''
        isdict = isinstance(obj, PdfDict)
        if not isdict or tok != 'stream':
            source.exception("Expected 'endobj'%s token",
                             isdict and " or 'stream'" or '')
        fdata = source.fdata
        startstream = source.tokstart + len(tok)
        # Skip the possible delimiters.
        possible_delimiters = ('\r', '\n', ' ')
        gotcr = gotlf = False
        while fdata[startstream] in possible_delimiters:
            if fdata[startstream] == '\r':
                gotcr = True
            if fdata[startstream] == '\n':
                gotlf = True
            startstream += 1
        if not gotlf:
            if not gotcr:
                source.warning(r'stream keyword not followed by \n')
                self.private.warned_bad_stream_start = True
            if not self.warned_bad_stream_start:
                source.warning(r"stream keyword terminated by \r without \n")
                self.private.warned_bad_stream_start = True
        return startstream

    def readstream(self, obj, startstream, source, exact_required=False,
                   streamending='endstream endobj'.split(), int=int):
        # (exact_required is accepted because parse_xref_stream() below
        # passes a fourth positional argument; this variant does not
        # otherwise use it.)
        fdata = source.fdata
        # Get a length by looking for 'endstream'.
        end_loc = fdata.find('endstream', startstream)
        possible_delimiters = ('\r', '\n', ' ')
        while fdata[end_loc - 1] in possible_delimiters:
            end_loc -= 1
        observed_length = end_loc - startstream

        if obj.Length is None:
            length = observed_length
            source.warning('Lacking the stream length declaration, '
                           'using the observed value %d.' % observed_length)
        else:
            try:
                length = int(obj.Length)
            except:
                source.warning('Incorrect representation of stream length: '
                               '%s. Using observed value %d instead.' %
                               (obj.Length, observed_length))
                length = observed_length
        if length != observed_length:
            source.warning('Inconsistent stream length: %d declared, '
                           '%d observed.' % (length, observed_length))
            length = observed_length

        source.floc = target_endstream = startstream + length
        endit = source.multiple(2)
        obj._stream = fdata[startstream:target_endstream]
        if endit == streamending:
            return

        # The length attribute does not match the distance between the
        # stream and endstream keywords.
        do_warn, self.private.warned_bad_stream_end = (
            self.warned_bad_stream_end, False)

        # TODO: Extract maxstream from dictionary of object offsets
        # and use rfind instead of find.
        maxstream = len(fdata) - 20
        endstream = fdata.find('endstream', startstream, maxstream)
        source.floc = startstream
        room = endstream - startstream
        if endstream < 0:
            source.error('Could not find endstream')
            return
        if (length == room + 1 and
                fdata[startstream - 2:startstream] == '\r\n'):
            source.warning(r"stream keyword terminated by \r without \n")
            obj._stream = fdata[startstream - 1:target_endstream - 1]
            return
        source.floc = endstream
        if length > room:
            source.error('stream /Length attribute (%d) appears to '
                         'be too big (size %d) -- adjusting', length, room)
            obj.stream = fdata[startstream:endstream]
            return
        if fdata[target_endstream:endstream].rstrip():
            source.error('stream /Length attribute (%d) might be '
                         'smaller than data size (%d)', length, room)
            obj.stream = fdata[startstream:endstream]
            return
        endobj = fdata.find('endobj', endstream, maxstream)
        if endobj < 0:
            source.error('Could not find endobj after endstream')
            return
        if fdata[endstream:endobj].rstrip() != 'endstream':
            source.error('Unexpected data between endstream and endobj')
            return
        source.error('Illegal endstream/endobj combination')

    def loadindirect(self, key):
        result = self.indirect_objects.get(key)
        if not isinstance(result, PdfIndirect):
            return result
        source = self.source
        offset = int(self.source.obj_offsets.get(key, '0'))
        if not offset:
            log.warning("Did not find PDF object %s" % (key,))
            return None

        # Read the object header and validate it
        objnum, gennum = key
        source.floc = offset
        objid = source.multiple(3)
        ok = len(objid) == 3
        ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
        ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
        ok = ok and objid[2] == 'obj'
        if not ok:
            source.floc = offset
            source.next()
            objheader = '%d %d obj' % (objnum, gennum)
            fdata = source.fdata
            offset2 = (fdata.find('\n' + objheader) + 1 or
                       fdata.find('\r' + objheader) + 1)
            if (not offset2 or
                    fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
                source.warning("Expected indirect object '%s'" % objheader)
                return None
            source.warning("Indirect object %s found at incorrect "
                           "offset %d (expected offset %d)" %
                           (objheader, offset2, offset))
            source.floc = offset2 + len(objheader)

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        func = self.special.get(obj)
        if func is not None:
            obj = func(source)

        self.indirect_objects[key] = obj
        self.deferred_objects.remove(key)

        # Mark the object as indirect, and
        # add it to the list of streams if it starts a stream
        obj.indirect = key
        tok = source.next()
        if tok != 'endobj':
            self.readstream(obj, self.findstream(obj, tok, source), source)
        return obj

    def findxref(fdata):
        ''' Find the cross reference section at the end of a file '''
        startloc = fdata.rfind('startxref')
        if startloc < 0:
            raise PdfParseError('Did not find "startxref" at end of file')
        source = PdfTokens(fdata, startloc, False)
        tok = source.next()
        assert tok == 'startxref'  # (We just checked this...)
        tableloc = source.next_default()
        if not tableloc.isdigit():
            source.exception('Expected table location')
        if source.next_default().rstrip().lstrip('%') != 'EOF':
            source.exception('Expected %%EOF')
        return startloc, PdfTokens(fdata, int(tableloc), True)
    findxref = staticmethod(findxref)

    # Parse through the byte stream when there's no xref table available.
    def slow_parse_xref(self, source):
        setdefault = source.obj_offsets.setdefault
        add_offset = source.all_offsets.append

        def get_obj_ids(fdata):
            m = re.findall(r'\d+\s\d+\sobj', fdata, re.DOTALL)
            return m

        fdata = source.fdata
        obj_ids = get_obj_ids(fdata)
        xref = {}
        cur_pos = 0
        for obj_id in obj_ids:
            cur_pos = fdata.find(obj_id, cur_pos)
            obj_idx_id = int(obj_id.split()[0])
            obj_gen_num = int(obj_id.split()[1])
            xref[obj_idx_id] = cur_pos
            # Done: fixed a parsing bug here.  Without advancing past the
            # match, "7 0 obj" and "17 0 obj" could be confused.
            cur_pos += len(obj_id)
        for objnum, offset in xref.items():
            generation = 0
            setdefault((objnum, generation), offset)
            add_offset(offset)

    def load_stream_objects(self, object_streams):
        # read object streams
        objs = []
        for num in object_streams:
            obj = self.findindirect(num, 0).real_value()
            assert obj.Type == '/ObjStm'
            objs.append(obj)

        # read objects from stream
        if objs:
            # Decrypt
            if self.crypt_filters is not None:
                crypt.decrypt_objects(
                    objs, self.stream_crypt_filter, self.crypt_filters)

            # Decompress
            uncompress(objs)

            for obj in objs:
                objsource = PdfTokens(obj.stream, 0, False)
                next = objsource.next
                offsets = []
                firstoffset = int(obj.First)
                while objsource.floc < firstoffset:
                    offsets.append((int(next()), firstoffset + int(next())))
                for num, offset in offsets:
                    # Read the object, and call special code if it starts
                    # an array or dictionary
                    objsource.floc = offset
                    sobj = next()
                    func = self.special.get(sobj)
                    if func is not None:
                        sobj = func(objsource)

                    key = (num, 0)
                    self.indirect_objects[key] = sobj
                    if key in self.deferred_objects:
                        self.deferred_objects.remove(key)

                    # Mark the object as indirect, and
                    # add it to the list of streams if it starts a stream
                    sobj.indirect = key

    ### YIZHENG: if xref is a table, pretty much the same as old_parsexref()
    def parse_xref_table(self, source, int=int, range=range):
        ''' Parse (one of) the cross-reference file section(s) '''
        setdefault = source.obj_offsets.setdefault
        next = source.next
        # plain xref table
        start = source.floc
        try:
            while 1:
                tok = next()
                if tok == 'trailer':
                    return
                startobj = int(tok)
                for objnum in range(startobj, startobj + int(next())):
                    offset = int(next())
                    generation = int(next())
                    inuse = next()
                    if inuse == 'n':
                        if offset != 0:
                            setdefault((objnum, generation), offset)
                    elif inuse != 'f':
                        raise ValueError
        except:
            pass
        try:
            # Table formatted incorrectly.
            # See if we can figure it out anyway.
            end = source.fdata.rindex('trailer', start)
            table = source.fdata[start:end].splitlines()
            for line in table:
                tokens = line.split()
                if len(tokens) == 2:
                    objnum = int(tokens[0])
                elif len(tokens) == 3:
                    offset, generation, inuse = (int(tokens[0]),
                                                 int(tokens[1]), tokens[2])
                    if offset != 0 and inuse == 'n':
                        setdefault((objnum, generation), offset)
                    objnum += 1
                elif tokens:
                    log.error('Invalid line in xref table: %s' % repr(line))
                    raise ValueError
            log.warning('Badly formatted xref table')
            source.floc = end
            next()
        except:
            source.floc = start
            source.exception('Invalid table format')

    ### YIZHENG: if the xref is a stream
    def parse_xref_stream(self, source, int=int, range=range,
                          enumerate=enumerate, islice=itertools.islice,
                          defaultdict=collections.defaultdict,
                          hexlify=binascii.hexlify):
        ''' Parse (one of) the cross-reference file section(s) '''

        def readint(s, lengths):
            offset = 0
            for length in itertools.cycle(lengths):
                next = offset + length
                yield int(hexlify(s[offset:next]), 16) if length else None
                offset = next

        setdefault = source.obj_offsets.setdefault
        next = source.next
        # check for xref stream object
        objid = source.multiple(3)
        ok = len(objid) == 3
        ok = ok and objid[0].isdigit()
        ok = ok and objid[1] == 'obj'
        ok = ok and objid[2] == '<<'
        if not ok:
            source.exception('Expected xref stream start')
        obj = self.readdict(source)
        if obj.Type != PdfName.XRef:
            source.exception('Expected dict type of /XRef')
        tok = next()
        self.readstream(obj, self.findstream(obj, tok, source), source, True)
        old_strm = obj.stream
        if not uncompress([obj], True):
            source.exception('Could not decompress Xref stream')
        stream = obj.stream
        # Fix for issue #76 -- goofy compressed xref stream
        # that is NOT ACTUALLY COMPRESSED
        stream = stream if stream is not old_strm else convert_store(old_strm)
        num_pairs = obj.Index or PdfArray(['0', obj.Size])
        num_pairs = [int(x) for x in num_pairs]
        num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
        entry_sizes = [int(x) for x in obj.W]
        if len(entry_sizes) != 3:
            source.exception('Invalid entry size')
        object_streams = defaultdict(list)
        get = readint(stream, entry_sizes)
        for objnum, size in num_pairs:
            for cnt in range(size):
                xtype, p1, p2 = islice(get, 3)
                if xtype in (1, None):
                    if p1:
                        setdefault((objnum, p2 or 0), p1)
                elif xtype == 2:
                    object_streams[p1].append((objnum, p2))
                objnum += 1
        obj.private.object_streams = object_streams
        return obj

    ### YIZHENG: new parsexref
    def parsexref(self, source):
        ''' Parse (one of) the cross-reference file section(s) '''
        next = source.next
        try:
            tok = next()
        except StopIteration:
            tok = ''
        if tok.isdigit():
            return self.parse_xref_stream(source), True
        elif tok == 'xref':
            self.parse_xref_table(source)
            tok = next()
            if tok != '<<':
                source.exception('Expected "<<" starting catalog')
            return self.readdict(source), False
        else:
            source.exception('Expected "xref" keyword or xref stream object')

    def old_parsexref(self, source, int=int, range=range):
        ''' Parse (one of) the cross-reference file section(s) '''
        fdata = source.fdata
        setdefault = source.obj_offsets.setdefault
        add_offset = source.all_offsets.append
        next = source.next
        tok = next()
        if tok != 'xref':
            source.exception('Expected "xref" keyword')
        start = source.floc
        try:
            while 1:
                tok = next()
                if tok == 'trailer':
                    return
                startobj = int(tok)
                for objnum in range(startobj, startobj + int(next())):
                    offset = int(next())
                    generation = int(next())
                    inuse = next()
                    if inuse == 'n':
                        if offset != 0:
                            setdefault((objnum, generation), offset)
                            add_offset(offset)
                    elif inuse != 'f':
                        raise ValueError
        except:
            pass
        try:
            # Table formatted incorrectly.  See if
            # we can figure it out anyway.
            end = source.fdata.rindex('trailer', start)
            table = source.fdata[start:end].splitlines()
            for line in table:
                tokens = line.split()
                if len(tokens) == 2:
                    objnum = int(tokens[0])
                elif len(tokens) == 3:
                    offset, generation, inuse = (int(tokens[0]),
                                                 int(tokens[1]), tokens[2])
                    if offset != 0 and inuse == 'n':
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                    objnum += 1
                elif tokens:
                    log.error('Invalid line in xref table: %s' % repr(line))
                    raise ValueError
            log.warning('Badly formatted xref table')
            source.floc = end
            source.next()
        except:
            source.floc = start
            source.exception('Invalid table format')

    def readpages(self, node):
        pagename = PdfName.Page
        pagesname = PdfName.Pages
        catalogname = PdfName.Catalog
        typename = PdfName.Type
        kidname = PdfName.Kids

        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        def readnode(node):
            nodetype = node[typename]
            if nodetype == pagename:
                yield node
            elif nodetype == pagesname:
                if type(node[kidname]) == PdfArray:
                    for node in node[kidname]:
                        for node in readnode(node):
                            yield node
                # else it's one PdfDict
                else:
                    for node in readnode(node[kidname]):
                        yield node
            elif nodetype == catalogname:
                for node in readnode(node[pagesname]):
                    yield node
            else:
                log.error('Expected /Page or /Pages dictionary, got %s' %
                          repr(node))

        try:
            return list(readnode(node))
        except (AttributeError, TypeError) as s:
            log.error('Invalid page tree: %s' % s)
            return []
        except RuntimeError as s:
            log.error('Invalid page tree RuntimeError: %s' % s)
            return []
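# For reference, the object streams consumed by load_stream_objects() above
# look like this (PDF 1.5+): /N is the number of contained objects, /First is
# the byte offset of the first object body, and the stream data begins with
# /N pairs of "objnum offset" integers, offsets relative to /First:
#
#   5 0 obj
#   << /Type /ObjStm /N 2 /First 11 /Length ... /Filter /FlateDecode >>
#   stream
#   12 0 13 18
#   << /Some /Dict >> << /Another /Dict >>
#   endstream
#   endobj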
def error(self, *arg):
    log.error(self.msg(*arg))
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance,
               list=list, len=len):
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = ('Not decompressing: cannot use filter %s '
                   'with parameters %s' % (repr(ftype), repr(parms)))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
                if parms:
                    # try png predictor
                    predictor = int(parms['/Predictor']) or 1
                    # predictor 1 == no predictor
                    if predictor != 1:
                        columns = int(parms['/Columns'])
                        # PNG prediction:
                        if predictor >= 10 and predictor <= 15:
                            output = StringIO()
                            # PNG prediction can vary from row to row
                            rowlen = columns + 1
                            assert len(data) % rowlen == 0
                            prev_rowdata = (0,) * rowlen
                            for row in xrange(len(data) / rowlen):
                                rowdata = [ord(x) for x in
                                           data[(row * rowlen):
                                                ((row + 1) * rowlen)]]
                                filter_byte = rowdata[0]
                                if filter_byte == 0:
                                    pass
                                elif filter_byte == 1:
                                    for i in xrange(2, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      rowdata[i - 1]) % 256
                                elif filter_byte == 2:
                                    for i in xrange(1, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      prev_rowdata[i]) % 256
                                else:
                                    # unsupported PNG filter
                                    raise Exception(('Unsupported PNG '
                                                     'filter %r') %
                                                    filter_byte)
                                prev_rowdata = rowdata
                                output.write(''.join(
                                    [chr(x) for x in rowdata[1:]]))
                            data = output.getvalue()
                        else:
                            # unsupported predictor
                            raise Exception(('Unsupported flatedecode'
                                             ' predictor %r') % predictor)
            except Exception as s:
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
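# A small worked example of the PNG "Up" predictor (filter byte 2) that the
# code above reverses (illustration only; Python 2 to match the code above).
# With columns=3, each encoded row is 1 filter byte plus 3 data bytes, and
# each decoded byte is (raw + byte_above) % 256:
def _demo_png_up_predictor():
    prev_rowdata = [0, 10, 20, 30]      # previously decoded row
    rowdata = [2, 5, 250, 1]            # filter byte 2, then raw deltas
    for i in xrange(1, len(rowdata)):
        rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
    assert rowdata[1:] == [15, 14, 31]  # (5+10), (250+20)%256, (1+30)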
def __init__(self, fname=None, fdata=None, decompress=False,
             decrypt=False, password='', disable_gc=True,
             slow_parsing=True):

    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: it is not necessary to truncate the string --
        # some PDFs just use a wrong EOF at the end to confuse parsers.
        # fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Call next() once to complete the structure of source
            # by adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                assert source.next() == "trailer"  # trailer
                tok = source.next()                # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # Ignore a corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except:
                    pass
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)
            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
            # The name used by the slow-parsing path is newdict.
            self.update(newdict)
        else:
            """
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                newdict = self.readdict(source)
                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning('Expected "startxref" at end of xref table')
                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()
            if xref_table_list:
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict
            # The old name was newdict; below, the new name is trailer.
            self.update(newdict)
            """
            ### NEW STUFF BEGINS HERE
            startloc, source = self.findxref(fdata)
            private.source = source

            # Find all the xref tables/streams, and
            # then deal with them backwards.
            xref_list = []
            while 1:
                source.obj_offsets = {}
                trailer, is_stream = self.parsexref(source)
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref' and not xref_list:
                        source.warning('Expected "startxref" '
                                       'at end of xref table')
                    break
                xref_list.append((source.obj_offsets, trailer, is_stream))
                source.floc = int(prev)

            # Handle document encryption
            private.crypt_filters = None
            if decrypt and PdfName.Encrypt in trailer:
                identity_filter = crypt.IdentityCryptFilter()
                crypt_filters = {PdfName.Identity: identity_filter}
                private.crypt_filters = crypt_filters
                private.stream_crypt_filter = identity_filter
                private.string_crypt_filter = identity_filter
                if not crypt.HAS_CRYPTO:
                    raise PdfParseError(
                        'Install PyCrypto to enable encryption support')
                self._parse_encrypt_info(source, password, trailer)

            if is_stream:
                self.load_stream_objects(trailer.object_streams)

            while xref_list:
                later_offsets, later_trailer, is_stream = xref_list.pop()
                source.obj_offsets.update(later_offsets)
                if is_stream:
                    trailer.update(later_trailer)
                    self.load_stream_objects(later_trailer.object_streams)
                else:
                    trailer = later_trailer

            trailer.Prev = None

            if (trailer.Version and
                    float(trailer.Version) > float(self.version)):
                self.private.version = trailer.Version

            if decrypt:
                self.decrypt_all()
                trailer.Encrypt = None

            if is_stream:
                self.Root = trailer.Root
                self.Info = trailer.Info
                self.ID = trailer.ID
                self.Size = trailer.Size
                self.Encrypt = trailer.Encrypt
            else:
                self.update(trailer)
            ### NEW STUFF ENDS HERE

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    fname_trace = fname + '.trace'
    if os.path.isfile(fname_trace):
        f = open(fname_trace, 'rb')
        private.active_trace = pickle.load(f)
        f.close()
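# A minimal usage sketch for the reader above (illustration only; assumes
# this module is importable as shown and that a local 'example.pdf' exists;
# the optional '.trace' side file is only loaded if present):
def _demo_reader():
    reader = PdfReader('example.pdf', decompress=True, slow_parsing=False)
    print('%d page(s)' % reader.numPages)
    for page in reader.pages:
        print(page.MediaBox)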
def parsexref(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s) '''

    def _pairs(array):
        i = 0
        while 1:
            yield int(array[i]), int(array[i + 1])
            i += 2
            if (i + 1) >= len(array):
                break

    def convert_to_int(d, size):
        if size > 8:
            source.exception('Invalid size in convert_to_int')
        d = '\x00\x00\x00\x00\x00\x00\x00\x00' + d
        d = d[-8:]
        return struct.unpack('>q', d)[0]

    def read_trailer():
        tok = next()
        if tok != '<<':
            source.exception('Expected "<<" starting catalog')
        return self.readdict(source)

    setdefault = source.obj_offsets.setdefault
    add_offset = source.all_offsets.append
    next = source.next
    tok = next()
    if tok.isdigit():
        # check for xref stream object
        objid = source.multiple(2)
        ok = len(objid) == 2
        ok = ok and objid[0].isdigit()
        ok = ok and objid[1] == 'obj'
        if ok:
            next()  # start of dict
            obj = self.readdict(source)
            assert obj.Type == '/XRef'
            tok = next()
            end = source.floc + int(obj.Length)
            self.readstream(obj, self.findstream(obj, tok, source), source)
            uncompress([obj])
            num_pairs = obj.Index or PdfArray(['0', obj.Size])
            entry_sizes = [int(x) for x in obj.W]
            object_streams = {}
            for num, size in _pairs(num_pairs):
                cnt = 0
                stream_offset = 0
                while cnt < size:
                    for i in range(len(entry_sizes)):
                        d = obj.stream[stream_offset:
                                       stream_offset + entry_sizes[i]]
                        stream_offset += entry_sizes[i]
                        di = convert_to_int(d, entry_sizes[i])
                        if i == 0:
                            xref_type = di
                            if xref_type == 0 and entry_sizes[0] == 0:
                                xref_type = 1
                        elif i == 1:
                            if xref_type == 1:
                                offset = di
                            elif xref_type == 2:
                                objnum = di
                        elif i == 2:
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 1 and offset != 0:
                        setdefault((num, generation), offset)
                        add_offset(offset)
                    elif xref_type == 2:
                        if objnum not in object_streams:
                            object_streams[objnum] = []
                        object_streams[objnum].append(obstr_idx)
                    cnt += 1
                    num += 1
            self.load_stream_objects(object_streams)
            source.floc = end
            endit = source.multiple(2)
            if endit != ['endstream', 'endobj']:
                source.exception('Expected endstream endobj')
            return obj
        else:
            source.exception('Expected xref stream')
    elif tok == 'xref':
        # plain xref table
        start = source.floc
        try:
            while 1:
                tok = next()
                if tok == 'trailer':
                    return read_trailer()
                startobj = int(tok)
                for objnum in range(startobj, startobj + int(next())):
                    offset = int(next())
                    generation = int(next())
                    inuse = next()
                    if inuse == 'n':
                        if offset != 0:
                            setdefault((objnum, generation), offset)
                            add_offset(offset)
                    elif inuse != 'f':
                        raise ValueError
        except:
            pass
        try:
            # Table formatted incorrectly.
            # See if we can figure it out anyway.
            end = source.fdata.rindex('trailer', start)
            table = source.fdata[start:end].splitlines()
            for line in table:
                tokens = line.split()
                if len(tokens) == 2:
                    objnum = int(tokens[0])
                elif len(tokens) == 3:
                    offset, generation, inuse = \
                        int(tokens[0]), int(tokens[1]), tokens[2]
                    if offset != 0 and inuse == 'n':
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                    objnum += 1
                elif tokens:
                    log.error('Invalid line in xref table: %s' % repr(line))
                    raise ValueError
            log.warning('Badly formatted xref table')
            source.floc = end
            next()
        except:
            source.floc = start
            source.exception('Invalid table format')
        return read_trailer()
    else:
        source.exception('Expected "xref" keyword or xref stream object')
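# A worked example of the fixed-width entry decoding done by convert_to_int()
# above (illustration only).  With /W [1 2 1], each xref-stream entry is four
# bytes: a 1-byte type, a 2-byte big-endian field, and a 1-byte field, so the
# entry '\x01\x02\x9a\x00' decodes as:
#
#   type   = 0x01          -> regular in-use object
#   field2 = 0x029a = 666  -> byte offset of the object
#   field3 = 0x00          -> generation number 0
#
# Type 2 entries instead give (object stream number, index within it).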
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):

    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: it is not necessary to truncate the string --
        # some PDFs just use a wrong EOF at the end to confuse parsers.
        # fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Call next() once to complete the structure of source
            # by adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                assert source.next() == "trailer"  # trailer
                tok = source.next()                # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                # Ignore a corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except:
                    pass
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)
            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')
                newdict = self.readdict(source)
                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning('Expected "startxref" at end of '
                                   'xref table')
                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()
            if xref_table_list:
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict

        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    fname_trace = fname + '.trace'
    if os.path.isfile(fname_trace):
        f = open(fname_trace, 'rb')
        private.active_trace = pickle.load(f)
        f.close()