def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):
    """Read a PDF and populate this reader from its trailer and pages.

    Parameters:
        fname: path of the file to read, or any object with a ``read``
            method.  Must be None when ``fdata`` is supplied.
        fdata: the raw contents of the PDF.  Must be None when ``fname``
            is supplied.
        decompress: when true, ``self.uncompress()`` is called once the
            page list has been read.
        disable_gc: when true (and the collector is currently enabled)
            garbage collection is switched off while parsing and switched
            back on afterwards, even if parsing fails.
        slow_parsing: when equal to True, the xref information is rebuilt
            with ``slow_parse_xref`` and every ``trailer`` keyword found
            in the data is read, the resulting dictionaries being merged
            with ``update()`` in file order.  Otherwise the xref tables
            are located with ``findxref`` and followed through their
            ``Prev`` entries.

    Raises:
        PdfParseError: the file cannot be read, is empty, or contains no
            ``%PDF-`` header.

    If ``fname`` is a path and ``<fname>.trace`` exists, that file is
    unpickled into ``private.active_trace``.
    """
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the handle even when
                    # read() itself fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError(
                        'Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError(
                    'Invalid PDF header: %s' % repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: It is not necessary to truncate the string.
        # Some PDFs just use wrong EOF at the end to confuse parsers.
        # fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        # Kept as an equality test so that truthy values other than
        # True/1 keep selecting the xref-following branch as before.
        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just to complete the structure of source by
            # adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                # The token must be consumed outside the assert: with
                # "python -O" asserts are stripped, and the tokenizer
                # would otherwise never move past the keyword.
                tok = source.next()  # trailer
                assert tok == "trailer"
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                # Ignored the corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except Exception:
                    # Best effort: skip this trailer, but leave a trace
                    # in the log and let KeyboardInterrupt/SystemExit
                    # propagate.
                    log.warning('Ignoring corrupted trailer at offset %d'
                                % trailer_loc)
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)

            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                newdict = self.readdict(source)

                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning(
                        'Expected "startxref" at end of xref table')

                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # The first table read is the one whose trailer and
                    # indirect objects are restored after the loop.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()

            if xref_table_list:
                # Apply the saved tables last-read first, so the entries
                # of the first table read are written last and win.
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict
        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    # Only a filesystem path can have a sidecar file; concatenating onto
    # None (fdata given) or onto a stream object raises TypeError.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            # NOTE(security): pickle.load can execute arbitrary code.
            # Only open PDFs whose ".trace" sidecar comes from a trusted
            # source.
            with open(fname_trace, 'rb') as f:
                private.active_trace = pickle.load(f)
def __init__(self, fname=None, fdata=None, decompress=False,
             decrypt=False, password='', disable_gc=True,
             slow_parsing=True):
    """Read a PDF and populate this reader from its trailer and pages.

    Parameters:
        fname: path of the file to read, or any object with a ``read``
            method.  Must be None when ``fdata`` is supplied.
        fdata: the raw contents of the PDF.  Must be None when ``fname``
            is supplied.
        decompress: when true, ``self.uncompress()`` is called once the
            page list has been read.
        decrypt: only consulted when ``slow_parsing`` is not True.  When
            true and the trailer contains ``Encrypt``, crypt filters are
            installed and ``_parse_encrypt_info`` is called; afterwards
            ``decrypt_all()`` runs and ``trailer.Encrypt`` is cleared.
        password: passed to ``_parse_encrypt_info`` together with the
            trailer.
        disable_gc: when true (and the collector is currently enabled)
            garbage collection is switched off while parsing and switched
            back on afterwards, even if parsing fails.
        slow_parsing: when equal to True, the xref information is rebuilt
            with ``slow_parse_xref`` and every ``trailer`` keyword found
            in the data is read, the resulting dictionaries being merged
            with ``update()`` in file order.  Otherwise the xref
            tables/streams are located with ``findxref`` and followed
            through their ``Prev`` entries.

    Raises:
        PdfParseError: the file cannot be read, is empty, contains no
            ``%PDF-`` header, or decryption was requested while
            ``crypt.HAS_CRYPTO`` is false.

    If ``fname`` is a path and ``<fname>.trace`` exists, that file is
    unpickled into ``private.active_trace``.
    """
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the handle even when
                    # read() itself fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError(
                        'Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError(
                    'Invalid PDF header: %s' % repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: It is not necessary to truncate the string.
        # Some PDFs just use wrong EOF at the end to confuse parsers.
        # fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        # Kept as an equality test so that truthy values other than
        # True/1 keep selecting the xref-following branch as before.
        if slow_parsing == True:
            # NOTE(review): this branch never reads `decrypt` or
            # `password` and never sets private.crypt_filters - confirm
            # that encrypted input is not expected here.
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just to complete the structure of source by
            # adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                # The token must be consumed outside the assert: with
                # "python -O" asserts are stripped, and the tokenizer
                # would otherwise never move past the keyword.
                tok = source.next()  # trailer
                assert tok == "trailer"
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                # Ignored the corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except Exception:
                    # Best effort: skip this trailer, but leave a trace
                    # in the log and let KeyboardInterrupt/SystemExit
                    # propagate.
                    log.warning('Ignoring corrupted trailer at offset %d'
                                % trailer_loc)
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)

            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
            # the name in slowparsing is newdict
            self.update(newdict)
        else:
            # (A string literal holding the superseded xref-table loop
            # used to sit here; it was a no-op and has been removed.)
            ### NEW STUFF BEGINS HERE
            startloc, source = self.findxref(fdata)
            private.source = source

            # Find all the xref tables/streams, and
            # then deal with them backwards.
            xref_list = []
            while 1:
                source.obj_offsets = {}
                trailer, is_stream = self.parsexref(source)
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref' and not xref_list:
                        source.warning('Expected "startxref" '
                                       'at end of xref table')
                    break
                xref_list.append((source.obj_offsets, trailer, is_stream))
                source.floc = int(prev)

            # Handle document encryption
            private.crypt_filters = None
            if decrypt and PdfName.Encrypt in trailer:
                identity_filter = crypt.IdentityCryptFilter()
                crypt_filters = {PdfName.Identity: identity_filter}
                private.crypt_filters = crypt_filters
                private.stream_crypt_filter = identity_filter
                private.string_crypt_filter = identity_filter

                if not crypt.HAS_CRYPTO:
                    raise PdfParseError(
                        'Install PyCrypto to enable encryption support')

                self._parse_encrypt_info(source, password, trailer)

            if is_stream:
                self.load_stream_objects(trailer.object_streams)

            # Pop the saved sections in reverse order of reading, so the
            # section read first is applied last.
            while xref_list:
                later_offsets, later_trailer, is_stream = xref_list.pop()
                source.obj_offsets.update(later_offsets)
                if is_stream:
                    trailer.update(later_trailer)
                    self.load_stream_objects(later_trailer.object_streams)
                else:
                    trailer = later_trailer

            trailer.Prev = None

            if (trailer.Version and
                    float(trailer.Version) > float(self.version)):
                self.private.version = trailer.Version

            if decrypt:
                self.decrypt_all()
                trailer.Encrypt = None

            if is_stream:
                self.Root = trailer.Root
                self.Info = trailer.Info
                self.ID = trailer.ID
                self.Size = trailer.Size
                self.Encrypt = trailer.Encrypt
            else:
                self.update(trailer)
            ### NEW STUFF ENDS HERE

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    # Only a filesystem path can have a sidecar file; concatenating onto
    # None (fdata given) or onto a stream object raises TypeError.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            # NOTE(security): pickle.load can execute arbitrary code.
            # Only open PDFs whose ".trace" sidecar comes from a trusted
            # source.
            with open(fname_trace, 'rb') as f:
                private.active_trace = pickle.load(f)
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True, slow_parsing=True):
    """Read a PDF and populate this reader from its trailer and pages.

    Parameters:
        fname: path of the file to read, or any object with a ``read``
            method.  Must be None when ``fdata`` is supplied.
        fdata: the raw contents of the PDF.  Must be None when ``fname``
            is supplied.
        decompress: when true, ``self.uncompress()`` is called once the
            page list has been read.
        disable_gc: when true (and the collector is currently enabled)
            garbage collection is switched off while parsing and switched
            back on afterwards, even if parsing fails.
        slow_parsing: when equal to True, the xref information is rebuilt
            with ``slow_parse_xref`` and every ``trailer`` keyword found
            in the data is read, the resulting dictionaries being merged
            with ``update()`` in file order.  Otherwise the xref tables
            are located with ``findxref`` and followed through their
            ``Prev`` entries.

    Raises:
        PdfParseError: the file cannot be read, is empty, or contains no
            ``%PDF-`` header.

    If ``fname`` is a path and ``<fname>.trace`` exists, that file is
    unpickled into ``private.active_trace``.
    """
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the handle even when
                    # read() itself fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError(
                        'Could not read PDF file %s' % fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError(
                    'Invalid PDF header: %s' % repr(lines[0]))

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            log.error('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc = len(fdata) - 6
        endloc += 6
        junk = fdata[endloc:]
        # Done: It is not necessary to truncate the string.
        # Some PDFs just use wrong EOF at the end to confuse parsers.
        # fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        private.special = {
            '<<': self.readdict,
            '[': self.readarray,
            'endobj': self.empty_obj,
        }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        # Kept as an equality test so that truthy values other than
        # True/1 keep selecting the xref-following branch as before.
        if slow_parsing == True:
            startloc = 0
            source = PdfTokens(fdata, startloc, True)
            private.source = source
            # Calling next() just to complete the structure of source by
            # adding source.current.
            source.next()
            source.all_offsets = []
            source.obj_offsets = {}
            self.slow_parse_xref(source)

            # Done: add slow parsing for multiple trailers.
            trailer_loc = fdata.find('trailer')
            newdict = None
            while trailer_loc >= 0:
                source.floc = trailer_loc
                # The token must be consumed outside the assert: with
                # "python -O" asserts are stripped, and the tokenizer
                # would otherwise never move past the keyword.
                tok = source.next()  # trailer
                assert tok == "trailer"
                tok = source.next()  # <<
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                # Ignored the corrupted trailer.
                try:
                    tmpdict = self.readdict(source)
                except Exception:
                    # Best effort: skip this trailer, but leave a trace
                    # in the log and let KeyboardInterrupt/SystemExit
                    # propagate.
                    log.warning('Ignoring corrupted trailer at offset %d'
                                % trailer_loc)
                else:
                    if not newdict:
                        newdict = tmpdict
                    else:
                        newdict.update(tmpdict)
                finally:
                    trailer_loc = fdata.find('trailer', trailer_loc + 1)

            if newdict is not None:
                newdict.Prev = None
            else:
                source.exception("No trailer.")
        else:
            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                newdict = self.readdict(source)

                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning(
                        'Expected "startxref" at end of xref table')

                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # The first table read is the one whose trailer and
                    # indirect objects are restored after the loop.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()

            if xref_table_list:
                # Apply the saved tables last-read first, so the entries
                # of the first table read are written last and win.
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict
        self.update(newdict)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

    # load the trace
    # Only a filesystem path can have a sidecar file; concatenating onto
    # None (fdata given) or onto a stream object raises TypeError.
    if fname is not None and not hasattr(fname, 'read'):
        fname_trace = fname + '.trace'
        if os.path.isfile(fname_trace):
            # NOTE(security): pickle.load can execute arbitrary code.
            # Only open PDFs whose ".trace" sidecar comes from a trusted
            # source.
            with open(fname_trace, 'rb') as f:
                private.active_trace = pickle.load(f)