Ejemplo n.º 1
0
 def findxref(fdata):
     ''' Find the cross reference section at the end of a file
     '''
     startloc = fdata.rfind('startxref')
     if startloc < 0:
         raise PdfParseError('Did not find "startxref" at end of file')
     source = PdfTokens(fdata, startloc, False)
     tok = source.next()
     assert tok == 'startxref'  # (We just checked this...)
     tableloc = source.next_default()
     if not tableloc.isdigit():
         source.exception('Expected table location')
     if source.next_default().rstrip().lstrip('%') != 'EOF':
         source.exception('Expected %%EOF')
     return startloc, PdfTokens(fdata, int(tableloc), True)
Ejemplo n.º 2
0
 def findxref(fdata):
     ''' Find the cross reference section at the end of a file
     '''
     startloc = fdata.rfind('startxref')
     if startloc < 0:
         raise PdfParseError('Did not find "startxref" at end of file')
     source = PdfTokens(fdata, startloc, False)
     tok = source.next()
     assert tok == 'startxref'  # (We just checked this...)
     tableloc = source.next_default()
     if not tableloc.isdigit():
         source.exception('Expected table location')
     if source.next_default().rstrip().lstrip('%') != 'EOF':
         source.exception('Expected %%EOF')
     return startloc, PdfTokens(fdata, int(tableloc), True)
Ejemplo n.º 3
0
    def __init__(self,
                 fname=None,
                 fdata=None,
                 decompress=False,
                 decrypt=False,
                 password='',
                 disable_gc=True,
                 slow_parsing=True):

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                log.error('EOF mark not found: %s' % repr(fdata[-20:]))
                endloc = len(fdata) - 6
            endloc += 6
            junk = fdata[endloc:]
            # Done: It is not necessary to truncate the string.
            #       Some PDFs just use wrong EOF at the end to confuse parsers.
            #fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {
                '<<': self.readdict,
                '[': self.readarray,
                'endobj': self.empty_obj,
            }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken
            if slow_parsing == True:
                startloc = 0
                source = PdfTokens(fdata, startloc, True)
                private.source = source
                # Calling next() just for complete the structure of source by adding source.current.
                source.next()
                source.all_offsets = []
                source.obj_offsets = {}
                self.slow_parse_xref(source)

                # Done: add slow parsing for multiple trailers.
                trailer_loc = fdata.find('trailer')
                newdict = None
                while trailer_loc >= 0:
                    source.floc = trailer_loc
                    assert source.next() == "trailer"  # trailer
                    tok = source.next()  # <<
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    # Ignored the corrupted trailer.
                    try:
                        tmpdict = self.readdict(source)
                    except:
                        pass
                    else:
                        if not newdict:
                            newdict = tmpdict
                        else:
                            newdict.update(tmpdict)
                    finally:
                        trailer_loc = fdata.find('trailer', trailer_loc + 1)

                if newdict is not None:
                    newdict.Prev = None
                else:
                    source.exception("No trailer.")

                # the name in slowparsing is newdict
                self.update(newdict)
            else:
                """
                startloc, source = self.findxref(fdata)
                private.source = source
                xref_table_list = []
                source.all_offsets = []
                while 1:
                    source.obj_offsets = {}
                    # Loop through all the cross-reference tables
                    self.parsexref(source)
                    tok = source.next()
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    newdict = self.readdict(source)

                    token = source.next()
                    if token != 'startxref' and not xref_table_list:
                        source.warning('Expected "startxref" at end of xref table')

                    # Loop if any previously-written tables.
                    prev = newdict.Prev
                    if prev is None:
                        break
                    if not xref_table_list:
                        newdict.Prev = None
                        original_indirect = self.indirect_objects.copy()
                        original_newdict = newdict
                    source.floc = int(prev)
                    xref_table_list.append(source.obj_offsets)
                    self.indirect_objects.clear()

                if xref_table_list:
                    for update in reversed(xref_table_list):
                        source.obj_offsets.update(update)
                    self.indirect_objects.clear()
                    self.indirect_objects.update(original_indirect)
                    newdict = original_newdict
                # old name is newdict, below the new name is trailer
                self.update(newdict)
                """
                ### NEW STUFF BEGINS HERE
                startloc, source = self.findxref(fdata)
                private.source = source

                # Find all the xref tables/streams, and
                # then deal with them backwards.
                xref_list = []
                while 1:
                    source.obj_offsets = {}
                    trailer, is_stream = self.parsexref(source)
                    prev = trailer.Prev
                    if prev is None:
                        token = source.next()
                        if token != 'startxref' and not xref_list:
                            source.warning('Expected "startxref" '
                                           'at end of xref table')
                        break
                    xref_list.append((source.obj_offsets, trailer, is_stream))
                    source.floc = int(prev)
                #print 'xref_list:', xref_list
                #print 'trailer:', trailer
                # Handle document encryption
                private.crypt_filters = None
                if decrypt and PdfName.Encrypt in trailer:
                    identity_filter = crypt.IdentityCryptFilter()
                    crypt_filters = {PdfName.Identity: identity_filter}
                    private.crypt_filters = crypt_filters
                    private.stream_crypt_filter = identity_filter
                    private.string_crypt_filter = identity_filter

                    if not crypt.HAS_CRYPTO:
                        raise PdfParseError(
                            'Install PyCrypto to enable encryption support')

                    self._parse_encrypt_info(source, password, trailer)

                if is_stream:
                    self.load_stream_objects(trailer.object_streams)

                while xref_list:
                    later_offsets, later_trailer, is_stream = xref_list.pop()
                    source.obj_offsets.update(later_offsets)
                    if is_stream:
                        trailer.update(later_trailer)
                        self.load_stream_objects(later_trailer.object_streams)
                    else:
                        trailer = later_trailer

                trailer.Prev = None

                if (trailer.Version
                        and float(trailer.Version) > float(self.version)):
                    self.private.version = trailer.Version

                if decrypt:
                    self.decrypt_all()
                    trailer.Encrypt = None

                if is_stream:
                    self.Root = trailer.Root
                    self.Info = trailer.Info
                    self.ID = trailer.ID
                    self.Size = trailer.Size
                    self.Encrypt = trailer.Encrypt
                else:
                    self.update(trailer)

                ### NEW STUFF ENDS HERE

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

            # load the trace
            fname_trace = fname + '.trace'
            if os.path.isfile(fname_trace):
                f = open(fname_trace, 'rb')
                private.active_trace = pickle.load(f)
                f.close()
Ejemplo n.º 4
0
    def __init__(self, fname=None, fdata=None, decompress=False,
                 disable_gc=True, slow_parsing=True):

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                log.error('EOF mark not found: %s' %
                                    repr(fdata[-20:]))
                endloc = len(fdata) - 6
            endloc += 6
            junk = fdata[endloc:]
            # Done: It is not necessary to truncate the string.
            #       Some PDFs just use wrong EOF at the end to confuse parsers.
            #fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {'<<': self.readdict,
                               '[': self.readarray,
                               'endobj': self.empty_obj,
                               }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken
            if slow_parsing == True:
                startloc = 0
                source = PdfTokens(fdata, startloc, True)
                private.source = source
                # Calling next() just for complete the structure of source by adding source.current.
                source.next()
                source.all_offsets = []
                source.obj_offsets = {}
                self.slow_parse_xref(source)

                # Done: add slow parsing for multiple trailers.
                trailer_loc = fdata.find('trailer')
                newdict = None
                while trailer_loc >= 0:
                    source.floc = trailer_loc
                    assert source.next() == "trailer" # trailer
                    tok = source.next() # <<
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    # Ignored the corrupted trailer.
                    try:
                        tmpdict = self.readdict(source)
                    except:
                        pass
                    else:
                        if not newdict:
                            newdict = tmpdict
                        else:
                            newdict.update(tmpdict)
                    finally:
                        trailer_loc = fdata.find('trailer', trailer_loc+1)
                    
                if newdict is not None:
                    newdict.Prev = None
                else:
                    source.exception("No trailer.")
            else:
                startloc, source = self.findxref(fdata)
                private.source = source
                xref_table_list = []
                source.all_offsets = []
                while 1:
                    source.obj_offsets = {}
                    # Loop through all the cross-reference tables
                    self.parsexref(source)
                    tok = source.next()
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    newdict = self.readdict(source)

                    token = source.next()
                    if token != 'startxref' and not xref_table_list:
                        source.warning('Expected "startxref" at end of xref table')

                    # Loop if any previously-written tables.
                    prev = newdict.Prev
                    if prev is None:
                        break
                    if not xref_table_list:
                        newdict.Prev = None
                        original_indirect = self.indirect_objects.copy()
                        original_newdict = newdict
                    source.floc = int(prev)
                    xref_table_list.append(source.obj_offsets)
                    self.indirect_objects.clear()

                if xref_table_list:
                    for update in reversed(xref_table_list):
                        source.obj_offsets.update(update)
                    self.indirect_objects.clear()
                    self.indirect_objects.update(original_indirect)
                    newdict = original_newdict
            self.update(newdict)

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

            # load the trace
            fname_trace = fname + '.trace'
            if os.path.isfile(fname_trace):
                f = open(fname_trace, 'rb')
                private.active_trace = pickle.load(f)
                f.close()
Ejemplo n.º 5
0
    def __init__(self,
                 fname=None,
                 fdata=None,
                 decompress=False,
                 disable_gc=True,
                 slow_parsing=True):

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                log.error('EOF mark not found: %s' % repr(fdata[-20:]))
                endloc = len(fdata) - 6
            endloc += 6
            junk = fdata[endloc:]
            # Done: It is not necessary to truncate the string.
            #       Some PDFs just use wrong EOF at the end to confuse parsers.
            #fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {
                '<<': self.readdict,
                '[': self.readarray,
                'endobj': self.empty_obj,
            }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken
            if slow_parsing == True:
                startloc = 0
                source = PdfTokens(fdata, startloc, True)
                private.source = source
                # Calling next() just for complete the structure of source by adding source.current.
                source.next()
                source.all_offsets = []
                source.obj_offsets = {}
                self.slow_parse_xref(source)

                # Done: add slow parsing for multiple trailers.
                trailer_loc = fdata.find('trailer')
                newdict = None
                while trailer_loc >= 0:
                    source.floc = trailer_loc
                    assert source.next() == "trailer"  # trailer
                    tok = source.next()  # <<
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    # Ignored the corrupted trailer.
                    try:
                        tmpdict = self.readdict(source)
                    except:
                        pass
                    else:
                        if not newdict:
                            newdict = tmpdict
                        else:
                            newdict.update(tmpdict)
                    finally:
                        trailer_loc = fdata.find('trailer', trailer_loc + 1)

                if newdict is not None:
                    newdict.Prev = None
                else:
                    source.exception("No trailer.")
            else:
                startloc, source = self.findxref(fdata)
                private.source = source
                xref_table_list = []
                source.all_offsets = []
                while 1:
                    source.obj_offsets = {}
                    # Loop through all the cross-reference tables
                    self.parsexref(source)
                    tok = source.next()
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    newdict = self.readdict(source)

                    token = source.next()
                    if token != 'startxref' and not xref_table_list:
                        source.warning(
                            'Expected "startxref" at end of xref table')

                    # Loop if any previously-written tables.
                    prev = newdict.Prev
                    if prev is None:
                        break
                    if not xref_table_list:
                        newdict.Prev = None
                        original_indirect = self.indirect_objects.copy()
                        original_newdict = newdict
                    source.floc = int(prev)
                    xref_table_list.append(source.obj_offsets)
                    self.indirect_objects.clear()

                if xref_table_list:
                    for update in reversed(xref_table_list):
                        source.obj_offsets.update(update)
                    self.indirect_objects.clear()
                    self.indirect_objects.update(original_indirect)
                    newdict = original_newdict
            self.update(newdict)

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

            # load the trace
            fname_trace = fname + '.trace'
            if os.path.isfile(fname_trace):
                f = open(fname_trace, 'rb')
                private.active_trace = pickle.load(f)
                f.close()