Esempio n. 1
0
def uncompress(mylist,
               leave_raw=False,
               warnings=set(),
               flate=PdfName.FlateDecode,
               decompress=decompressobj,
               isinstance=isinstance,
               list=list,
               len=len):
    ''' Decompress the streams of the FlateDecode-filtered objects in
        mylist, in place.  Returns True if every stream was handled
        without error, False otherwise.

        The mutable default for ``warnings`` is deliberate: the set is
        shared across calls so each distinct warning message is logged
        only once.  The trailing builtin defaults (isinstance/list/len)
        bind the builtins as fast locals for the loop below.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter -- stream is already uncompressed.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            # Only FlateDecode is supported; warn (once per distinct
            # message) and report overall failure.
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
                if isinstance(parms, PdfArray):
                    # Merge an array of parameter dicts into one dict.
                    oldparms = parms
                    parms = PdfDict()
                    for x in oldparms:
                        parms.update(x)
                if parms:
                    predictor = int(parms.Predictor or 1)
                    columns = int(parms.Columns or 1)
                    colors = int(parms.Colors or 1)
                    bpc = int(parms.BitsPerComponent or 8)
                    if 10 <= predictor <= 15:
                        # PNG-style predictors are handled by flate_png.
                        data, error = flate_png(data, predictor, columns,
                                                colors, bpc)
                    elif predictor != 1:
                        error = ('Unsupported flatedecode predictor %s' %
                                 repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    # Non-whitespace bytes left over after decompression
                    # indicate a malformed stream.
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                # Success: drop the filter and store the decoded stream.
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
Esempio n. 2
0
def uncompress(mylist, warnings=set(), flate = PdfName.FlateDecode,
                    decompress=zlib.decompressobj, isinstance=isinstance, list=list, len=len):
    ''' Decompress the streams of the FlateDecode-filtered objects in
        mylist, in place.

        Returns True if every stream object was decompressed (or needed
        no work), False otherwise.

        The mutable default for ``warnings`` is deliberate: the set is
        shared across calls so each distinct warning is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter -- nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
            except Exception as s:  # fixed: legacy "except Exception, s" syntax
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = 'Unconsumed compression data: %s' % repr(dco.unused_data[:20])
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False  # fixed: failures previously left ok unchanged
    return ok  # fixed: ok was computed but never returned
Esempio n. 3
0
 def old_parsexref(self, source, int=int, range=range):
     ''' Parse (one of) the cross-reference file section(s)

         First attempts a strict parse of the classic "xref" table; if
         that fails, falls back to a lenient line-by-line reparse of the
         same byte range.  Discovered offsets are recorded into
         source.obj_offsets and source.all_offsets.
     '''
     fdata = source.fdata
     # Bind hot attribute lookups as locals for the parse loops.
     setdefault = source.obj_offsets.setdefault
     add_offset = source.all_offsets.append
     next = source.next
     tok = next()
     if tok != 'xref':
         source.exception('Expected "xref" keyword')
     start = source.floc
     try:
         # Strict parse: repeated "<startobj> <count>" subsection
         # headers, each followed by <count> entries of the form
         # "<offset> <generation> <n|f>".
         while 1:
             tok = next()
             if tok == 'trailer':
                 return
             startobj = int(tok)
             for objnum in range(startobj, startobj + int(next())):
                 offset = int(next())
                 generation = int(next())
                 inuse = next()
                 if inuse == 'n':
                     if offset != 0:
                         setdefault((objnum, generation), offset)
                         add_offset(offset)
                 elif inuse != 'f':
                     raise ValueError
     except:
         # Strict parse failed; fall through to the heuristic below.
         pass
     try:
         # Table formatted incorrectly.  See if
         # we can figure it out anyway.
         end = source.fdata.rindex('trailer', start)
         table = source.fdata[start:end].splitlines()
         for line in table:
             tokens = line.split()
             if len(tokens) == 2:
                 # Subsection header: "<startobj> <count>".
                 objnum = int(tokens[0])
             elif len(tokens) == 3:
                 # Entry line: "<offset> <generation> <n|f>".
                 offset, generation, inuse = (int(tokens[0]),
                                              int(tokens[1]), tokens[2])
                 if offset != 0 and inuse == 'n':
                     setdefault((objnum, generation), offset)
                     add_offset(offset)
                 objnum += 1
             elif tokens:
                 log.error('Invalid line in xref table: %s' % repr(line))
                 raise ValueError
         log.warning('Badly formatted xref table')
         source.floc = end
         source.next()
     except:
         source.floc = start
         source.exception('Invalid table format')
Esempio n. 4
0
 def parsexref(self, source, int=int, range=range):
     ''' Parse (one of) the cross-reference file section(s)

         First attempts a strict parse of the classic "xref" table; if
         that fails, falls back to a lenient line-by-line reparse of the
         same byte range.  Discovered offsets are recorded into
         source.obj_offsets and source.all_offsets.
     '''
     fdata = source.fdata
     # Bind hot attribute lookups as locals for the parse loops.
     setdefault = source.obj_offsets.setdefault
     add_offset = source.all_offsets.append
     next = source.next
     tok = next()
     if tok != 'xref':
         source.exception('Expected "xref" keyword')
     start = source.floc
     try:
         # Strict parse: repeated "<startobj> <count>" subsection
         # headers, each followed by <count> entries of the form
         # "<offset> <generation> <n|f>".
         while 1:
             tok = next()
             if tok == 'trailer':
                 return
             startobj = int(tok)
             for objnum in range(startobj, startobj + int(next())):
                 offset = int(next())
                 generation = int(next())
                 inuse = next()
                 if inuse == 'n':
                     if offset != 0:
                         setdefault((objnum, generation), offset)
                         add_offset(offset)
                 elif inuse != 'f':
                     raise ValueError
     except Exception:
         # fixed: was a bare "except:" which also swallowed
         # KeyboardInterrupt/SystemExit.  A failed strict parse simply
         # falls through to the heuristic reparse below.
         pass
     try:
         # Table formatted incorrectly.  See if
         # we can figure it out anyway.
         end = source.fdata.rindex('trailer', start)
         table = source.fdata[start:end].splitlines()
         for line in table:
             tokens = line.split()
             if len(tokens) == 2:
                 # Subsection header: "<startobj> <count>".
                 objnum = int(tokens[0])
             elif len(tokens) == 3:
                 # Entry line: "<offset> <generation> <n|f>".
                 offset, generation, inuse = (int(tokens[0]),
                                              int(tokens[1]), tokens[2])
                 if offset != 0 and inuse == 'n':
                     setdefault((objnum, generation), offset)
                     add_offset(offset)
                 objnum += 1
             elif tokens:
                 log.error('Invalid line in xref table: %s' % repr(line))
                 raise ValueError
         log.warning('Badly formatted xref table')
         source.floc = end
         source.next()
     except Exception:
         # fixed: narrowed from bare "except:".
         source.floc = start
         source.exception('Invalid table format')
Esempio n. 5
0
    def loadindirect(self, key):
        ''' Load the indirect object identified by ``key`` (an
            (objnum, gennum) tuple), parse it, cache it, and return it.
            Returns None (after logging a warning) when the object
            cannot be located.
        '''
        result = self.indirect_objects.get(key)
        if not isinstance(result, PdfIndirect):
            # Already parsed and cached -- return the cached value.
            return result
        source = self.source
        offset = int(self.source.obj_offsets.get(key, '0'))
        if not offset:
            log.warning("Did not find PDF object %s" % (key, ))
            return None

        # Read the object header and validate it
        objnum, gennum = key
        source.floc = offset
        objid = source.multiple(3)
        # The header must read exactly "<objnum> <gennum> obj".
        ok = len(objid) == 3
        ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
        ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
        ok = ok and objid[2] == 'obj'
        if not ok:
            # The xref offset was wrong; search the raw file data for a
            # uniquely-occurring object header instead.
            source.floc = offset
            source.next()
            objheader = '%d %d obj' % (objnum, gennum)
            fdata = source.fdata
            offset2 = (fdata.find('\n' + objheader) + 1
                       or fdata.find('\r' + objheader) + 1)
            if (not offset2 or
                    fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
                # Header absent, or present more than once -- give up
                # rather than guess which occurrence is right.
                source.warning("Expected indirect object '%s'" % objheader)
                return None
            source.warning("Indirect object %s found at incorrect "
                           "offset %d (expected offset %d)" %
                           (objheader, offset2, offset))
            source.floc = offset2 + len(objheader)

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        func = self.special.get(obj)
        if func is not None:
            obj = func(source)

        self.indirect_objects[key] = obj
        self.deferred_objects.remove(key)

        # Mark the object as indirect, and
        # add it to the list of streams if it starts a stream
        obj.indirect = key
        tok = source.next()
        if tok != 'endobj':
            self.readstream(obj, self.findstream(obj, tok, source), source)
        return obj
Esempio n. 6
0
    def loadindirect(self, key):
        ''' Fetch the indirect object identified by ``key`` (an
            (objnum, gennum) pair), parsing and caching it on first use.
            Returns None, with a warning, when the object is missing.
        '''
        cached = self.indirect_objects.get(key)
        if not isinstance(cached, PdfIndirect):
            return cached

        source = self.source
        offset = int(self.source.obj_offsets.get(key, '0'))
        if not offset:
            log.warning("Did not find PDF object %s" % (key,))
            return None

        # Validate the "<objnum> <gennum> obj" header at the xref offset.
        objnum, gennum = key
        source.floc = offset
        header = source.multiple(3)
        valid = (len(header) == 3 and
                 header[0].isdigit() and int(header[0]) == objnum and
                 header[1].isdigit() and int(header[1]) == gennum and
                 header[2] == 'obj')
        if not valid:
            # The xref offset was wrong: scan the raw file data for a
            # uniquely-occurring object header instead.
            source.floc = offset
            source.next()
            objheader = '%d %d obj' % (objnum, gennum)
            filedata = source.fdata
            found = (filedata.find('\n' + objheader) + 1 or
                     filedata.find('\r' + objheader) + 1)
            ambiguous = (found and
                         filedata.find(filedata[found - 1] + objheader,
                                       found) > 0)
            if not found or ambiguous:
                source.warning("Expected indirect object '%s'" % objheader)
                return None
            source.warning("Indirect object %s found at incorrect "
                           "offset %d (expected offset %d)" %
                           (objheader, found, offset))
            source.floc = found + len(objheader)

        # Parse the object body, dispatching to a special handler when
        # it opens a compound construct (dict, array, ...).
        obj = source.next()
        handler = self.special.get(obj)
        if handler is not None:
            obj = handler(source)

        self.indirect_objects[key] = obj
        self.deferred_objects.remove(key)

        # Tag the object with its key; consume its stream if one follows.
        obj.indirect = key
        tok = source.next()
        if tok != 'endobj':
            self.readstream(obj, self.findstream(obj, tok, source), source)
        return obj
Esempio n. 7
0
    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object.  Just format it if not.

            Returns either the formatted direct object, or a
            "<num> 0 R" reference string for an indirect one.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            # Direct object: render it inline.
            if objid in visited:
                # NOTE(review): `visited` appears to track object ids
                # seen via visiting()/leaving() below; replicating the
                # object here gives this occurrence a fresh id.
                log.warning('Replicating direct %s object, '
                            'should be indirect for optimal file size' %
                            type(obj))
                obj = type(obj)(obj)
                objid = id(obj)
            visiting(objid)
            result = format_obj(obj)
            leaving(objid)
            return result

        objnum = indirect_dict_get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            swapped = swapobj(objid)
            if swapped is not None:
                # A replacement object exists; reuse its number if it
                # was already assigned one.
                old_id = objid
                obj = swapped
                objid = id(obj)
                objnum = indirect_dict_get(objid)
                if objnum is not None:
                    indirect_dict[old_id] = objnum
                    return '%s 0 R' % objnum
            # Reserve the next object number now; the object body itself
            # is formatted later via the deferred list.
            objnum = len(objlist) + 1
            objlist_append(None)
            indirect_dict[objid] = objnum
            deferred.append((objnum - 1, obj))
        return '%s 0 R' % objnum
Esempio n. 8
0
    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object.  Just format it if not.

            Returns the formatted direct object, or a "<num> 0 R"
            reference string for an indirect one.
        '''
        # Dicts are unhashable, so objects are tracked by id().
        objid = id(obj)

        # A PdfDict carrying a stream is treated as indirect
        # automatically; other objects use their `indirect` attribute.
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if indirect:
            refnum = indirect_dict_get(objid)
            if refnum is None:
                # First sighting: a replacement object may be swapped in,
                # reusing its number when one was already assigned.
                replacement = swapobj(objid)
                if replacement is not None:
                    prior_id = objid
                    obj = replacement
                    objid = id(obj)
                    refnum = indirect_dict_get(objid)
                    if refnum is not None:
                        indirect_dict[prior_id] = refnum
                        return '%s 0 R' % refnum
                # Reserve a slot now; the body is formatted later via
                # the deferred list.
                refnum = len(objlist) + 1
                objlist_append(None)
                indirect_dict[objid] = refnum
                deferred.append((refnum - 1, obj))
            return '%s 0 R' % refnum

        # Direct object: render inline, copying first when this id has
        # already been recorded in `visited`.
        if objid in visited:
            log.warning('Replicating direct %s object, '
                        'should be indirect for optimal file size' %
                        type(obj))
            obj = type(obj)(obj)
            objid = id(obj)
        visiting(objid)
        formatted = format_obj(obj)
        leaving(objid)
        return formatted
Esempio n. 9
0
def uncompress(mylist,
               warnings=set(),
               flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj,
               isinstance=isinstance,
               list=list,
               len=len):
    ''' Decompress the streams of the FlateDecode-filtered objects in
        mylist, in place.

        Returns True when every stream object was processed without
        error, False otherwise.

        The mutable default for ``warnings`` is deliberate: the set is
        shared across calls so each distinct warning is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter -- nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = ('Not decompressing: cannot use filter %s '
                   'with parameters %s' % (repr(ftype), repr(parms)))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
            except Exception as s:  # fixed: legacy "except Exception, s" syntax
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    # Leftover non-whitespace bytes mean a malformed stream.
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False  # fixed: failures previously left ok unchanged
    return ok  # fixed: ok was computed but never returned
Esempio n. 10
0
    def parsexref(self, source, int=int, range=range):
        ''' Parse (one of) the cross-reference file section(s)

            Handles both a cross-reference stream object and the classic
            "xref" table (with a lenient fallback for badly formatted
            tables).  Records offsets into source.obj_offsets /
            source.all_offsets and returns the trailer (or xref-stream)
            dictionary.
        '''
        def _pairs(array):
            # Yield successive (first_objnum, count) pairs from a flat
            # array of string values (the /Index array).
            i = 0
            while 1:
                yield int(array[i]), int(array[i + 1])
                i += 2
                if (i + 1) >= len(array):
                    break

        def convert_to_int(d, size):
            # Interpret up to 8 big-endian bytes as an integer.
            if size > 8:
                source.exception('Invalid size in convert_to_int')
            d = '\x00\x00\x00\x00\x00\x00\x00\x00' + d
            d = d[-8:]
            return struct.unpack('>q', d)[0]

        def read_trailer():
            # The trailer dictionary must start immediately after the
            # 'trailer' keyword.
            tok = next()
            if tok != '<<':
                source.exception('Expected "<<" starting catalog')
            return self.readdict(source)

        # Bind hot attribute lookups as locals for the parse loops.
        setdefault = source.obj_offsets.setdefault
        add_offset = source.all_offsets.append
        next = source.next
        tok = next()
        if tok.isdigit():
            # check for xref stream object
            objid = source.multiple(2)
            ok = len(objid) == 2
            ok = ok and objid[0].isdigit()
            ok = ok and objid[1] == 'obj'
            if ok:
                next()  # start of dict
                obj = self.readdict(source)
                assert obj.Type == '/XRef'
                tok = next()
                end = source.floc + int(obj.Length)
                self.readstream(obj, self.findstream(obj, tok, source), source)
                uncompress([obj])
                # /Index defaults to one subsection covering objects
                # 0..Size; /W gives the byte width of each entry field.
                num_pairs = obj.Index or PdfArray(['0', obj.Size])
                entry_sizes = [int(x) for x in obj.W]
                object_streams = {}
                for num, size in _pairs(num_pairs):
                    cnt = 0
                    stream_offset = 0
                    while cnt < size:
                        # Decode one fixed-width entry, field by field.
                        for i in range(len(entry_sizes)):
                            d = obj.stream[stream_offset:stream_offset +
                                           entry_sizes[i]]
                            stream_offset += entry_sizes[i]
                            di = convert_to_int(d, entry_sizes[i])
                            if i == 0:
                                xref_type = di
                                if xref_type == 0 and entry_sizes[0] == 0:
                                    # Zero-width type field: entries
                                    # default to type 1.
                                    xref_type = 1
                            elif i == 1:
                                if xref_type == 1:
                                    offset = di
                                elif xref_type == 2:
                                    objnum = di
                            elif i == 2:
                                if xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    obstr_idx = di
                        if xref_type == 1 and offset != 0:
                            # Type 1: ordinary in-use object at `offset`.
                            setdefault((num, generation), offset)
                            add_offset(offset)
                        elif xref_type == 2:
                            # Type 2: object stored inside object stream
                            # `objnum` at index `obstr_idx`.
                            if not objnum in object_streams:
                                object_streams[objnum] = []
                            object_streams[objnum].append(obstr_idx)
                        cnt += 1
                        num += 1

                self.load_stream_objects(object_streams)

                source.floc = end
                endit = source.multiple(2)
                if endit != ['endstream', 'endobj']:
                    source.exception('Expected endstream endobj')
                return obj
            else:
                source.exception('Expected xref stream')

        elif tok == 'xref':
            # plain xref table
            start = source.floc
            try:
                # Strict parse: "<startobj> <count>" subsection headers,
                # each followed by "<offset> <generation> <n|f>" entries.
                while 1:
                    tok = next()
                    if tok == 'trailer':
                        return read_trailer()
                    startobj = int(tok)
                    for objnum in range(startobj, startobj + int(next())):
                        offset = int(next())
                        generation = int(next())
                        inuse = next()
                        if inuse == 'n':
                            if offset != 0:
                                setdefault((objnum, generation), offset)
                                add_offset(offset)
                        elif inuse != 'f':
                            raise ValueError
            except:
                # Strict parse failed; fall through to the lenient
                # line-based reparse below.
                pass
            try:
                # Table formatted incorrectly.
                # See if we can figure it out anyway.
                end = source.fdata.rindex('trailer', start)
                table = source.fdata[start:end].splitlines()
                for line in table:
                    tokens = line.split()
                    if len(tokens) == 2:
                        # Subsection header: "<startobj> <count>".
                        objnum = int(tokens[0])
                    elif len(tokens) == 3:
                        # Entry line: "<offset> <generation> <n|f>".
                        offset, generation, inuse = \
                            int(tokens[0]), int(tokens[1]), tokens[2]
                        if offset != 0 and inuse == 'n':
                            setdefault((objnum, generation), offset)
                            add_offset(offset)
                        objnum += 1
                    elif tokens:
                        log.error('Invalid line in xref table: %s' %
                                  repr(line))
                        raise ValueError
                log.warning('Badly formatted xref table')
                source.floc = end
                next()
            except:
                source.floc = start
                source.exception('Invalid table format')

            return read_trailer()
        else:
            source.exception('Expected "xref" keyword or xref stream object')
Esempio n. 11
0
 def warning(self, *arg):
     ''' Format a message via self.msg and emit it as a warning. '''
     message = self.msg(*arg)
     log.warning(message)
Esempio n. 12
0
def uncompress(mylist,
               warnings=set(),
               flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj,
               isinstance=isinstance,
               list=list,
               len=len):
    ''' Decompress the streams of the FlateDecode-filtered objects in
        mylist, in place.

        Returns True when every stream was processed without error,
        False otherwise.

        The mutable default for ``warnings`` is deliberate: the set is
        shared across calls so each distinct warning is logged only once.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # No filter -- nothing to decompress.
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        if ftype != flate or parms is not None:
            msg = 'Not decompressing: cannot use filter %s with parameters %s' % (
                repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
                # NOTE(review): this branch is only reached when parms
                # is None (see the guard above), so the predictor code
                # below is currently unreachable as written -- confirm
                # whether the guard should let DecodeParms through.
                if parms:
                    # try png predictor
                    predictor = int(parms['/Predictor']) or 1
                    # predictor 1 == no predictor
                    if predictor != 1:
                        columns = int(parms['/Columns'])
                        # PNG prediction:
                        if predictor >= 10 and predictor <= 15:
                            output = StringIO()
                            # PNG prediction can vary from row to row
                            rowlen = columns + 1
                            assert len(data) % rowlen == 0
                            prev_rowdata = (0, ) * rowlen
                            for row in xrange(len(data) / rowlen):
                                rowdata = [
                                    ord(x)
                                    for x in data[(row * rowlen):((row + 1) *
                                                                  rowlen)]
                                ]
                                filter_byte = rowdata[0]
                                if filter_byte == 0:
                                    pass
                                elif filter_byte == 1:
                                    # Sub filter: add the byte to the left.
                                    for i in xrange(2, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      rowdata[i - 1]) % 256
                                elif filter_byte == 2:
                                    # Up filter: add the byte above.
                                    for i in xrange(1, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      prev_rowdata[i]) % 256
                                else:
                                    # unsupported PNG filter
                                    raise Exception(
                                        ('Unsupported PNG '
                                         'filter %r') % filter_byte)
                                prev_rowdata = rowdata
                                output.write(''.join(
                                    [chr(x) for x in rowdata[1:]]))
                            data = output.getvalue()
                        else:
                            # unsupported predictor
                            raise Exception(('Unsupported flatedecode'
                                             ' predictor %r') % predictor)

            except Exception as s:  # fixed: legacy "except Exception, s" syntax
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = 'Unconsumed compression data: %s' % repr(
                        dco.unused_data[:20])
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False  # fixed: failures previously left ok unchanged
    return ok  # fixed: ok was computed but never returned
Esempio n. 13
0
    def __init__(self, fname=None, fdata=None, decompress=False, disable_gc=True):
        ''' Parse a PDF file into this reader instance.

            fname -- file path or file-like object with a .read() method
            fdata -- raw PDF data (mutually exclusive with fname)
            decompress -- uncompress all streams after parsing
            disable_gc -- temporarily disable garbage collection while
                          parsing (noticeably faster)

            Raises PdfParseError for unreadable or malformed files.
        '''

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        # Fixed: use a context manager so the file handle
                        # is closed even if read() raises.
                        with open(fname, 'rb') as f:
                            fdata = f.read()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' % fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' % repr(lines[0]))

            # Truncate at the final EOF marker, warning about any
            # non-null trailing junk.
            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                raise PdfParseError('EOF mark not found: %s' % repr(fdata[-20:]))
            endloc += 6
            junk = fdata[endloc:]
            fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            # Token dispatch table for compound/special tokens.
            private.special = {'<<': self.readdict,
                               '[': self.readarray,
                               'endobj': self.empty_obj,
                               }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken

            startloc, source = self.findxref(fdata)
            private.source = source
            xref_table_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}
                # Loop through all the cross-reference tables
                self.parsexref(source)
                tok = source.next()
                if tok != '<<':
                    source.exception('Expected "<<" starting catalog')

                newdict = self.readdict(source)

                token = source.next()
                if token != 'startxref' and not xref_table_list:
                    source.warning('Expected "startxref" at end of xref table')

                # Loop if any previously-written tables.
                prev = newdict.Prev
                if prev is None:
                    break
                if not xref_table_list:
                    # Remember the newest trailer and objects; older
                    # tables only contribute offsets below.
                    newdict.Prev = None
                    original_indirect = self.indirect_objects.copy()
                    original_newdict = newdict
                source.floc = int(prev)
                xref_table_list.append(source.obj_offsets)
                self.indirect_objects.clear()

            if xref_table_list:
                # Merge offsets oldest-first so newer tables win.
                for update in reversed(xref_table_list):
                    source.obj_offsets.update(update)
                self.indirect_objects.clear()
                self.indirect_objects.update(original_indirect)
                newdict = original_newdict
            self.update(newdict)

            #self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            # Always restore GC, even when parsing fails.
            if disable_gc:
                gc.enable()
Esempio n. 14
0
    def __init__(self, fname=None, fdata=None, decompress=False,
                 disable_gc=True, slow_parsing=True):
        """Parse a PDF document from a path, stream, or data string.

        fname -- path of a PDF file, or an object with a .read() method
        fdata -- raw PDF data (mutually exclusive with fname)
        decompress -- uncompress all stream objects after parsing
        disable_gc -- disable garbage collection while parsing (faster)
        slow_parsing -- scan the whole file for objects rather than
                        trusting the xref table (tolerates corruption)
        """

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                log.error('EOF mark not found: %s' %
                          repr(fdata[-20:]))
                endloc = len(fdata) - 6
            endloc += 6
            junk = fdata[endloc:]
            # Deliberately NOT truncating fdata: some PDFs carry a bogus
            # early EOF marker to confuse parsers.
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {'<<': self.readdict,
                               '[': self.readarray,
                               'endobj': self.empty_obj,
                               }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken
            if slow_parsing:
                startloc = 0
                source = PdfTokens(fdata, startloc, True)
                private.source = source
                # Prime source.current so that later seeks behave.
                source.next()
                source.all_offsets = []
                source.obj_offsets = {}
                self.slow_parse_xref(source)

                # Merge every trailer found in the file; later trailers
                # override keys from earlier ones.
                trailer_loc = fdata.find('trailer')
                newdict = None
                while trailer_loc >= 0:
                    source.floc = trailer_loc
                    assert source.next() == "trailer"  # trailer
                    tok = source.next()  # <<
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    # Skip corrupted trailers instead of aborting.
                    # (Was a bare "except:", which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    try:
                        tmpdict = self.readdict(source)
                    except Exception:
                        pass
                    else:
                        if not newdict:
                            newdict = tmpdict
                        else:
                            newdict.update(tmpdict)
                    finally:
                        trailer_loc = fdata.find('trailer', trailer_loc + 1)

                if newdict is not None:
                    newdict.Prev = None
                else:
                    source.exception("No trailer.")
            else:
                startloc, source = self.findxref(fdata)
                private.source = source
                xref_table_list = []
                source.all_offsets = []
                while 1:
                    source.obj_offsets = {}
                    # Loop through all the cross-reference tables
                    self.parsexref(source)
                    tok = source.next()
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    newdict = self.readdict(source)

                    token = source.next()
                    if token != 'startxref' and not xref_table_list:
                        source.warning('Expected "startxref" at end of xref table')

                    # Loop if any previously-written tables.
                    prev = newdict.Prev
                    if prev is None:
                        break
                    if not xref_table_list:
                        newdict.Prev = None
                        original_indirect = self.indirect_objects.copy()
                        original_newdict = newdict
                    source.floc = int(prev)
                    xref_table_list.append(source.obj_offsets)
                    self.indirect_objects.clear()

                if xref_table_list:
                    # Replay older tables first so the newest offsets win.
                    for update in reversed(xref_table_list):
                        source.obj_offsets.update(update)
                    self.indirect_objects.clear()
                    self.indirect_objects.update(original_indirect)
                    newdict = original_newdict
            self.update(newdict)

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

            # Load a pickled trace alongside the PDF, if one exists.
            # Guarded: fname may be None or a stream object; the old
            # unconditional "fname + '.trace'" raised TypeError here and
            # masked any genuine parse error.
            # NOTE: pickle.load can execute arbitrary code; only load
            # trace files from trusted locations.
            if fname is not None and not hasattr(fname, 'read'):
                fname_trace = fname + '.trace'
                if os.path.isfile(fname_trace):
                    f = open(fname_trace, 'rb')
                    try:
                        self.private.active_trace = pickle.load(f)
                    finally:
                        f.close()
Esempio n. 15
0
    def __init__(self, fname=None, fdata=None, decompress=False,
                 disable_gc=True):
        """Parse a PDF document into this mapping.

        fname -- path of a PDF file, or an object with a .read() method
        fdata -- raw PDF data (mutually exclusive with fname)
        decompress -- if true, uncompress all stream objects on load
        disable_gc -- temporarily disable garbage collection while
                      parsing (runs significantly faster)
        """

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                        repr(lines[0]))

            # PDF version taken from the '%PDF-x.y' header.
            self.version = fdata[5:8]

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                raise PdfParseError('EOF mark not found: %s' %
                    repr(fdata[-20:]))
            # Advance past the EOF marker (and, presumably, its line
            # ending) before truncating any trailing junk.
            endloc += 6
            junk = fdata[endloc:]
            fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            # Token dispatch table: container openers and terminators.
            private.special = {'<<': self.readdict,
                               '[': self.readarray,
                               'endobj': self.empty_obj,
                               }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken

            startloc, source = self.findxref(fdata)
            private.source = source
            xref_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}

                # Loop through all the cross-reference tables/streams
                trailer = self.parsexref(source)

                # Loop if any previously-written xrefs.
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref':
                        source.warning('Expected "startxref" at end of xref table')
                    break
                if not xref_list:
                    trailer.Prev = None
                    original_trailer = trailer
                source.floc = int(prev)
                xref_list.append(source.obj_offsets)

            # Replay older xref sections first so that offsets from the
            # newest section take precedence.
            if xref_list:
                for update in reversed(xref_list):
                    source.obj_offsets.update(update)
                trailer.update(original_trailer)

            # The trailer may declare a newer PDF version than the header.
            if trailer.Version and \
                    float(trailer.Version) > float(self.version):
                self.version = trailer.Version

            # Keep only the standard document-level trailer keys.
            trailer = PdfDict(
                Root=trailer.Root,
                Info=trailer.Info,
                ID=trailer.ID
                # TODO: add Encrypt when implemented
            )
            self.update(trailer)

            #self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()
        finally:
            if disable_gc:
                gc.enable()
Esempio n. 16
0
    def __init__(self,
                 fname=None,
                 fdata=None,
                 decompress=False,
                 decrypt=False,
                 password='',
                 disable_gc=True,
                 slow_parsing=True):
        """Parse a PDF document, optionally decrypting and uncompressing.

        fname -- path of a PDF file, or an object with a .read() method
        fdata -- raw PDF data (mutually exclusive with fname)
        decompress -- uncompress all stream objects after parsing
        decrypt -- decrypt the document (requires PyCrypto)
        password -- password used when decrypt is true
        disable_gc -- disable garbage collection while parsing (faster)
        slow_parsing -- scan the whole file for objects rather than
                        trusting the xref table (tolerates corruption)
        """

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            private = self.private
            # Bug fix: record the header version.  The trailer.Version
            # comparison further down reads self.version, which was never
            # set in the original and raised AttributeError.
            private.version = fdata[5:8]

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                log.error('EOF mark not found: %s' % repr(fdata[-20:]))
                endloc = len(fdata) - 6
            endloc += 6
            junk = fdata[endloc:]
            # Deliberately NOT truncating fdata: some PDFs carry a bogus
            # early EOF marker to confuse parsers.
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {
                '<<': self.readdict,
                '[': self.readarray,
                'endobj': self.empty_obj,
            }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken
            if slow_parsing:
                startloc = 0
                source = PdfTokens(fdata, startloc, True)
                private.source = source
                # Prime source.current so that later seeks behave.
                source.next()
                source.all_offsets = []
                source.obj_offsets = {}
                self.slow_parse_xref(source)

                # Merge every trailer found in the file; later trailers
                # override keys from earlier ones.
                trailer_loc = fdata.find('trailer')
                newdict = None
                while trailer_loc >= 0:
                    source.floc = trailer_loc
                    assert source.next() == "trailer"  # trailer
                    tok = source.next()  # <<
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    # Skip corrupted trailers instead of aborting.
                    # (Was a bare "except:", which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    try:
                        tmpdict = self.readdict(source)
                    except Exception:
                        pass
                    else:
                        if not newdict:
                            newdict = tmpdict
                        else:
                            newdict.update(tmpdict)
                    finally:
                        trailer_loc = fdata.find('trailer', trailer_loc + 1)

                if newdict is not None:
                    newdict.Prev = None
                else:
                    source.exception("No trailer.")

                # the name in slowparsing is newdict
                self.update(newdict)
            else:
                startloc, source = self.findxref(fdata)
                private.source = source

                # Find all the xref tables/streams, and
                # then deal with them backwards.
                xref_list = []
                while 1:
                    source.obj_offsets = {}
                    trailer, is_stream = self.parsexref(source)
                    prev = trailer.Prev
                    if prev is None:
                        token = source.next()
                        if token != 'startxref' and not xref_list:
                            source.warning('Expected "startxref" '
                                           'at end of xref table')
                        break
                    xref_list.append((source.obj_offsets, trailer, is_stream))
                    source.floc = int(prev)

                # Handle document encryption
                private.crypt_filters = None
                if decrypt and PdfName.Encrypt in trailer:
                    identity_filter = crypt.IdentityCryptFilter()
                    crypt_filters = {PdfName.Identity: identity_filter}
                    private.crypt_filters = crypt_filters
                    private.stream_crypt_filter = identity_filter
                    private.string_crypt_filter = identity_filter

                    if not crypt.HAS_CRYPTO:
                        raise PdfParseError(
                            'Install PyCrypto to enable encryption support')

                    self._parse_encrypt_info(source, password, trailer)

                if is_stream:
                    self.load_stream_objects(trailer.object_streams)

                # Replay older xref sections so the newest offsets win.
                while xref_list:
                    later_offsets, later_trailer, is_stream = xref_list.pop()
                    source.obj_offsets.update(later_offsets)
                    if is_stream:
                        trailer.update(later_trailer)
                        self.load_stream_objects(later_trailer.object_streams)
                    else:
                        trailer = later_trailer

                trailer.Prev = None

                # The trailer may declare a newer version than the header.
                if (trailer.Version
                        and float(trailer.Version) > float(self.version)):
                    self.private.version = trailer.Version

                if decrypt:
                    self.decrypt_all()
                    trailer.Encrypt = None

                # Xref streams are regular objects, so keep only the
                # document-level keys instead of adopting the whole dict.
                if is_stream:
                    self.Root = trailer.Root
                    self.Info = trailer.Info
                    self.ID = trailer.ID
                    self.Size = trailer.Size
                    self.Encrypt = trailer.Encrypt
                else:
                    self.update(trailer)

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

            # Load a pickled trace alongside the PDF, if one exists.
            # Guarded: fname may be None or a stream object; the old
            # unconditional "fname + '.trace'" raised TypeError here and
            # masked any genuine parse error.
            # NOTE: pickle.load can execute arbitrary code; only load
            # trace files from trusted locations.
            if fname is not None and not hasattr(fname, 'read'):
                fname_trace = fname + '.trace'
                if os.path.isfile(fname_trace):
                    f = open(fname_trace, 'rb')
                    try:
                        self.private.active_trace = pickle.load(f)
                    finally:
                        f.close()
Esempio n. 17
0
 def warning(self, *arg):
     """Format *arg via self.msg() and emit the result at WARNING level."""
     text = self.msg(*arg)
     log.warning(text)
Esempio n. 18
0
    def __init__(self,
                 fname=None,
                 fdata=None,
                 decompress=False,
                 disable_gc=True,
                 slow_parsing=True):
        """Parse a PDF document from a path, stream, or data string.

        fname -- path of a PDF file, or an object with a .read() method
        fdata -- raw PDF data (mutually exclusive with fname)
        decompress -- uncompress all stream objects after parsing
        disable_gc -- disable garbage collection while parsing (faster)
        slow_parsing -- scan the whole file for objects rather than
                        trusting the xref table (tolerates corruption)
        """

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                log.error('EOF mark not found: %s' % repr(fdata[-20:]))
                endloc = len(fdata) - 6
            endloc += 6
            junk = fdata[endloc:]
            # Deliberately NOT truncating fdata: some PDFs carry a bogus
            # early EOF marker to confuse parsers.
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {
                '<<': self.readdict,
                '[': self.readarray,
                'endobj': self.empty_obj,
            }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken
            if slow_parsing:
                startloc = 0
                source = PdfTokens(fdata, startloc, True)
                private.source = source
                # Prime source.current so that later seeks behave.
                source.next()
                source.all_offsets = []
                source.obj_offsets = {}
                self.slow_parse_xref(source)

                # Merge every trailer found in the file; later trailers
                # override keys from earlier ones.
                trailer_loc = fdata.find('trailer')
                newdict = None
                while trailer_loc >= 0:
                    source.floc = trailer_loc
                    assert source.next() == "trailer"  # trailer
                    tok = source.next()  # <<
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    # Skip corrupted trailers instead of aborting.
                    # (Was a bare "except:", which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    try:
                        tmpdict = self.readdict(source)
                    except Exception:
                        pass
                    else:
                        if not newdict:
                            newdict = tmpdict
                        else:
                            newdict.update(tmpdict)
                    finally:
                        trailer_loc = fdata.find('trailer', trailer_loc + 1)

                if newdict is not None:
                    newdict.Prev = None
                else:
                    source.exception("No trailer.")
            else:
                startloc, source = self.findxref(fdata)
                private.source = source
                xref_table_list = []
                source.all_offsets = []
                while 1:
                    source.obj_offsets = {}
                    # Loop through all the cross-reference tables
                    self.parsexref(source)
                    tok = source.next()
                    if tok != '<<':
                        source.exception('Expected "<<" starting catalog')

                    newdict = self.readdict(source)

                    token = source.next()
                    if token != 'startxref' and not xref_table_list:
                        source.warning(
                            'Expected "startxref" at end of xref table')

                    # Loop if any previously-written tables.
                    prev = newdict.Prev
                    if prev is None:
                        break
                    if not xref_table_list:
                        newdict.Prev = None
                        original_indirect = self.indirect_objects.copy()
                        original_newdict = newdict
                    source.floc = int(prev)
                    xref_table_list.append(source.obj_offsets)
                    self.indirect_objects.clear()

                if xref_table_list:
                    # Replay older tables first so the newest offsets win.
                    for update in reversed(xref_table_list):
                        source.obj_offsets.update(update)
                    self.indirect_objects.clear()
                    self.indirect_objects.update(original_indirect)
                    newdict = original_newdict
            self.update(newdict)

            # self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()

            # For compatibility with pyPdf
            private.numPages = len(self.pages)
        finally:
            if disable_gc:
                gc.enable()

            # Load a pickled trace alongside the PDF, if one exists.
            # Guarded: fname may be None or a stream object; the old
            # unconditional "fname + '.trace'" raised TypeError here and
            # masked any genuine parse error.
            # NOTE: pickle.load can execute arbitrary code; only load
            # trace files from trusted locations.
            if fname is not None and not hasattr(fname, 'read'):
                fname_trace = fname + '.trace'
                if os.path.isfile(fname_trace):
                    f = open(fname_trace, 'rb')
                    try:
                        self.private.active_trace = pickle.load(f)
                    finally:
                        f.close()
Esempio n. 19
0
def uncompress(mylist, warnings=set(), flate=PdfName.FlateDecode,
               decompress=zlib.decompressobj, isinstance=isinstance, list=list, len=len):
    """Uncompress FlateDecode stream objects in mylist, in place.

    On success an object's stream is replaced by the decompressed data
    and its Filter is cleared; failures are logged and the object is
    left untouched.  The shared ``warnings`` set de-duplicates repeated
    warning messages across calls.  Returns True if every stream object
    was handled without error.
    """
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms
        # Bug fix: the old test ("or parms is not None") also rejected
        # any flate stream carrying DecodeParms, which made the PNG
        # predictor handling below unreachable dead code.
        if ftype != flate:
            msg = 'Not decompressing: cannot use filter %s with parameters %s' % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            error = None
            try:
                data = dco.decompress(obj.stream)
                if parms:
                    # try png predictor
                    predictor = int(parms['/Predictor']) or 1
                    # predictor 1 == no predictor
                    if predictor != 1:
                        columns = int(parms['/Columns'])
                        # PNG prediction:
                        if predictor >= 10 and predictor <= 15:
                            output = StringIO()
                            # PNG prediction can vary from row to row;
                            # each row is prefixed with one filter byte.
                            rowlen = columns + 1
                            assert len(data) % rowlen == 0
                            prev_rowdata = (0,) * rowlen
                            for row in xrange(len(data) / rowlen):
                                rowdata = [ord(x) for x in
                                    data[(row * rowlen):((row + 1) * rowlen)]]
                                filter_byte = rowdata[0]
                                if filter_byte == 0:
                                    # None filter: row stored verbatim.
                                    pass
                                elif filter_byte == 1:
                                    # Sub filter: add the byte to the left.
                                    for i in xrange(2, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      rowdata[i - 1]) % 256
                                elif filter_byte == 2:
                                    # Up filter: add the byte above.
                                    for i in xrange(1, rowlen):
                                        rowdata[i] = (rowdata[i] +
                                                      prev_rowdata[i]) % 256
                                else:
                                    # unsupported PNG filter
                                    raise Exception(('Unsupported PNG '
                                                    'filter %r') % filter_byte)
                                prev_rowdata = rowdata
                                output.write(''.join([chr(x) for x in
                                                      rowdata[1:]]))
                            data = output.getvalue()
                        else:
                            # unsupported predictor
                            raise Exception(('Unsupported flatedecode'
                                            ' predictor %r') % predictor)

            except Exception as s:
                error = str(s)
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = 'Unconsumed compression data: %s' % repr(
                        dco.unused_data[:20])
            if error is None:
                obj.Filter = None
                obj.stream = data
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
    # Bug fix: ok was computed but never returned.
    return ok
Esempio n. 20
0
    def __init__(self,
                 fname=None,
                 fdata=None,
                 decompress=False,
                 disable_gc=True):
        """Parse a PDF document into this mapping.

        fname -- path of a PDF file, or an object with a .read() method
        fdata -- raw PDF data (mutually exclusive with fname)
        decompress -- if true, uncompress all stream objects on load
        disable_gc -- temporarily disable garbage collection while
                      parsing (runs significantly faster)
        """

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            # PDF version taken from the '%PDF-x.y' header.
            self.version = fdata[5:8]

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                raise PdfParseError('EOF mark not found: %s' %
                                    repr(fdata[-20:]))
            # Advance past the EOF marker (and, presumably, its line
            # ending) before truncating any trailing junk.
            endloc += 6
            junk = fdata[endloc:]
            fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            # Token dispatch table: container openers and terminators.
            private.special = {
                '<<': self.readdict,
                '[': self.readarray,
                'endobj': self.empty_obj,
            }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken

            startloc, source = self.findxref(fdata)
            private.source = source
            xref_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}

                # Loop through all the cross-reference tables/streams
                trailer = self.parsexref(source)

                # Loop if any previously-written xrefs.
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref':
                        source.warning(
                            'Expected "startxref" at end of xref table')
                    break
                if not xref_list:
                    trailer.Prev = None
                    original_trailer = trailer
                source.floc = int(prev)
                xref_list.append(source.obj_offsets)

            # Replay older xref sections first so that offsets from the
            # newest section take precedence.
            if xref_list:
                for update in reversed(xref_list):
                    source.obj_offsets.update(update)
                trailer.update(original_trailer)

            # The trailer may declare a newer PDF version than the header.
            if trailer.Version and \
                    float(trailer.Version) > float(self.version):
                self.version = trailer.Version

            # Keep only the standard document-level trailer keys.
            trailer = PdfDict(Root=trailer.Root,
                              Info=trailer.Info,
                              ID=trailer.ID
                              # TODO: add Encrypt when implemented
                              )
            self.update(trailer)

            #self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()
        finally:
            if disable_gc:
                gc.enable()
Esempio n. 21
0
    def parsexref(self, source, int=int, range=range):
        ''' Parse (one of) the cross-reference file section(s).

            Handles both the classic ASCII "xref" table and the
            PDF 1.5+ cross-reference stream (/Type /XRef).  Every
            in-use object found is registered in source.obj_offsets;
            objects living inside object streams are collected and
            loaded via self.load_stream_objects().

            Returns the trailer dictionary (for an xref stream, the
            stream dictionary itself doubles as the trailer).
        '''

        def _pairs(array):
            # Yield (first_objnum, count) pairs from a flat
            # /Index-style array: [start1, count1, start2, count2, ...]
            i = 0
            while 1:
                yield int(array[i]), int(array[i + 1])
                i += 2
                if (i + 1) >= len(array):
                    break

        def convert_to_int(d, size):
            # Interpret up to 8 raw big-endian bytes as an integer.
            # A zero-width field (empty slice) decodes to 0.
            if size > 8:
                source.exception('Invalid size in convert_to_int')
            if not isinstance(d, bytes):
                # Stream data may be held as str; Latin-1 maps every
                # code point to the identical byte value, so encoding is
                # lossless.  struct.unpack requires bytes on Python 3 --
                # padding with a str literal would raise TypeError.
                d = d.encode('Latin-1')
            d = b'\x00\x00\x00\x00\x00\x00\x00\x00' + d
            d = d[-8:]
            return struct.unpack('>q', d)[0]

        def read_trailer():
            # The trailer is an ordinary PDF dictionary following the
            # "trailer" keyword.
            tok = next()
            if tok != '<<':
                source.exception('Expected "<<" starting catalog')
            return self.readdict(source)

        setdefault = source.obj_offsets.setdefault
        add_offset = source.all_offsets.append
        next = source.next
        tok = next()
        if tok.isdigit():
            # Possible cross-reference stream: "<num> <gen> obj"
            objid = source.multiple(2)
            ok = len(objid) == 2
            ok = ok and objid[0].isdigit()
            ok = ok and objid[1] == 'obj'
            if ok:
                next()  # start of dict
                obj = self.readdict(source)
                # NOTE(review): assert vanishes under python -O; kept
                # as-is so callers relying on AssertionError still work.
                assert obj.Type == '/XRef'
                tok = next()
                end = source.floc + int(obj.Length)
                self.readstream(obj, self.findstream(obj, tok, source), source)
                uncompress([obj])
                # Per the PDF spec, /Index defaults to one subsection
                # covering objects 0 .. Size-1.
                num_pairs = obj.Index or PdfArray(['0', obj.Size])
                entry_sizes = [int(x) for x in obj.W]
                object_streams = {}
                for num, size in _pairs(num_pairs):
                    cnt = 0
                    stream_offset = 0
                    while cnt < size:
                        # Each entry is the /W fields laid end to end;
                        # field 0 is the entry type and defaults to 1
                        # when its declared width is 0.
                        for i in range(len(entry_sizes)):
                            d = obj.stream[stream_offset:stream_offset +
                                                         entry_sizes[i]]
                            stream_offset += entry_sizes[i]
                            di = convert_to_int(d, entry_sizes[i])
                            if i == 0:
                                xref_type = di
                                if xref_type == 0 and entry_sizes[0] == 0:
                                    xref_type = 1
                            elif i == 1:
                                if xref_type == 1:
                                    # Type 1: byte offset of the object
                                    offset = di
                                elif xref_type == 2:
                                    # Type 2: number of containing
                                    # object stream
                                    objnum = di
                            elif i == 2:
                                if xref_type == 1:
                                    generation = di
                                elif xref_type == 2:
                                    # Index within the object stream
                                    obstr_idx = di
                        if xref_type == 1 and offset != 0:
                            setdefault((num, generation), offset)
                            add_offset(offset)
                        elif xref_type == 2:
                            object_streams.setdefault(objnum, []).append(
                                obstr_idx)
                        cnt += 1
                        num += 1

                self.load_stream_objects(object_streams)

                # Jump past the (already decoded) stream payload.
                source.floc = end
                endit = source.multiple(2)
                if endit != ['endstream', 'endobj']:
                    source.exception('Expected endstream endobj')
                return obj
            else:
                source.exception('Expected xref stream')

        elif tok == 'xref':
            # Classic xref table: subsection headers "start count"
            # followed by "offset generation n|f" entries.
            start = source.floc
            try:
                while 1:
                    tok = next()
                    if tok == 'trailer':
                        return read_trailer()
                    startobj = int(tok)
                    for objnum in range(startobj, startobj + int(next())):
                        offset = int(next())
                        generation = int(next())
                        inuse = next()
                        if inuse == 'n':
                            if offset != 0:
                                setdefault((objnum, generation), offset)
                                add_offset(offset)
                        elif inuse != 'f':
                            raise ValueError
            except Exception:
                # Deliberate best-effort: malformed tables fall through
                # to the lenient line-by-line recovery parse below.
                pass
            try:
                # Table formatted incorrectly.
                # See if we can figure it out anyway.
                end = source.fdata.rindex('trailer', start)
                table = source.fdata[start:end].splitlines()
                for line in table:
                    tokens = line.split()
                    if len(tokens) == 2:
                        # Subsection header: remember starting objnum.
                        objnum = int(tokens[0])
                    elif len(tokens) == 3:
                        offset, generation, inuse = \
                            int(tokens[0]), int(tokens[1]), tokens[2]
                        if offset != 0 and inuse == 'n':
                            setdefault((objnum, generation), offset)
                            add_offset(offset)
                        objnum += 1
                    elif tokens:
                        log.error('Invalid line in xref table: %s' %
                                  repr(line))
                        raise ValueError
                log.warning('Badly formatted xref table')
                source.floc = end
                next()
            except Exception:
                source.floc = start
                source.exception('Invalid table format')

            return read_trailer()
        else:
            source.exception('Expected "xref" keyword or xref stream object')