Example #1
0
 def _getobj_parse(self, pos, objid):
     self._parser.seek(pos)
     (_, objid1) = self._parser.nexttoken()  # objid
     if objid1 != objid:
         raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
     (_, genno) = self._parser.nexttoken()  # genno
     (_, kwd) = self._parser.nexttoken()
     if kwd is not self.KEYWORD_OBJ:
         raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
     (_, obj) = self._parser.nextobject()
     return obj
Example #2
0
 def __init__(self, parser, password='', caching=True, fallback=True):
     """Set the document to use a given PDFParser object.

     parser   -- parser to read from; it is registered with this document
                 through ``parser.set_document(self)``.
     password -- passed to ``_initialize_password`` when a trailer has an
                 'Encrypt' entry.
     caching  -- stored as ``self.caching``.
     fallback -- when true, a PDFXRefFallback table is loaded and appended
                 to ``self.xrefs``; forced to true if the regular xref
                 lookup raises PDFNoValidXRef.

     Raises PDFSyntaxError if no trailer supplies a 'Root' entry or, when
     STRICT is set, if the catalog's 'Type' is not LITERAL_CATALOG.
     """
     self.caching = caching
     self.xrefs = []
     self.info = []
     self.catalog = None
     self.encryption = None
     self.decipher = None
     self._cached_objs = {}
     self._parsed_objs = {}
     self._parser = parser
     parser.set_document(self)
     self.is_printable = True
     self.is_modifiable = True
     self.is_extractable = True
     # Retrieve the information of each header that was appended
     # (maybe multiple times) at the end of the document.
     try:
         start = self.find_xref(parser)
         self.read_xref_from(parser, start, self.xrefs)
     except PDFNoValidXRef:
         fallback = True
     if fallback:
         parser.fallback = True
         fallback_xref = PDFXRefFallback()
         fallback_xref.load(parser)
         self.xrefs.append(fallback_xref)
     for table in self.xrefs:
         trailer = table.get_trailer()
         if not trailer:
             continue
         # If there's an encryption info, remember it.
         if 'Encrypt' in trailer:
             doc_id = list_value(trailer['ID'])
             encrypt_dict = dict_value(trailer['Encrypt'])
             self.encryption = (doc_id, encrypt_dict)
             self._initialize_password(password)
         if 'Info' in trailer:
             self.info.append(dict_value(trailer['Info']))
         if 'Root' in trailer:
             # Every PDF file must have exactly one /Root dictionary, so
             # the first trailer providing it ends the search.
             self.catalog = dict_value(trailer['Root'])
             break
     else:
         # Reached only when no trailer contained 'Root'.
         raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     if self.catalog.get('Type') is not LITERAL_CATALOG and STRICT:
         raise PDFSyntaxError('Catalog not found!')
Example #3
0
 def _get_objects(self, stream):
     """Parse all objects held in *stream*.

     Returns a tuple ``(objs, n)``: *objs* is the list of objects read from
     the stream's data until the parser raises PSEOF, and *n* is the
     stream's 'N' entry (0 when that entry is missing).

     Raises PDFSyntaxError, only when STRICT is set, if the stream's
     'Type' is not LITERAL_OBJSTM or if 'N' is not defined.
     """
     if stream.get('Type') is not LITERAL_OBJSTM and STRICT:
         raise PDFSyntaxError('Not a stream object: %r' % stream)
     try:
         n = stream['N']
     except KeyError:
         if STRICT:
             raise PDFSyntaxError('N is not defined: %r' % stream)
         n = 0
     stream_parser = PDFStreamParser(stream.get_data())
     stream_parser.set_document(self)
     objs = []
     try:
         # Runs until the stream data is exhausted (signalled by PSEOF).
         while True:
             (_, item) = stream_parser.nextobject()
             objs.append(item)
     except PSEOF:
         pass
     return (objs, n)
Example #4
0
 def _getobj_objstm(self, stream, index, objid):
     if stream.objid in self._parsed_objs:
         (objs, n) = self._parsed_objs[stream.objid]
     else:
         (objs, n) = self._get_objects(stream)
         if self.caching:
             self._parsed_objs[stream.objid] = (objs, n)
     i = n * 2 + index
     try:
         obj = objs[i]
     except IndexError:
         raise PDFSyntaxError('index too big: %r' % index)
     return obj
Example #5
0
 def load(self, parser, debug=0):
     """Build ``self.offsets`` by scanning the data line by line.

     Scanning starts at position 0.  Every line matching ``PDFOBJ_CUE`` is
     recorded as ``offsets[objid] = (None, pos, genno)``.  If the object at
     that position is a PDFStream whose 'Type' is LITERAL_OBJSTM, the
     objects listed in it are recorded as
     ``offsets[objid1] = (objid, index, 0)``.  Scanning ends at the first
     line starting with 'trailer', which is handed to ``load_trailer``, or
     when the parser raises PSEOF.

     parser -- parser providing ``seek``, ``nextline`` and ``nextobject``.
     debug  -- when >= 1, the loaded trailer is printed to stderr.

     Raises PDFSyntaxError if STRICT is set and an object stream has no
     'N' entry.
     """
     parser.seek(0)
     while True:
         try:
             (line_pos, text) = parser.nextline()
         except PSEOF:
             break
         if text.startswith('trailer'):
             parser.seek(line_pos)
             self.load_trailer(parser)
             if 1 <= debug:
                 print >> sys.stderr, 'trailer: %r' % self.get_trailer()
             break
         cue = self.PDFOBJ_CUE.match(text)
         if not cue:
             continue
         (objid_text, genno_text) = cue.groups()
         objid = int(objid_text)
         genno = int(genno_text)
         self.offsets[objid] = (None, line_pos, genno)
         # Go back and read the object itself so that an object stream
         # can be expanded into its members.
         parser.seek(line_pos)
         (_, candidate) = parser.nextobject()
         if not (isinstance(candidate, PDFStream)
                 and candidate.get('Type') is LITERAL_OBJSTM):
             continue
         stream = stream_value(candidate)
         try:
             n = stream['N']
         except KeyError:
             if STRICT:
                 raise PDFSyntaxError('N is not defined: %r' % stream)
             n = 0
         inner_parser = PDFStreamParser(stream.get_data())
         objs = []
         try:
             while True:
                 (_, member) = inner_parser.nextobject()
                 objs.append(member)
         except PSEOF:
             pass
         # Never look further than the pairs that were actually parsed.
         n = min(n, len(objs) // 2)
         for index in xrange(n):
             member_id = objs[index * 2]
             self.offsets[member_id] = (objid, index, 0)
Example #6
0
 def load(self, parser, debug=0):
     """Load a cross-reference stream from the parser's current position.

     Three tokens (object id, generation number and a keyword) are read
     and ignored; the object that follows must be a PDFStream whose 'Type'
     is LITERAL_XREF.  From it this method fills ``self.ranges`` (pairs
     chopped from 'Index', defaulting to ``(0, Size)``), the three field
     widths ``fl1``/``fl2``/``fl3`` from 'W', ``entlen`` (their sum),
     ``data`` and ``trailer`` (the stream's attrs).

     parser -- parser expected to be positioned at the xref stream object.
     debug  -- when >= 1, a summary is printed to stderr.

     Raises PDFNoValidXRef if the object is not an xref stream, including
     a stream that has no 'Type' entry at all, and PDFSyntaxError if the
     index array holds an odd number of elements.
     """
     # The object header is not needed, but it has to be consumed.
     parser.nexttoken()  # objid
     parser.nexttoken()  # genno
     parser.nexttoken()  # keyword
     (_, stream) = parser.nextobject()
     # Look 'Type' up with .get(): a stream without that entry must be
     # reported as PDFNoValidXRef (which callers catch to fall back)
     # instead of escaping as a KeyError.
     if (not isinstance(stream, PDFStream)
             or stream.get('Type') is not LITERAL_XREF):
         raise PDFNoValidXRef('Invalid PDF stream spec.')
     size = stream['Size']
     index_array = stream.get('Index', (0, size))
     if len(index_array) % 2 != 0:
         raise PDFSyntaxError('Invalid index number')
     self.ranges.extend(choplist(2, index_array))
     (self.fl1, self.fl2, self.fl3) = stream['W']
     self.data = stream.get_data()
     self.entlen = self.fl1 + self.fl2 + self.fl3
     self.trailer = stream.attrs
     if 1 <= debug:
         print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
                              (', '.join(map(repr, self.ranges)),
                               self.fl1, self.fl2, self.fl3))