Example #1
0
 def init_params(self):
     """Copy the encryption parameters from ``self.param`` onto the instance.

     ``V`` defaults to 0 and ``Length`` to 40 when absent; ``R``, ``P``,
     ``O`` and ``U`` are read by subscript, so they must be present.
     Integers go through ``int_value`` and strings through ``str_value``.
     """
     params = self.param
     self.v = int_value(params.get('V', 0))
     self.r = int_value(params['R'])
     self.p = int_value(params['P'])
     self.o = str_value(params['O'])
     self.u = str_value(params['U'])
     self.length = int_value(params.get('Length', 40))
Example #2
0
 def read_xref_from(self, start, xrefs, _visited=None):
     """Reads XRefs from the given location.

     Loads the cross-reference section found at offset ``start``, appends
     it to ``xrefs`` and then follows the ``XRefStm`` and ``Prev`` entries
     of its trailer recursively.

     start: offset handed to ``self.seek`` before reading the first token.
     xrefs: list that receives every loaded xref object, in load order.
     _visited: internal set of offsets already loaded during this walk;
         callers should leave it unset.

     Raises PDFNoValidXRef when no token can be read at ``start``.
     """
     if _visited is None:
         _visited = set()
     if start in _visited:
         # A trailer that points back at a section already loaded would
         # otherwise make this method recurse without end.
         return
     _visited.add(start)
     self.seek(start)
     self.reset()
     try:
         (pos, token) = self.nexttoken()
     except PSEOF:
         raise PDFNoValidXRef('Unexpected EOF')
     if 2 <= self.debug:
         print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
     if isinstance(token, int):
         # XRefStream: PDF-1.5
         # Rewind to the token so the stream loader sees the whole object.
         self.seek(pos)
         self.reset()
         xref = PDFXRefStream()
         xref.load(self, debug=self.debug)
     else:
         if token is self.KEYWORD_XREF:
             # Skip the remainder of the 'xref' keyword line.
             self.nextline()
         xref = PDFXRef()
         xref.load(self, debug=self.debug)
     xrefs.append(xref)
     trailer = xref.get_trailer()
     if 1 <= self.debug:
         print >>sys.stderr, 'trailer: %r' % trailer
     if 'XRefStm' in trailer:
         pos = int_value(trailer['XRefStm'])
         self.read_xref_from(pos, xrefs, _visited)
     if 'Prev' in trailer:
         # find previous xref
         pos = int_value(trailer['Prev'])
         self.read_xref_from(pos, xrefs, _visited)
     return
Example #3
0
 def read_xref_from(self, parser, start, xrefs, _visited=None):
     """Reads XRefs from the given location.

     Loads the cross-reference section found at offset ``start``, appends
     it to ``xrefs`` and then follows the ``XRefStm`` and ``Prev`` entries
     of its trailer recursively.

     parser: object used to seek and tokenize; also passed to ``load``.
     start: offset handed to ``parser.seek`` before reading the first token.
     xrefs: list that receives every loaded xref object, in load order.
     _visited: internal set of offsets already loaded during this walk;
         callers should leave it unset.

     Raises PDFNoValidXRef when no token can be read at ``start``.
     """
     if _visited is None:
         _visited = set()
     if start in _visited:
         # A trailer that points back at a section already loaded would
         # otherwise make this method recurse without end.
         return
     _visited.add(start)
     parser.seek(start)
     parser.reset()
     try:
         (pos, token) = parser.nexttoken()
     except PSEOF:
         raise PDFNoValidXRef('Unexpected EOF')
     # Let logging do the formatting so it only happens if the record is emitted.
     logging.info('read_xref_from: start=%d, token=%r', start, token)
     if isinstance(token, int):
         # XRefStream: PDF-1.5
         # Rewind to the token so the stream loader sees the whole object.
         parser.seek(pos)
         parser.reset()
         xref = PDFXRefStream()
         xref.load(parser)
     else:
         if token is parser.KEYWORD_XREF:
             # Skip the remainder of the 'xref' keyword line.
             parser.nextline()
         xref = PDFXRef()
         xref.load(parser)
     xrefs.append(xref)
     trailer = xref.get_trailer()
     logging.info('trailer: %r', trailer)
     if 'XRefStm' in trailer:
         pos = int_value(trailer['XRefStm'])
         self.read_xref_from(parser, pos, xrefs, _visited)
     if 'Prev' in trailer:
         # find previous xref
         pos = int_value(trailer['Prev'])
         self.read_xref_from(parser, pos, xrefs, _visited)
     return
Example #4
0
 def read_xref_from(self, parser, start, xrefs, _visited=None):
     """Reads XRefs from the given location.

     Loads the cross-reference section found at offset ``start``, appends
     it to ``xrefs`` and then follows the ``XRefStm`` and ``Prev`` entries
     of its trailer recursively.

     parser: object used to seek and tokenize; also passed to ``load``.
     start: offset handed to ``parser.seek`` before reading the first token.
     xrefs: list that receives every loaded xref object, in load order.
     _visited: internal set of offsets already loaded during this walk;
         callers should leave it unset.

     Raises PDFNoValidXRef when no token can be read at ``start``.
     """
     if _visited is None:
         _visited = set()
     if start in _visited:
         # A trailer that points back at a section already loaded would
         # otherwise make this method recurse without end.
         return
     _visited.add(start)
     parser.seek(start)
     parser.reset()
     try:
         (pos, token) = parser.nexttoken()
     except PSEOF:
         raise PDFNoValidXRef('Unexpected EOF')
     if 2 <= self.debug:
         print >> sys.stderr, 'read_xref_from: start=%d, token=%r' % (start,
                                                                      token)
     if isinstance(token, int):
         # XRefStream: PDF-1.5
         # Rewind to the token so the stream loader sees the whole object.
         parser.seek(pos)
         parser.reset()
         xref = PDFXRefStream()
         xref.load(parser, debug=self.debug)
     else:
         if token is parser.KEYWORD_XREF:
             # Skip the remainder of the 'xref' keyword line.
             parser.nextline()
         xref = PDFXRef()
         xref.load(parser, debug=self.debug)
     xrefs.append(xref)
     trailer = xref.get_trailer()
     if 1 <= self.debug:
         print >> sys.stderr, 'trailer: %r' % trailer
     if 'XRefStm' in trailer:
         pos = int_value(trailer['XRefStm'])
         self.read_xref_from(parser, pos, xrefs, _visited)
     if 'Prev' in trailer:
         # find previous xref
         pos = int_value(trailer['Prev'])
         self.read_xref_from(parser, pos, xrefs, _visited)
     return
Example #5
0
    def initialize(self, password=''):
        """Perform the initialization with a given password.

        This step is mandatory even if there's no password associated with the document.

        When the document has no encryption entry, all three permission
        flags are set to True and the method returns. Otherwise the
        encryption parameters are checked, the permission flags are taken
        from bits of ``P``, an RC4 key is derived from the password and the
        result is compared with ``U``.

        password: the password to try; it is padded with
            ``PASSWORD_PADDING`` and cut to 32 characters.

        Raises:
            PDFEncryptionError: the filter is not 'Standard', ``V`` is not
                1 or 2, or ``R`` is outside 2..4.
            PDFNotImplementedError: ``R`` is 4.
            PDFPasswordIncorrect: the derived value does not match ``U``.
        """
        if not self.encryption:
            self.is_printable = self.is_modifiable = self.is_extractable = True
            return
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        V = int_value(param.get('V', 0))
        if not (V == 1 or V == 2):
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        length = int_value(param.get('Length', 40))  # Key length (bits)
        O = str_value(param['O'])
        R = int_value(param['R'])  # Revision
        # Revisions below 2 have no verification branch further down; reject
        # them here instead of failing later on an unassigned local.
        if R < 2 or 5 <= R:
            raise PDFEncryptionError('Unknown revision: %r' % R)
        U = str_value(param['U'])
        P = int_value(param['P'])
        self.is_printable = bool(P & 4)
        self.is_modifiable = bool(P & 8)
        self.is_extractable = bool(P & 16)
        key_bytes = length // 8  # key length in bytes
        # Algorithm 3.2
        password = (password + self.PASSWORD_PADDING)[:32]  # 1
        md5hash = md5.md5(password)  # 2
        md5hash.update(O)  # 3
        md5hash.update(struct.pack('<l', P))  # 4
        md5hash.update(docid[0])  # 5
        if 4 <= R:
            # 6
            raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
        if 3 <= R:
            # 8
            for _ in xrange(50):
                md5hash = md5.md5(md5hash.digest()[:key_bytes])
        key = md5hash.digest()[:key_bytes]
        if R == 2:
            # Algorithm 3.4
            u1 = Arcfour(key).process(self.PASSWORD_PADDING)
        elif R == 3:
            # Algorithm 3.5
            md5hash = md5.md5(self.PASSWORD_PADDING)  # 2
            md5hash.update(docid[0])  # 3
            x = Arcfour(key).process(md5hash.digest()[:16])  # 4
            for i in xrange(1, 19 + 1):
                # Each round re-encrypts with the key XORed by the round number.
                k = ''.join(chr(ord(c) ^ i) for c in key)
                x = Arcfour(k).process(x)
            u1 = x + x  # 32bytes total
        if R == 2:
            is_authenticated = (u1 == U)
        else:
            # Only the first 16 bytes are compared for revision 3.
            is_authenticated = (u1[:16] == U[:16])
        if not is_authenticated:
            raise PDFPasswordIncorrect
        self.decrypt_key = key
        self.decipher = self.decrypt_rc4  # XXX may be AES
Example #6
0
 def initialize(self, password=''):
     """Check the password and set up decryption for the document.

     When the document has no encryption entry, all three permission
     flags are set to True, ``_initialized`` is set and the method
     returns. Otherwise the encryption parameters are checked, the
     permission flags are taken from bits of ``P``, an RC4 key is derived
     from the password and the result is compared with ``U``.

     password: the password to try; it is padded with
         ``PASSWORD_PADDING`` and cut to 32 characters.

     Raises:
         PDFEncryptionError: the filter is not 'Standard', ``V`` is not
             1 or 2, or ``R`` is outside 2..4.
         PDFNotImplementedError: ``R`` is 4.
         PDFPasswordIncorrect: the derived value does not match ``U``.
     """
     if not self.encryption:
         self.is_printable = self.is_modifiable = self.is_extractable = True
         self._initialized = True
         return
     (docid, param) = self.encryption
     if literal_name(param['Filter']) != 'Standard':
         raise PDFEncryptionError('Unknown filter: param=%r' % param)
     V = int_value(param.get('V', 0))
     if not (V == 1 or V == 2):
         raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
     length = int_value(param.get('Length', 40))  # Key length (bits)
     O = str_value(param['O'])
     R = int_value(param['R'])  # Revision
     # Revisions below 2 have no verification branch further down; reject
     # them here instead of failing later on an unassigned local.
     if R < 2 or 5 <= R:
         raise PDFEncryptionError('Unknown revision: %r' % R)
     U = str_value(param['U'])
     P = int_value(param['P'])
     self.is_printable = bool(P & 4)
     self.is_modifiable = bool(P & 8)
     self.is_extractable = bool(P & 16)
     key_bytes = length // 8  # key length in bytes
     # Algorithm 3.2
     password = (password + self.PASSWORD_PADDING)[:32]  # 1
     md5hash = md5.md5(password)  # 2
     md5hash.update(O)  # 3
     md5hash.update(struct.pack('<l', P))  # 4
     md5hash.update(docid[0])  # 5
     if 4 <= R:
         # 6
         raise PDFNotImplementedError(
             'Revision 4 encryption is currently unsupported')
     if 3 <= R:
         # 8
         for _ in xrange(50):
             md5hash = md5.md5(md5hash.digest()[:key_bytes])
     key = md5hash.digest()[:key_bytes]
     if R == 2:
         # Algorithm 3.4
         u1 = Arcfour(key).process(self.PASSWORD_PADDING)
     elif R == 3:
         # Algorithm 3.5
         md5hash = md5.md5(self.PASSWORD_PADDING)  # 2
         md5hash.update(docid[0])  # 3
         x = Arcfour(key).process(md5hash.digest()[:16])  # 4
         for i in xrange(1, 19 + 1):
             # Each round re-encrypts with the key XORed by the round number.
             k = ''.join(chr(ord(c) ^ i) for c in key)
             x = Arcfour(k).process(x)
         u1 = x + x  # 32bytes total
     if R == 2:
         is_authenticated = (u1 == U)
     else:
         # Only the first 16 bytes are compared for revision 3.
         is_authenticated = (u1[:16] == U[:16])
     if not is_authenticated:
         raise PDFPasswordIncorrect
     self.decrypt_key = key
     self.decipher = self.decrypt_rc4  # XXX may be AES
     self._initialized = True
     return
Example #7
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from its dictionary ``spec``.

     Widths are keyed by character code, counting up from ``FirstChar``.
     If ``spec`` carries no ``FontDescriptor``, a stand-in descriptor with
     zero ascent/descent and the font's own ``FontBBox`` is used instead.
     ``rsrcmgr`` is accepted but not used in this method.
     """
     first_code = int_value(spec.get("FirstChar", 0))
     # LastChar is still passed through int_value although the result is unused.
     lastchar = int_value(spec.get("LastChar", 0))
     width_list = list_value(spec.get("Widths", [0] * 256))
     widths = {}
     for (offset, width) in enumerate(width_list):
         widths[offset + first_code] = width
     if "FontDescriptor" not in spec:
         descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
     else:
         descriptor = dict_value(spec["FontDescriptor"])
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     matrix_entries = list_value(spec.get("FontMatrix"))
     self.matrix = tuple(matrix_entries)
     # Descent and ascent come from the 2nd and 4th bbox entries.
     (_, self.descent, _, self.ascent) = self.bbox
     (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
     return
Example #8
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from its dictionary ``spec``.

     Widths are keyed by character code, counting up from ``FirstChar``.
     If ``spec`` carries no ``FontDescriptor``, a stand-in descriptor with
     zero ascent/descent and the font's own ``FontBBox`` is used instead.
     ``rsrcmgr`` is accepted but not used in this method.
     """
     first_code = int_value(spec.get('FirstChar', 0))
     # LastChar is still passed through int_value although the result is unused.
     lastchar = int_value(spec.get('LastChar', 0))
     width_list = list_value(spec.get('Widths', [0] * 256))
     widths = {}
     for (offset, width) in enumerate(width_list):
         widths[offset + first_code] = width
     if 'FontDescriptor' not in spec:
         descriptor = {'Ascent': 0, 'Descent': 0,
                       'FontBBox': spec['FontBBox']}
     else:
         descriptor = dict_value(spec['FontDescriptor'])
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     matrix_entries = list_value(spec.get('FontMatrix'))
     self.matrix = tuple(matrix_entries)
     # Descent and ascent come from the 2nd and 4th bbox entries.
     (_, self.descent, _, self.ascent) = self.bbox
     (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
Example #9
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from its dictionary ``spec``.

     Widths are keyed by character code, counting up from ``FirstChar``.
     If ``spec`` carries no ``FontDescriptor``, a stand-in descriptor with
     zero ascent/descent and the font's own ``FontBBox`` is used instead.
     ``rsrcmgr`` is accepted but not used in this method.
     """
     first_code = int_value(spec.get('FirstChar', 0))
     # LastChar is still passed through int_value although the result is unused.
     lastchar = int_value(spec.get('LastChar', 0))
     width_list = list_value(spec.get('Widths', [0]*256))
     widths = {}
     for (offset, width) in enumerate(width_list):
         widths[offset+first_code] = width
     if 'FontDescriptor' not in spec:
         descriptor = {'Ascent': 0, 'Descent': 0,
                       'FontBBox': spec['FontBBox']}
     else:
         descriptor = dict_value(spec['FontDescriptor'])
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     matrix_entries = list_value(spec.get('FontMatrix'))
     self.matrix = tuple(matrix_entries)
     # Descent and ascent come from the 2nd and 4th bbox entries.
     (_, self.descent, _, self.ascent) = self.bbox
     (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
     return
Example #10
0
 def __init__(self, doc, pageid, attrs):
     """Set up a page from its attribute dictionary.

     doc: a PDFDocument object.
     pageid: any Python object that can uniquely identify the page.
     attrs: a dictionary of page attributes.

     ``Resources`` and ``MediaBox`` are read by subscript and so must be
     present. ``CropBox`` falls back to the media box, ``Rotate`` is
     reduced into the range 0..359 and ``Contents`` is always stored as
     a list.
     """
     self.doc = doc
     self.pageid = pageid
     self.attrs = dict_value(attrs)
     page_attrs = self.attrs
     self.lastmod = resolve1(page_attrs.get('LastModified'))
     self.resources = resolve1(page_attrs['Resources'])
     self.mediabox = resolve1(page_attrs['MediaBox'])
     if 'CropBox' not in page_attrs:
         self.cropbox = self.mediabox
     else:
         self.cropbox = resolve1(page_attrs['CropBox'])
     rotation = int_value(page_attrs.get('Rotate', 0))
     self.rotate = (rotation+360) % 360
     self.annots = page_attrs.get('Annots')
     self.beads = page_attrs.get('B')
     contents = []
     if 'Contents' in page_attrs:
         contents = resolve1(page_attrs['Contents'])
     if not isinstance(contents, list):
         # A single content object is wrapped so callers can always iterate.
         contents = [contents]
     self.contents = contents
     return
Example #11
0
    def __init__(self, doc, pageid, attrs):
        """Set up a page from its attribute dictionary.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        ``Resources`` and ``MediaBox`` are read by subscript and so must be
        present. ``CropBox`` falls back to the media box, ``Rotate`` is
        reduced into the range 0..359 and ``Contents`` is always stored as
        a list.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        page_attrs = self.attrs
        self.lastmod = resolve1(page_attrs.get('LastModified'))
        self.resources = resolve1(page_attrs['Resources'])
        self.mediabox = resolve1(page_attrs['MediaBox'])
        if 'CropBox' not in page_attrs:
            self.cropbox = self.mediabox
        else:
            self.cropbox = resolve1(page_attrs['CropBox'])
        rotation = int_value(page_attrs.get('Rotate', 0))
        self.rotate = (rotation + 360) % 360
        self.annots = page_attrs.get('Annots')
        self.beads = page_attrs.get('B')
        contents = []
        if 'Contents' in page_attrs:
            contents = resolve1(page_attrs['Contents'])
        if not isinstance(contents, list):
            # A single content object is wrapped so callers can always iterate.
            contents = [contents]
        self.contents = contents
        return
Example #12
0
 def do_keyword(self, pos, token):
     """Handles PDF-related keywords.

     pos: position reported for the keyword token.
     token: the keyword object, matched against this parser's KEYWORD_*
         attributes.

     xref/startxref and endobj hand popped stack entries to
     ``add_results``; null, R and stream push a new ``(pos, object)``
     pair; any other keyword is pushed unchanged.
     """
     if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
         # Emit the single entry preceding the keyword.
         self.add_results(*self.pop(1))
     elif token is self.KEYWORD_ENDOBJ:
         # Emit the four entries preceding 'endobj'.
         self.add_results(*self.pop(4))
     elif token is self.KEYWORD_NULL:
         # null object
         self.push((pos, None))
     elif token is self.KEYWORD_R:
         # reference to indirect object
         try:
             # The two preceding entries are the object id and generation number.
             ((_, objid), (_, genno)) = self.pop(2)
             (objid, genno) = (int(objid), int(genno))
             obj = PDFObjRef(self.doc, objid, genno)
             self.push((pos, obj))
         except PSSyntaxError:
             # A PSSyntaxError here is ignored and nothing is pushed.
             pass
     elif token is self.KEYWORD_STREAM:
         # stream object
         # The entry just before 'stream' is the stream dictionary.
         ((_, dic),) = self.pop(1)
         dic = dict_value(dic)
         # In fallback mode /Length is not consulted: objlen stays 0 and the
         # whole body is collected by the 'endstream' scan below.
         objlen = 0
         if not self.fallback:
             try:
                 objlen = int_value(dic['Length'])
             except KeyError:
                 handle_error(PDFSyntaxError, '/Length is undefined: %r' % dic)
         self.seek(pos)
         try:
             (_, line) = self.nextline()  # 'stream'
         except PSEOF:
             handle_error(PDFSyntaxError, 'Unexpected EOF')
             return
         # Move pos past the 'stream' keyword line to the first data byte.
         pos += len(line)
         self.fp.seek(pos)
         data = self.fp.read(objlen)
         self.seek(pos + objlen)
         # Keep appending lines until one contains 'endstream'; this extends
         # data and objlen beyond what was read using the declared length.
         while 1:
             try:
                 (linepos, line) = self.nextline()
             except PSEOF:
                 handle_error(PDFSyntaxError, 'Unexpected EOF')
                 break
             if 'endstream' in line:
                 # Keep only the part of the line before the keyword.
                 i = line.index('endstream')
                 objlen += i
                 data += line[:i]
                 break
             objlen += len(line)
             data += line
         # Reposition the parser at the end of the collected data.
         self.seek(pos+objlen)
         # XXX limit objlen not to exceed object boundary
         log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
         obj = PDFStream(dic, data, self.doc.decipher)
         self.push((pos, obj))
     else:
         # others
         self.push((pos, token))
Example #13
0
 def __init__(self, rsrc, spec):
     """Set up the font from its dictionary ``spec``.

     The base font name comes from ``BaseFont``; when that lookup raises
     ``KeyError`` the name becomes 'unknown', or ``PDFFontError`` is raised
     if ``STRICT`` is set. Metrics come from ``FontMetricsDB`` when it
     knows the name, otherwise from ``FontDescriptor``, ``FirstChar`` and
     ``Widths`` in ``spec``. ``rsrc`` is accepted but not used here.
     """
     try:
         basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         basefont = 'unknown'
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
     except KeyError:
         # No built-in metrics for this name: fall back to the font dictionary.
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first_code = int_value(spec.get('FirstChar', 0))
         # LastChar is still passed through int_value although the result is unused.
         lastchar = int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0]*256))
         widths = {}
         for (offset, width) in enumerate(width_list):
             widths[offset+first_code] = width
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     return
Example #14
0
 def __init__(self, rsrc, spec):
     """Set up the font from its dictionary ``spec``.

     The base font name comes from ``BaseFont``; when that lookup raises
     ``KeyError`` the name becomes 'unknown', or ``PDFFontError`` is raised
     if ``STRICT`` is set. Metrics come from ``FontMetricsDB`` when it
     knows the name, otherwise from ``FontDescriptor``, ``FirstChar`` and
     ``Widths`` in ``spec``. ``rsrc`` is accepted but not used here.
     """
     try:
         basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         basefont = 'unknown'
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
     except KeyError:
         # No built-in metrics for this name: fall back to the font dictionary.
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first_code = int_value(spec.get('FirstChar', 0))
         # LastChar is still passed through int_value although the result is unused.
         lastchar = int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0] * 256))
         widths = {}
         for (offset, width) in enumerate(width_list):
             widths[offset + first_code] = width
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     return
Example #15
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from its dictionary ``spec``.

     The base font name comes from ``BaseFont``; when that lookup raises
     ``KeyError`` the problem is reported through ``handle_error`` and the
     name becomes 'unknown'. Metrics come from ``FontMetricsDB`` when it
     knows the name, otherwise from ``FontDescriptor``, ``FirstChar`` and
     ``Widths`` in ``spec``. If ``spec`` has no ``Encoding`` but the
     descriptor has a ``FontFile``, the encoding is read from that file.
     ``rsrcmgr`` is accepted but not used here.
     """
     try:
         basefont = literal_name(spec['BaseFont'])
     except KeyError:
         handle_error(PDFFontError, 'BaseFont is missing')
         basefont = 'unknown'
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
     except KeyError:
         # No built-in metrics for this name: fall back to the font dictionary.
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first_code = int_value(spec.get('FirstChar', 0))
         # LastChar is still passed through int_value although the result is unused.
         lastchar = int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0] * 256))
         widths = {}
         for (offset, width) in enumerate(width_list):
             widths[offset+first_code] = width
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     if 'Encoding' in spec or 'FontFile' not in descriptor:
         return
     # try to recover the missing encoding info from the font file.
     fontfile = stream_value(descriptor.get('FontFile'))
     self.fontfile = fontfile
     header_length = int_value(fontfile['Length1'])
     header_data = fontfile.get_data()[:header_length]
     header_parser = Type1FontHeaderParser(StringIO(header_data))
     self.cid2unicode = header_parser.get_encoding()
Example #16
0
 def __init__(self, descriptor, widths, default_width=None):
     """Store the font descriptor values on the instance.

     descriptor: mapping the font attributes are looked up in; every
         entry has a fallback when absent.
     widths: stored unchanged as ``self.widths``.
     default_width: used as the default width when truthy; otherwise
         the descriptor's ``MissingWidth`` (default 0) is used.
     """
     lookup = descriptor.get
     self.descriptor = descriptor
     self.widths = widths
     self.fontname = resolve1(lookup('FontName', 'unknown'))
     if isinstance(self.fontname, PSLiteral):
         # Literal names are reduced to their plain name.
         self.fontname = literal_name(self.fontname)
     self.flags = int_value(lookup('Flags', 0))
     self.ascent = num_value(lookup('Ascent', 0))
     self.descent = num_value(lookup('Descent', 0))
     self.italic_angle = num_value(lookup('ItalicAngle', 0))
     if default_width:
         self.default_width = default_width
     else:
         self.default_width = num_value(lookup('MissingWidth', 0))
     self.leading = num_value(lookup('Leading', 0))
     self.bbox = list_value(lookup('FontBBox', (0, 0, 0, 0)))
     self.hscale = 0.001
     self.vscale = 0.001
Example #17
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from its dictionary ``spec``.

     The base font name comes from ``BaseFont``; when that lookup raises
     ``KeyError`` the name becomes "unknown", or ``PDFFontError`` is raised
     if ``STRICT`` is set. Metrics come from ``FontMetricsDB`` when it
     knows the name, otherwise from ``FontDescriptor``, ``FirstChar`` and
     ``Widths`` in ``spec``. If ``spec`` has no ``Encoding`` but the
     descriptor has a ``FontFile``, the encoding is read from that file.
     ``rsrcmgr`` is accepted but not used here.
     """
     try:
         basefont = literal_name(spec["BaseFont"])
     except KeyError:
         if STRICT:
             raise PDFFontError("BaseFont is missing")
         basefont = "unknown"
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
     except KeyError:
         # No built-in metrics for this name: fall back to the font dictionary.
         descriptor = dict_value(spec.get("FontDescriptor", {}))
         first_code = int_value(spec.get("FirstChar", 0))
         # LastChar is still passed through int_value although the result is unused.
         lastchar = int_value(spec.get("LastChar", 255))
         width_list = list_value(spec.get("Widths", [0] * 256))
         widths = {}
         for (offset, width) in enumerate(width_list):
             widths[offset + first_code] = width
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     if "Encoding" in spec or "FontFile" not in descriptor:
         return
     # try to recover the missing encoding info from the font file.
     fontfile = stream_value(descriptor.get("FontFile"))
     self.fontfile = fontfile
     header_length = int_value(fontfile["Length1"])
     header_data = fontfile.get_data()[:header_length]
     header_parser = Type1FontHeaderParser(StringIO(header_data))
     self.cid2unicode = header_parser.get_encoding()
     return
Example #18
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from its dictionary ``spec``.

     The base font name comes from ``BaseFont``; when that lookup raises
     ``KeyError`` the name becomes 'unknown', or ``PDFFontError`` is raised
     if ``STRICT`` is set. Metrics come from ``FontMetricsDB`` when it
     knows the name, otherwise from ``FontDescriptor``, ``FirstChar`` and
     ``Widths`` in ``spec``. If ``spec`` has no ``Encoding`` but the
     descriptor has a ``FontFile``, the encoding is read from that file.
     ``rsrcmgr`` is accepted but not used here.
     """
     try:
         basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         basefont = 'unknown'
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
     except KeyError:
         # No built-in metrics for this name: fall back to the font dictionary.
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first_code = int_value(spec.get('FirstChar', 0))
         # LastChar is still passed through int_value although the result is unused.
         lastchar = int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0]*256))
         widths = {}
         for (offset, width) in enumerate(width_list):
             widths[offset+first_code] = width
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     if 'Encoding' in spec or 'FontFile' not in descriptor:
         return
     # try to recover the missing encoding info from the font file.
     fontfile = stream_value(descriptor.get('FontFile'))
     self.fontfile = fontfile
     header_length = int_value(fontfile['Length1'])
     header_data = fontfile.get_data()[:header_length]
     header_parser = Type1FontHeaderParser(StringIO(header_data))
     self.cid2unicode = header_parser.get_encoding()
     return
Example #19
0
 def __init__(self, descriptor, widths, default_width=None):
     """Store the font descriptor values on the instance.

     descriptor: mapping the font attributes are looked up in; every
         entry has a fallback when absent.
     widths: stored unchanged as ``self.widths``.
     default_width: used as the default width when truthy; otherwise
         the descriptor's ``MissingWidth`` (default 0) is used.
     """
     lookup = descriptor.get
     self.descriptor = descriptor
     self.widths = widths
     self.fontname = resolve1(lookup('FontName', 'unknown'))
     if isinstance(self.fontname, PSLiteral):
         # Literal names are reduced to their plain name.
         self.fontname = literal_name(self.fontname)
     self.flags = int_value(lookup('Flags', 0))
     self.ascent = num_value(lookup('Ascent', 0))
     self.descent = num_value(lookup('Descent', 0))
     self.italic_angle = num_value(lookup('ItalicAngle', 0))
     if default_width:
         self.default_width = default_width
     else:
         self.default_width = num_value(lookup('MissingWidth', 0))
     self.leading = num_value(lookup('Leading', 0))
     self.bbox = list_value(lookup('FontBBox', (0, 0, 0, 0)))
     self.hscale = 0.001
     self.vscale = 0.001
     return
Example #20
0
 def _initialize_password(self, password=''):
     """Pick a security handler for the document and wire up decryption.

     The handler factory is looked up in ``security_handler_registry`` by
     the ``V`` entry (default 0) of the encryption parameters and called
     with the document id, the parameters and ``password``. Its
     ``decrypt`` method and permission answers are stored on ``self``.

     Raises PDFEncryptionError when the filter is not 'Standard' or no
     factory is registered for ``V``.
     """
     (docid, param) = self.encryption
     filter_name = literal_name(param.get('Filter'))
     if filter_name != 'Standard':
         raise PDFEncryptionError('Unknown filter: param=%r' % param)
     version = int_value(param.get('V', 0))
     handler_factory = self.security_handler_registry.get(version)
     if handler_factory is None:
         raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
     security_handler = handler_factory(docid, param, password)
     self.decipher = security_handler.decrypt
     self.is_printable = security_handler.is_printable()
     self.is_modifiable = security_handler.is_modifiable()
     self.is_extractable = security_handler.is_extractable()
     # Streams have to be read with their exact length from here on.
     self._parser.fallback = False
     return
Example #21
0
 def __init__(self, descriptor, widths, default_width=None):
     """Store the font descriptor values on the instance.

     descriptor: mapping the font attributes are looked up in; every
         entry has a fallback when absent.
     widths: stored unchanged as ``self.widths``.
     default_width: used as the default width when truthy; otherwise
         the descriptor's ``MissingWidth`` (default 0) is used.
     """
     lookup = descriptor.get
     self.descriptor = descriptor
     self.widths = widths
     self.fontname = resolve1(lookup("FontName", "unknown"))
     if isinstance(self.fontname, PSLiteral):
         # Literal names are reduced to their plain name.
         self.fontname = literal_name(self.fontname)
     self.flags = int_value(lookup("Flags", 0))
     self.ascent = num_value(lookup("Ascent", 0))
     self.descent = num_value(lookup("Descent", 0))
     self.italic_angle = num_value(lookup("ItalicAngle", 0))
     if default_width:
         self.default_width = default_width
     else:
         self.default_width = num_value(lookup("MissingWidth", 0))
     self.leading = num_value(lookup("Leading", 0))
     self.bbox = list_value(lookup("FontBBox", (0, 0, 0, 0)))
     self.hscale = 0.001
     self.vscale = 0.001
     return
Example #22
0
    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        pos: position reported for the keyword token.
        token: the keyword object, matched against this parser's KEYWORD_*
            attributes.

        xref/startxref and endobj hand popped stack entries to
        ``add_results``; null, R and stream push a new ``(pos, object)``
        pair; any other keyword is pushed unchanged.
        """

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            # Emit the single entry preceding the keyword.
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            # Emit the four entries preceding 'endobj'.
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                # The two preceding entries are the object id and generation number.
                ((_,objid), (_,genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                # A PSSyntaxError here is ignored and nothing is pushed.
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            # The entry just before 'stream' is the stream dictionary.
            ((_,dic),) = self.pop(1)
            dic = dict_value(dic)
            # In fallback mode /Length is not consulted: objlen stays 0 and
            # the whole body is collected by the 'endstream' scan below.
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    # Outside STRICT mode a missing /Length leaves objlen at 0.
                    if STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # Move pos past the 'stream' keyword line to the first data byte.
            pos += len(line)
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos+objlen)
            # Keep appending lines until one contains 'endstream'; this
            # extends data and objlen beyond what the declared length gave.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    # Keep only the part of the line before the keyword.
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            # Reposition the parser at the end of the collected data.
            self.seek(pos+objlen)
            # XXX limit objlen not to exceed object boundary
            if 2 <= self.debug:
                print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))

        return
Example #23
0
    def do_keyword(self, pos, token):
        """Handle one keyword token met while parsing.

        pos: position reported for the keyword token.
        token: the keyword object, matched against this parser's KEYWORD_*
            attributes.

        xref/startxref and endobj hand popped stack entries to
        ``add_results``; R and stream push a new ``(pos, object)`` pair;
        any other keyword is pushed unchanged.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            # Emit the single entry preceding the keyword.
            self.add_results(*self.pop(1))
            return
        if token is self.KEYWORD_ENDOBJ:
            # Emit the four entries preceding 'endobj'.
            self.add_results(*self.pop(4))
            return

        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                # The two preceding entries are the object id and generation number.
                ((_,objid), (_,genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                # A PSSyntaxError here is ignored and nothing is pushed.
                pass
            return

        if token is self.KEYWORD_STREAM:
            # stream object
            # The entry just before 'stream' is the stream dictionary.
            ((_,dic),) = self.pop(1)
            dic = dict_value(dic)
            try:
                objlen = int_value(dic['Length'])
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('/Length is undefined: %r' % dic)
                # Without /Length the body is found by the 'endstream' scan below.
                objlen = 0
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # Move pos past the 'stream' keyword line to the first data byte.
            pos += len(line)
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos+objlen)
            # Keep appending lines until one contains 'endstream'; this
            # extends data and objlen beyond what the declared length gave.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    # Keep only the part of the line before the keyword.
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            # Reposition the parser at the end of the collected data.
            self.seek(pos+objlen)
            if 1 <= self.debug:
                print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))
            return

        # others
        self.push((pos, token))
        return
Example #24
0
    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Besides the plain page attributes, every entry of ``Annots`` is
        resolved and passed to ``create_widget``; the results (an
        ``LTWidget``, or ``None`` for entries that are not recognised
        widgets) are collected in ``self.widgets``. Annotations whose
        object cannot be found are reported on standard output and skipped.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = list_value(self.attrs.get('Annots'))
        self.widgets = []

        def get_widget_type(obj):
            """Map the ``FT`` entry of obj to a widget type name, or None."""
            if 'FT' not in obj:
                return None
            field_type = obj.get('FT')
            if field_type is LITERAL_TX:
                return 'text'
            if field_type is LITERAL_CH or field_type is LITERAL_BTN:
                return 'checkbox'
            if field_type is LITERAL_SIG:
                return 'signature'
            return None

        def create_widget(obj):
            """Build an LTWidget for a widget annotation, else return None."""
            if 'Subtype' not in obj or obj.get('Subtype') is not LITERAL_WIDGET:
                return None
            wtype = get_widget_type(obj)
            if wtype:
                return LTWidget(list_value(obj.get('Rect')), wtype, str_value(obj.get('T')))
            if 'Parent' in obj:
                # The annotation has no usable type of its own: take the
                # type and name from its parent instead.
                p = resolve1(obj.get('Parent'))
                return LTWidget(list_value(obj.get('Rect')), get_widget_type(p), str_value(p.get('T')))
            return None

        def find_widgets(obj_list, widgets):
            """Resolve each annotation and append its widget to widgets."""
            for an in obj_list:
                try:
                    obj = resolve1(an)
                except PDFObjectNotFound:
                    print('object not found')
                    print(an)
                    # Without this, obj would be unbound on the first pass
                    # or still hold the previous annotation.
                    continue
                widgets.append(create_widget(obj))

        find_widgets(self.annots, self.widgets)

        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents
        return