def init_resources(self, resources):
    """Reset the font, XObject and colour-space maps and fill them from *resources*.

    The three maps are always re-created; ``csmap`` starts out as a copy of
    ``PREDEFINED_COLORSPACE``.  When *resources* is falsy nothing else is
    done.  Otherwise the ``Font``, ``ColorSpace``, ``ProcSet`` and
    ``XObject`` entries of the resource dictionary are processed; any other
    key is ignored.
    """
    self.fontmap = {}
    self.xobjmap = {}
    self.csmap = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec):
        # A colour space is given either as a bare name or as an array
        # whose first element is the name.
        is_array = isinstance(spec, list)
        name = literal_name(spec[0] if is_array else spec)
        has_params = is_array and 2 <= len(spec)
        if name == 'ICCBased' and has_params:
            return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
        if name == 'DeviceN' and has_params:
            return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE[name]

    for (key, value) in dict_value(resources).iteritems():
        if 1 <= self.debug:
            stderr.write('Resource: %r: %r\n' % (key, value))
        if key == 'Font':
            for (fontid, spec) in dict_value(value).iteritems():
                # Only indirect references carry an object id.
                objid = spec.objid if isinstance(spec, PDFObjRef) else None
                self.fontmap[fontid] = self.rsrc.get_font(objid, dict_value(spec))
        elif key == 'ColorSpace':
            for (csid, spec) in dict_value(value).iteritems():
                self.csmap[csid] = get_colorspace(resolve1(spec))
        elif key == 'ProcSet':
            self.rsrc.get_procset(list_value(value))
        elif key == 'XObject':
            # XObject streams are stored unresolved.
            for (xobjid, xobjstrm) in dict_value(value).iteritems():
                self.xobjmap[xobjid] = xobjstrm
    return
def set_parser(self, parser):
    """Attach *parser*, read the xref sections and locate the document root.

    Does nothing if a parser is already attached.  Raises PDFSyntaxError
    when no trailer containing a 'Root' entry is found.
    """
    if self.parser:
        return
    self.parser = parser
    # Mark the document as ready only for the duration of this method, while
    # the basic information (header, encryption info, access rights) is
    # being collected.
    self.ready = True
    # Read every xref section that was appended (possibly several times)
    # at the end of the document.
    self.xrefs = parser.read_xref()
    for xref in self.xrefs:
        trailer = xref.trailer
        if trailer:
            if 'Encrypt' in trailer:
                # Remember the encryption info together with the file ID.
                file_id = list_value(trailer['ID'])
                self.encryption = (file_id, dict_value(trailer['Encrypt']))
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
    else:
        raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
    # Not ready again until the remaining initialization (password,
    # access-permission check, and so on) has been done.
    self.ready = False
    return
def set_parser(self, parser):
    """Attach *parser*, read the xref sections and locate the document root.

    Does nothing if a parser is already attached.  Raises PDFSyntaxError
    when no trailer containing a 'Root' entry is found.
    """
    if self.parser:
        return
    self.parser = parser
    # Mark the document as ready only for the duration of this method, while
    # the basic information (header, encryption info, access rights) is
    # being collected.
    self.ready = True
    # Read every xref section that was appended (possibly several times)
    # at the end of the document.
    self.xrefs = parser.read_xref()
    for xref in self.xrefs:
        trailer = xref.trailer
        if trailer:
            if 'Encrypt' in trailer:
                # Remember the encryption info together with the file ID.
                file_id = list_value(trailer['ID'])
                self.encryption = (file_id, dict_value(trailer['Encrypt']))
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
    else:
        raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
    # Not ready again until the remaining initialization (password,
    # access-permission check, and so on) has been done.
    self.ready = False
    return
def __validateOutline(self, outl, id):
    """Validate the action of an outline item and of the items linked from it.

    outl: the outline item dictionary.
    id: identifier of the item, used only in the message context.

    The "Next" link is followed first, then the "First" link, and finally
    the item's own "A" action (if any) is validated.
    """
    for link in ("Next", "First"):
        ref = outl.get(link)
        if ref is not None:
            # A directly embedded dictionary has no ``objid``; use None as
            # the identifier instead of raising AttributeError.
            self.__validateOutline(dict_value(ref), getattr(ref, "objid", None))
    action = outl.get("A")
    if action is not None:
        self.__validateAction(dict_value(action), "in outline " + str(id))
def validate(self, fileName):
    """Load *fileName* and run the document-level and per-page checks.

    The document catalog is validated first, under a "GLOBAL:" heading.
    Then, for every page, the XObject, ExtGState, Pattern and Font resource
    entries and the page annotations are validated, and the page is passed
    to the interpreter.
    """
    self.__loadDocument(fileName)
    self.__write("GLOBAL:")
    self.__validateDocumentCatalog(self.__doc.catalog)
    for (page_number, p) in enumerate(self.__doc.get_pages(), 1):
        self.__write("PAGE " + str(page_number) + ":", error=False)
        images = dict_value(p.resources.get("XObject"))
        for (k, v) in images.iteritems():
            self.__validateXObjectDictionary(dict_value(v), literal_name(k))
        gstates = dict_value(p.resources.get("ExtGState"))
        for (k, v) in gstates.iteritems():
            self.__validateGraphicsStateParameterDictionary(
                dict_value(v), literal_name(k))
        # TODO: will all type-2 patterns really be checked this way?
        patterns = dict_value(p.resources.get("Pattern"))
        for (k, v) in patterns.iteritems():
            self.__validatePattern(dict_value(v), literal_name(k))
        for (i, a) in enumerate(list_value(p.annots)):
            # The page id is not necessarily a string, so convert it before
            # concatenating; otherwise this raises TypeError.
            self.__validateAnnotationDictionary(
                dict_value(a), str(i) + " on page " + str(p.pageid))
        # TODO: it should be checked whether the font is actually used
        # (see 6.3.4).
        fonts = dict_value(p.resources.get("Font"))
        for (k, v) in fonts.iteritems():
            self.__validateFont(dict_value(v), literal_name(k))
        self.__interp.process_page(p)
def __validateDocumentCatalog(self, doc):
    """Run the catalog-level checks on the document catalog dictionary *doc*.

    Problems are reported through ``self.__write``.  AcroForm fields, the
    outline tree and the OpenAction are handed to their own validators.
    """
    # 6.6.2
    if doc.get("AA") is not None:
        self.__write("Document catalog contains AA entry")
    # 6.8.2.2
    mark_info = doc.get("MarkInfo")
    if mark_info is None:
        self.__write("Document catalog does not contain MarkInfo entry")
    elif not dict_value(mark_info).get("Marked"):
        self.__write(
            "Marked flag in mark information dictionary is not set")
    # 6.8.4
    if doc.get("Lang") is None:
        self.__write("Document catalog does not specify language")
    # 6.1.11
    names = doc.get("Names")
    if names is not None:
        if dict_value(names).get("EmbeddedFiles") is not None:
            self.__write(
                "Document name dictionary contains EmbeddedFiles key")
    # 6.1.13
    if doc.get("OCProperties") is not None:
        self.__write("Document catalog contains OCProperties key")
    acro_form = doc.get("AcroForm")
    if acro_form is not None:
        fields = list_value(dict_value(acro_form).get("Fields"))
        for (i, f) in enumerate(fields):
            self.__validateField(dict_value(f), str(i) + " in AcroForm")
    outlines = doc.get("Outlines")
    if outlines is not None:
        first = dict_value(outlines).get("First")
        # An outline dictionary without items has no "First" entry; the
        # unguarded ``.objid`` access used to fail on None in that case.
        if first is not None:
            self.__validateOutline(
                dict_value(first), getattr(first, "objid", None))
    if dict_value2(doc.get("OpenAction")) is not None:
        self.__validateAction(dict_value(doc.get("OpenAction")),
                              "OpenAction from document catalog")
def do_Do(self, xobjid):
    """Render the form XObject named *xobjid* with a duplicated interpreter.

    Unknown ids raise PDFInterpreterError in STRICT mode and are ignored
    otherwise.  Only XObjects whose Subtype is LITERAL_FORM and that have a
    BBox are processed; the nested interpreter's keyword count is added to
    this interpreter's count.
    """
    # The base of this function is basically copy-pasted from the ancestor;
    # unfortunately, no better solution was found.
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if STRICT:
            raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
        return
    if self.debug:
        logging.info("Processing xobj: %r" % xobj)
    subtype = xobj.get("Subtype")
    if not (subtype is LITERAL_FORM and "BBox" in xobj):
        # ignored xobject type.
        return
    nested = self.dup()
    nested.is_first_level_call = None
    bbox = list_value(xobj["BBox"])
    matrix = list_value(xobj.get("Matrix", MATRIX_IDENTITY))
    # According to PDF reference 1.7 section 4.9.1, XObjects in
    # earlier PDFs (prior to v1.2) use the page's Resources entry
    # instead of having their own Resources entry.
    resources = dict_value(xobj.get("Resources")) or self.resources.copy()
    self.device.begin_figure(xobjid, bbox, matrix)
    nested.render_contents(resources, [xobj],
                           ctm=mult_matrix(matrix, self.ctm))
    self.device.end_figure(xobjid)
    self.keyword_count += nested.keyword_count
    print("Included %i keywords" % nested.keyword_count)
    return
def set_root(self, root):
    """Store *root* and the catalog dictionary resolved from it.

    In STRICT mode PDFSyntaxError is raised when the catalog's 'Type'
    entry is not LITERAL_CATALOG.
    """
    self.root = root
    catalog = dict_value(root)
    self.catalog = catalog
    type_mismatch = catalog.get('Type') is not LITERAL_CATALOG
    if type_mismatch and STRICT:
        raise PDFSyntaxError('Catalog not found!')
    return
def set_root(self, root):
    """Store *root* and the catalog dictionary resolved from it.

    In STRICT mode PDFSyntaxError is raised when the catalog's 'Type'
    entry is not LITERAL_CATALOG.
    """
    self.root = root
    catalog = dict_value(root)
    self.catalog = catalog
    type_mismatch = catalog.get('Type') is not LITERAL_CATALOG
    if type_mismatch and STRICT:
        raise PDFSyntaxError('Catalog not found!')
    return
def __validateXObjectDictionary(self, dict, id):
    """Check an XObject dictionary and report violations via ``self.__write``.

    dict: the XObject dictionary.
    id: name of the XObject, used in the messages.

    Form XObjects are checked for Ref, OPI, Subtype2/PS and transparency
    Group entries; PostScript XObjects are always reported; Image XObjects
    are passed on to ``__validateImageDictionary``.
    """
    subtype = literal_name(dict.get("Subtype"))
    label = "Form XObject dictionary " + str(id)
    if subtype == "Form":
        # 6.2.5
        if dict.get("Ref") is not None:
            # 6.2.6
            self.__write("XObject dictionary " + str(id) +
                         " is a reference XObject")
        if dict.get("OPI") is not None:
            self.__write(label + " contains OPI entry")
        # TODO: NOTE: reference 3 says nothing about Subtype2 and PS, though.
        if dict.get("Subtype2") is not None:
            if literal_name(dict.get("Subtype2")) == "PS":
                # The original message ran "contains" and "Subtype2"
                # together; a separating space was missing.
                self.__write(label +
                             " contains Subtype2 entry with PS value")
        if dict.get("PS") is not None:
            self.__write(label + " contains PS entry")
        group = dict.get("Group")
        if group is not None:
            # 6.4
            groupDict = dict_value(group)
            if literal_name(groupDict.get("S")) == "Transparency":
                # Missing space before "contains" and the typo "id"
                # (for "is") are fixed here.
                self.__write(
                    label + " contains Group entry which S attribute value"
                    " is /Transparency")
    elif subtype == "PS":
        # 6.2.7
        self.__write("Document contains PostScript XObject " + str(id))
    elif subtype == "Image":
        self.__validateImageDictionary(dict, str(id))
def __init__(self, doc, pageid, attrs):
    """Initialize a page object.

    doc: a PDFDocument object.
    pageid: any Python object that can uniquely identify the page.
    attrs: a dictionary of page attributes.

    'MediaBox' must be present in *attrs* (KeyError otherwise).  A missing
    'Resources' entry is treated as an empty dictionary, 'CropBox'
    defaults to the media box, and 'Rotate' is normalised to the range
    0..359.  ``contents`` is always a list.
    """
    self.doc = doc
    self.pageid = pageid
    self.attrs = dict_value(attrs)
    self.lastmod = resolve1(self.attrs.get('LastModified'))
    # Tolerate pages that lack /Resources instead of raising KeyError.
    self.resources = resolve1(self.attrs.get('Resources', dict()))
    self.mediabox = resolve1(self.attrs['MediaBox'])
    if 'CropBox' in self.attrs:
        self.cropbox = resolve1(self.attrs['CropBox'])
    else:
        self.cropbox = self.mediabox
    self.rotate = (int_value(self.attrs.get('Rotate', 0)) + 360) % 360
    self.annots = self.attrs.get('Annots')
    self.beads = self.attrs.get('B')
    if 'Contents' in self.attrs:
        contents = resolve1(self.attrs['Contents'])
    else:
        contents = []
    if not isinstance(contents, list):
        contents = [contents]
    self.contents = contents
    self.number_of_pages = 0
    return
def do_Do(self, xobjid):
    """Invoke the XObject named *xobjid*.

    Unknown ids raise PDFInterpreterError in STRICT mode and are ignored
    otherwise.  Form XObjects with a BBox are rendered through a duplicated
    interpreter; image XObjects with Width and Height are handed to the
    device; everything else is ignored.
    """
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if STRICT:
            raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
        return
    if 1 <= self.debug:
        stderr.write('Processing xobj: %r\n' % xobj)
    attrs = xobj.dic
    subtype = attrs.get('Subtype')
    if subtype is LITERAL_FORM and 'BBox' in attrs:
        nested = self.dup()
        bbox = list_value(attrs['BBox'])
        matrix = list_value(attrs.get('Matrix', MATRIX_IDENTITY))
        self.device.begin_figure(xobjid, bbox, matrix)
        nested.render_contents(dict_value(attrs.get('Resources')), [xobj],
                               ctm=mult_matrix(matrix, self.ctm))
        self.device.end_figure(xobjid)
    elif subtype is LITERAL_IMAGE and 'Width' in attrs and 'Height' in attrs:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        size = (attrs['Width'], attrs['Height'])
        self.device.render_image(xobj, size)
        self.device.end_figure(xobjid)
    # Any other XObject type is unsupported and silently skipped.
    return
def search(obj, parent):
    """Walk the page tree below *obj*, yielding ``(objid, attrs)`` per page.

    obj: an object id (int) or an object exposing ``objid``.
    parent: the attribute dictionary of the parent node; its 'Resources'
        entry is inherited by nodes that do not define their own.
    """
    if isinstance(obj, int):
        objid = obj
        tree = dict_value(document.getobj(objid)).copy()
    else:
        objid = obj.objid
        tree = dict_value(obj).copy()
    # The original test ``k in 'Resources'`` was a substring check on the
    # string, so unrelated keys such as 'R' or 'es' would also have been
    # inherited; only the 'Resources' key itself is meant.
    if 'Resources' in parent and 'Resources' not in tree:
        tree['Resources'] = parent['Resources']
    tree_type = tree.get('Type')
    if tree_type is LITERAL_PAGES and 'Kids' in tree:
        for c in list_value(tree['Kids']):
            for x in search(c, tree):
                yield x
    elif tree_type is LITERAL_PAGE:
        yield (objid, tree)
def search(obj, parent):
    """Walk the page tree below *obj*, yielding ``(objid, attrs)`` per page.

    obj: an object id (int) or an object exposing ``objid``.
    parent: attribute dictionary of the parent node; entries listed in
        ``klass.INHERITABLE_ATTRS`` are copied into nodes lacking them.
    """
    if isinstance(obj, int):
        objid = obj
        source = document.getobj(objid)
    else:
        objid = obj.objid
        source = obj
    tree = dict_value(source).copy()
    for (name, value) in parent.iteritems():
        if name in klass.INHERITABLE_ATTRS:
            # Keep the node's own value when it already has one.
            tree.setdefault(name, value)
    node_type = tree.get('Type')
    if node_type is LITERAL_PAGES and 'Kids' in tree:
        if 1 <= debug:
            sys.stderr.write('Pages: Kids=%r\n' % tree['Kids'])
        for kid in list_value(tree['Kids']):
            for found in search(kid, tree):
                yield found
    elif node_type is LITERAL_PAGE:
        if 1 <= debug:
            sys.stderr.write('Page: %r\n' % tree)
        yield (objid, tree)
def load_trailer(self, parser):
    """Read the trailer dictionary from *parser* and merge it into ``self.trailer``.

    The next token must be the trailer keyword (checked with ``assert``).
    If the parser hits end of file, the last object left on its stack is
    used instead; PDFNoValidXRef is raised when there is none.
    """
    try:
        (_, keyword) = parser.nexttoken()
        assert keyword is self.KEYWORD_TRAILER
        (_, dic) = parser.nextobject()
    except PSEOF:
        leftover = parser.pop(1)
        if not leftover:
            raise PDFNoValidXRef('Unexpected EOF - file corrupted')
        (_, dic) = leftover[0]
    trailer_dict = dict_value(dic)
    self.trailer.update(trailer_dict)
    return
def lookup_name(self, cat, key):
    """Look up *key* in the name tree stored under category *cat*.

    Raises KeyError((cat, key)) when the catalog has no usable 'Names'
    dictionary or when no matching entry is found; a missing category
    raises KeyError from the dictionary lookup itself.
    """
    try:
        name_trees = dict_value(self.catalog['Names'])
    except (PDFTypeError, KeyError):
        raise KeyError((cat, key))
    # may raise KeyError
    root = dict_value(name_trees[cat])

    def lookup(node):
        if 'Limits' in node:
            (lower, upper) = list_value(node['Limits'])
            if key < lower or upper < key:
                # The key lies outside this subtree's range.
                return None
        if 'Names' in node:
            # Leaf: a flat [key1, value1, key2, value2, ...] array.
            pairs = choplist(2, list_value(node['Names']))
            return dict(pairs)[key]
        if 'Kids' in node:
            for kid in list_value(node['Kids']):
                found = lookup(dict_value(kid))
                if found:
                    return found
        raise KeyError((cat, key))

    return lookup(root)
def load_trailer(self, parser):
    """Read the trailer dictionary from *parser* and merge it into ``self.trailer``.

    The next token must be the trailer keyword (checked with ``assert``).
    If the parser hits end of file, the last object left on its stack is
    used instead; PDFNoValidXRef is raised when there is none.
    """
    try:
        (_, keyword) = parser.nexttoken()
        assert keyword is self.KEYWORD_TRAILER
        (_, dic) = parser.nextobject()
    except PSEOF:
        leftover = parser.pop(1)
        if not leftover:
            raise PDFNoValidXRef('Unexpected EOF - file corrupted')
        (_, dic) = leftover[0]
    trailer_dict = dict_value(dic)
    self.trailer.update(trailer_dict)
    return
def lookup(d):
    """Search name-tree node *d* for ``key`` (taken from the enclosing scope).

    Returns None when ``key`` falls outside the node's 'Limits'; raises
    KeyError((cat, key)) when the node yields no match.
    """
    if 'Limits' in d:
        (lower, upper) = list_value(d['Limits'])
        if key < lower or upper < key:
            return None
    if 'Names' in d:
        # Leaf: a flat [key1, value1, key2, value2, ...] array.
        pairs = choplist(2, list_value(d['Names']))
        return dict(pairs)[key]
    if 'Kids' in d:
        for kid in list_value(d['Kids']):
            found = lookup(dict_value(kid))
            if found:
                return found
    raise KeyError((cat, key))
def search(obj, parent):
    """Walk the page tree below *obj*, yielding ``(objid, attrs, pageno)`` per page.

    obj: an object id (int) or an object exposing ``objid``.
    parent: attribute dictionary of the parent node; entries listed in
        ``klass.INHERITABLE_ATTRS`` are copied into nodes lacking them.

    Each page found increments the global ``pageno`` and appends
    ``(objid, pageno)`` to the global ``result_pages``.
    """
    global pageno
    global result_pages
    if isinstance(obj, int):
        objid = obj
        source = document.getobj(objid)
    else:
        objid = obj.objid
        source = obj
    tree = dict_value(source).copy()
    for (name, value) in parent.iteritems():
        if name in klass.INHERITABLE_ATTRS:
            # Keep the node's own value when it already has one.
            tree.setdefault(name, value)
    node_type = tree.get('Type')
    if node_type is LITERAL_PAGES and 'Kids' in tree:
        if klass.debug:
            logging.info('Pages: Kids=%r' % tree['Kids'])
        for kid in list_value(tree['Kids']):
            for found in search(kid, tree):
                yield found
    elif node_type is LITERAL_PAGE:
        if klass.debug:
            logging.info('Page: %r' % tree)
        pageno += 1
        result_pages.append((objid, pageno))
        yield (objid, tree, pageno)
def lookup(d):
    """Search name-tree node *d* for ``key`` (taken from the enclosing scope).

    Returns None when ``key`` falls outside the node's 'Limits'; raises
    KeyError((cat, key)) when the node yields no match.
    """
    if 'Limits' in d:
        (lower, upper) = list_value(d['Limits'])
        if key < lower or upper < key:
            return None
    if 'Names' in d:
        # Leaf: a flat [key1, value1, key2, value2, ...] array.
        pairs = choplist(2, list_value(d['Names']))
        return dict(pairs)[key]
    if 'Kids' in d:
        for kid in list_value(d['Kids']):
            found = lookup(dict_value(kid))
            if found:
                return found
    raise KeyError((cat, key))
def lookup_name(self, cat, key):
    """Look up *key* in the name tree stored under category *cat*.

    Raises KeyError((cat, key)) when the catalog has no usable 'Names'
    dictionary or when no matching entry is found; a missing category
    raises KeyError from the dictionary lookup itself.
    """
    try:
        name_trees = dict_value(self.catalog['Names'])
    except (PDFTypeError, KeyError):
        raise KeyError((cat, key))
    # may raise KeyError
    root = dict_value(name_trees[cat])

    def lookup(node):
        if 'Limits' in node:
            (lower, upper) = list_value(node['Limits'])
            if key < lower or upper < key:
                # The key lies outside this subtree's range.
                return None
        if 'Names' in node:
            # Leaf: a flat [key1, value1, key2, value2, ...] array.
            pairs = choplist(2, list_value(node['Names']))
            return dict(pairs)[key]
        if 'Kids' in node:
            for kid in list_value(node['Kids']):
                found = lookup(dict_value(kid))
                if found:
                    return found
        raise KeyError((cat, key))

    return lookup(root)
class PDFCIDFont(PDFFont):
    """CID-keyed font built from a font specification dictionary."""

    def __init__(self, rsrc, spec):
        """Read base font, CID system info, encoding CMap and Unicode mapping.

        rsrc: object providing ``get_cmap(name, strict=...)``.
        spec: the font dictionary.

        Missing 'BaseFont', 'Encoding' or 'FontDescriptor' entries raise
        PDFFontError in STRICT mode and fall back to defaults otherwise.
        A CMap that cannot be found is reported as PDFFontError.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        registry = self.cidsysteminfo.get('Registry', 'unknown')
        ordering = self.cidsysteminfo.get('Ordering', 'unknown')
        self.cidcoding = '%s-%s' % (registry, ordering)
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
            self.cmap = rsrc.get_cmap(name, strict=STRICT)
        except CMapDB.CMapNotFound as e:
            raise PDFFontError(e)
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            font_data = StringIO(self.fontfile.get_data())
            ttf = TrueTypeFont(self.basefont, font_data)
        # Pick the code-to-Unicode mapping: an explicit ToUnicode stream
        # first, then the embedded TrueType cmap for Adobe-Identity, and
        # otherwise the predefined "<cidcoding>-UCS2" CMap.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            ucs2_name = '%s-UCS2' % self.cidcoding
            try:
                self.ucs2_cmap = rsrc.get_cmap(ucs2_name, strict=STRICT)
            except CMapDB.CMapNotFound as e:
                raise PDFFontError(e)
def search(obj, parent):
    """Walk the page tree below *obj*, yielding ``(objid, attrs)`` per page.

    obj: an object exposing ``objid`` that resolves to a page-tree node.
    parent: attribute dictionary of the parent node; entries listed in
        ``self.INHERITABLE_ATTRS`` are copied into nodes lacking them.
    """
    tree = dict_value(obj).copy()
    for (k, v) in parent.iteritems():
        if k in self.INHERITABLE_ATTRS and k not in tree:
            tree[k] = v
    tree_type = tree.get('Type')
    if tree_type is LITERAL_PAGES and 'Kids' in tree:
        if 1 <= self.debug:
            stderr.write('Pages: Kids=%r\n' % tree['Kids'])
        # Resolve 'Kids' through list_value so that an indirect reference
        # to the array is handled instead of failing on iteration.
        for c in list_value(tree['Kids']):
            for x in search(c, tree):
                yield x
    elif tree_type is LITERAL_PAGE:
        if 1 <= self.debug:
            stderr.write('Page: %r\n' % tree)
        yield (obj.objid, tree)
def search(obj, parent):
    """Walk the page tree below *obj*, yielding ``(objid, attrs)`` per page.

    obj: an object exposing ``objid`` that resolves to a page-tree node.
    parent: attribute dictionary of the parent node; entries listed in
        ``self.INHERITABLE_ATTRS`` are copied into nodes lacking them.
    """
    tree = dict_value(obj).copy()
    for (k, v) in parent.iteritems():
        if k in self.INHERITABLE_ATTRS and k not in tree:
            tree[k] = v
    tree_type = tree.get('Type')
    if tree_type is LITERAL_PAGES and 'Kids' in tree:
        if 1 <= self.debug:
            stderr.write('Pages: Kids=%r\n' % tree['Kids'])
        # Resolve 'Kids' through list_value so that an indirect reference
        # to the array is handled instead of failing on iteration.
        for c in list_value(tree['Kids']):
            for x in search(c, tree):
                yield x
    elif tree_type is LITERAL_PAGE:
        if 1 <= self.debug:
            stderr.write('Page: %r\n' % tree)
        yield (obj.objid, tree)
def init_process_pdf(fp, password=''):
    """Open the PDF in *fp* and return ``(document, page_count)``.

    fp: file object handed to the parser.
    password: document password (empty string when none is set).

    Raises PDFTextExtractionNotAllowed when the document does not allow
    text extraction.  The page count is the 'Count' entry of the catalog's
    'Pages' dictionary.
    """
    # Build the parser for the file and the document that stores the
    # structure, then link the two in both directions.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize with the supplied password.
    doc.initialize(password)
    # Abort when text extraction is not permitted.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Extraction is not allowed: %r' % fp)
    pages_dict = dict_value(doc.catalog.get("Pages"))
    page_count = num_value(pages_dict.get("Count"))
    return (doc, page_count)
def search(entry, level):
    """Yield ``(level, title, dest, action, se)`` for outline items from *entry* on.

    An item is reported only when it has a 'Title' and at least one of 'A'
    or 'Dest'.  Children (via 'First', when 'Last' is also present) are
    visited one level deeper, then the 'Next' sibling at the same level.
    """
    entry = dict_value(entry)
    has_target = 'A' in entry or 'Dest' in entry
    if 'Title' in entry and has_target:
        title = decode_text(str_value(entry['Title']))
        yield (level, title, entry.get('Dest'), entry.get('A'), entry.get('SE'))
    if 'First' in entry and 'Last' in entry:
        for item in search(entry['First'], level + 1):
            yield item
    if 'Next' in entry:
        for item in search(entry['Next'], level):
            yield item
    return
def __init__(self, rsrc, spec):
    """Set up the font from *spec*, synthesising a descriptor when absent.

    Without a 'FontDescriptor' entry a minimal descriptor is built from
    the font's 'Name' and 'FontBBox'.  Ascent and descent are then taken
    from the bounding box, and the scale factors from 'FontMatrix'.
    """
    firstchar = int_value(spec.get('FirstChar', 0))
    # Evaluated as in the original; the value is not used further here.
    lastchar = int_value(spec.get('LastChar', 0))
    width_list = list_value(spec.get('Widths', [0] * 256))
    # Map character code -> width, starting at the first character code.
    widths = dict(enumerate(width_list, firstchar))
    if 'FontDescriptor' in spec:
        descriptor = dict_value(spec['FontDescriptor'])
    else:
        descriptor = {
            'FontName': spec.get('Name'),
            'Ascent': 0,
            'Descent': 0,
            'FontBBox': spec['FontBBox'],
        }
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    self.matrix = tuple(list_value(spec.get('FontMatrix')))
    (_, self.descent, _, self.ascent) = self.bbox
    (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
    return
def search(entry, level):
    """Yield ``(level, title, dest, action, se)`` for outline items from *entry* on.

    An item is reported only when it has a 'Title' and at least one of 'A'
    or 'Dest'.  Children (via 'First', when 'Last' is also present) are
    visited one level deeper, then the 'Next' sibling at the same level.
    """
    entry = dict_value(entry)
    has_target = 'A' in entry or 'Dest' in entry
    if 'Title' in entry and has_target:
        title = decode_text(str_value(entry['Title']))
        yield (level, title, entry.get('Dest'), entry.get('A'), entry.get('SE'))
    if 'First' in entry and 'Last' in entry:
        for item in search(entry['First'], level + 1):
            yield item
    if 'Next' in entry:
        for item in search(entry['Next'], level):
            yield item
    return
def get_font(self, objid, spec):
    """Return the font object for *spec*, using the cache when possible.

    objid: object id used as the cache key; a falsy value bypasses the cache.
    spec: the font dictionary.

    The font class is chosen from the 'Subtype' entry.  Type0 fonts are
    resolved through their first descendant font, which inherits the
    'Encoding' and 'ToUnicode' entries.  In strict mode PDFFontError is
    raised for a wrong 'Type', a missing 'Subtype' or an unknown subtype;
    otherwise a Type1 font is assumed.
    """
    if objid and objid in self._cached_fonts:
        return self._cached_fonts[objid]
    if settings.STRICT:
        if spec['Type'] is not LITERAL_FONT:
            raise PDFFontError('Type is not /Font')
    # Create a Font object.
    if 'Subtype' in spec:
        subtype = literal_name(spec['Subtype'])
    else:
        if settings.STRICT:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = 'Type1'
    if subtype in ('Type1', 'MMType1'):
        # Type1 Font
        font = PDFType1Font(self, spec)
    elif subtype == 'TrueType':
        # TrueType Font
        font = PDFTrueTypeFont(self, spec)
    elif subtype == 'Type3':
        # Type3 Font
        font = PDFType3Font(self, spec)
    elif subtype in ('CIDFontType0', 'CIDFontType2'):
        # CID Font - ensure references nested inside CIDSystemInfo have
        # been resolved.  The entry may be missing altogether; indexing it
        # unconditionally used to raise KeyError in that case.
        cid_info = spec.get('CIDSystemInfo')
        if isinstance(cid_info, dict):
            for (k, v) in list(cid_info.items()):
                if isinstance(v, PDFObjRef):
                    cid_info[k] = v.resolve()
        font = PDFCIDFont(self, spec)
    elif subtype == 'Type0':
        # Type0 Font
        dfonts = list_value(spec['DescendantFonts'])
        assert dfonts
        subspec = dict_value(dfonts[0]).copy()
        for k in ('Encoding', 'ToUnicode'):
            if k in spec:
                subspec[k] = resolve1(spec[k])
        font = self.get_font(None, subspec)
    else:
        if settings.STRICT:
            raise PDFFontError('Invalid Font spec: %r' % spec)
        font = PDFType1Font(self, spec)
    if objid and self.caching:
        self._cached_fonts[objid] = font
    return font
def __init__(self, rsrc, spec):
    """Set up the font, preferring metrics from FontMetricsDB.

    When FontMetricsDB has no entry for the base font, the descriptor and
    the widths are taken from *spec* instead.  A missing 'BaseFont' raises
    PDFFontError in STRICT mode and becomes 'unknown' otherwise.
    """
    try:
        self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
        if STRICT:
            raise PDFFontError('BaseFont is missing')
        self.basefont = 'unknown'
    try:
        (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
    except KeyError:
        descriptor = dict_value(spec.get('FontDescriptor', {}))
        firstchar = int_value(spec.get('FirstChar', 0))
        # Evaluated as in the original; the value is not used further here.
        lastchar = int_value(spec.get('LastChar', 255))
        width_list = list_value(spec.get('Widths', [0] * 256))
        # Map character code -> width, starting at the first character code.
        widths = dict(enumerate(width_list, firstchar))
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
def __init__(self, rsrc, spec):
    """Set up the font, preferring metrics from FontMetricsDB.

    When FontMetricsDB has no entry for the base font, the descriptor and
    the widths are taken from *spec* instead.  A missing 'BaseFont' raises
    PDFFontError in STRICT mode and becomes 'unknown' otherwise.
    """
    try:
        self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
        if STRICT:
            raise PDFFontError('BaseFont is missing')
        self.basefont = 'unknown'
    try:
        (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
    except KeyError:
        descriptor = dict_value(spec.get('FontDescriptor', {}))
        firstchar = int_value(spec.get('FirstChar', 0))
        # Evaluated as in the original; the value is not used further here.
        lastchar = int_value(spec.get('LastChar', 255))
        width_list = list_value(spec.get('Widths', [0] * 256))
        # Map character code -> width, starting at the first character code.
        widths = dict(enumerate(width_list, firstchar))
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
def __init__(self, rsrc, spec):
    """Set up the font from *spec*, synthesising a descriptor when absent.

    Without a 'FontDescriptor' entry a minimal descriptor is built from
    the font's 'Name' and 'FontBBox'.  Ascent and descent are then taken
    from the bounding box, and the scale factors from 'FontMatrix'.
    """
    firstchar = int_value(spec.get('FirstChar', 0))
    # Evaluated as in the original; the value is not used further here.
    lastchar = int_value(spec.get('LastChar', 0))
    width_list = list_value(spec.get('Widths', [0] * 256))
    # Map character code -> width, starting at the first character code.
    widths = dict(enumerate(width_list, firstchar))
    if 'FontDescriptor' in spec:
        descriptor = dict_value(spec['FontDescriptor'])
    else:
        descriptor = {
            'FontName': spec.get('Name'),
            'Ascent': 0,
            'Descent': 0,
            'FontBBox': spec['FontBBox'],
        }
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    self.matrix = tuple(list_value(spec.get('FontMatrix')))
    (_, self.descent, _, self.ascent) = self.bbox
    (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
    return
def get_font(self, objid, spec):
    """Return the font object for *spec*, caching it under *objid*.

    objid: object id used as the cache key; a falsy value bypasses the cache.
    spec: the font dictionary.

    The font class is chosen from the 'Subtype' entry.  Type0 fonts are
    resolved through their first descendant font, which inherits the
    'Encoding' and 'ToUnicode' entries.  In STRICT mode PDFFontError is
    raised for a wrong 'Type', a missing 'Subtype' or an unknown subtype.
    """
    if objid and objid in self.fonts:
        return self.fonts[objid]
    if STRICT:
        if spec['Type'] is not LITERAL_FONT:
            raise PDFFontError('Type is not /Font')
    # Determine the subtype, defaulting to Type1 outside STRICT mode.
    if 'Subtype' in spec:
        subtype = literal_name(spec['Subtype'])
    else:
        if STRICT:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = 'Type1'
    if subtype in ('Type1', 'MMType1'):
        font = PDFType1Font(self, spec)
    elif subtype == 'TrueType':
        font = PDFTrueTypeFont(self, spec)
    elif subtype == 'Type3':
        font = PDFType3Font(self, spec)
    elif subtype in ('CIDFontType0', 'CIDFontType2'):
        font = PDFCIDFont(self, spec)
    elif subtype == 'Type0':
        descendants = list_value(spec['DescendantFonts'])
        assert descendants
        subspec = dict_value(descendants[0]).copy()
        for inherited in ('Encoding', 'ToUnicode'):
            if inherited in spec:
                subspec[inherited] = resolve1(spec[inherited])
        font = self.get_font(None, subspec)
    else:
        if STRICT:
            raise PDFFontError('Invalid Font spec: %r' % spec)
        # Unknown subtype: fall back to Type1 (the original author
        # flagged this fallback as wrong).
        font = PDFType1Font(self, spec)
    if objid:
        self.fonts[objid] = font
    return font
def __init__(self, rsrc, spec):
    """Read the base font, CID system info and encoding CMap from *spec*.

    rsrc: object providing ``get_cmap(name, strict=...)``.
    spec: the font dictionary.

    Missing 'BaseFont' or 'Encoding' entries raise PDFFontError in STRICT
    mode and become 'unknown' otherwise.  A CMap that cannot be found is
    reported as PDFFontError.
    """
    try:
        self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
        if STRICT:
            raise PDFFontError('BaseFont is missing')
        self.basefont = 'unknown'
    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
    registry = self.cidsysteminfo.get('Registry', 'unknown')
    ordering = self.cidsysteminfo.get('Ordering', 'unknown')
    self.cidcoding = '%s-%s' % (registry, ordering)
    try:
        name = literal_name(spec['Encoding'])
    except KeyError:
        if STRICT:
            raise PDFFontError('Encoding is unspecified')
        name = 'unknown'
    try:
        self.cmap = rsrc.get_cmap(name, strict=STRICT)
    except CMapDB.CMapNotFound as e:
        raise PDFFontError(e)
def __init__(self, rsrc, spec):
    """Read the base font, CID system info and encoding CMap from *spec*.

    rsrc: object providing ``get_cmap(name, strict=...)``.
    spec: the font dictionary.

    Missing 'BaseFont' or 'Encoding' entries raise PDFFontError in STRICT
    mode and become 'unknown' otherwise.  A CMap that cannot be found is
    reported as PDFFontError.
    """
    try:
        self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
        if STRICT:
            raise PDFFontError('BaseFont is missing')
        self.basefont = 'unknown'
    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
    registry = self.cidsysteminfo.get('Registry', 'unknown')
    ordering = self.cidsysteminfo.get('Ordering', 'unknown')
    self.cidcoding = '%s-%s' % (registry, ordering)
    try:
        name = literal_name(spec['Encoding'])
    except KeyError:
        if STRICT:
            raise PDFFontError('Encoding is unspecified')
        name = 'unknown'
    try:
        self.cmap = rsrc.get_cmap(name, strict=STRICT)
    except CMapDB.CMapNotFound as e:
        raise PDFFontError(e)
def __init__(self, doc, pageid, attrs):
    """Initialize a page object.

    doc: the owning document.
    pageid: identifier of the page.
    attrs: a dictionary of page attributes.

    'MediaBox' must be present in *attrs*.  A missing 'Resources' entry
    becomes an empty dictionary, 'CropBox' defaults to the media box, and
    'Rotate' is normalised to the range 0..359.  ``contents`` is always a
    list.
    """
    self.doc = doc
    self.pageid = pageid
    self.attrs = dict_value(attrs)
    page_attrs = self.attrs
    self.lastmod = resolve1(page_attrs.get('LastModified'))
    self.resources = resolve1(page_attrs.get('Resources', dict()))
    self.mediabox = resolve1(page_attrs['MediaBox'])
    if 'CropBox' not in page_attrs:
        self.cropbox = self.mediabox
    else:
        self.cropbox = resolve1(page_attrs['CropBox'])
    rotation = int_value(page_attrs.get('Rotate', 0))
    self.rotate = (rotation + 360) % 360
    self.annots = page_attrs.get('Annots')
    self.beads = page_attrs.get('B')
    contents = []
    if 'Contents' in page_attrs:
        contents = resolve1(page_attrs['Contents'])
    # A single content stream is wrapped so that callers always get a list.
    if not isinstance(contents, list):
        contents = [contents]
    self.contents = contents
    return
def __validateAnnotationDictionary(self, dict, id):
    """Check an annotation dictionary and report violations via ``self.__write``.

    dict: the annotation dictionary.
    id: identifier of the annotation, used in the messages.

    Reports an unlisted Subtype, a CA value other than 1.0 and the
    presence of an F entry.  Widget annotations are passed on to
    ``__validateWidgetAnnotation``; for all others the "A" action, if
    present, is validated.
    """
    # 6.5
    allowed_subtypes = (
        "Text", "Link", "FreeText", "Line", "Square", "Circle",
        "Highlight", "Underline", "Squiggly", "StrikeOut", "Stamp", "Ink",
        "Popup", "Widget", "PrinterMark", "TrapNet",
    )
    label = "Annotation dictionary " + str(id)
    subtype = literal_name(dict.get("Subtype"))
    if subtype not in allowed_subtypes:
        # The original message lacked the space before "contains".
        self.__write(label + " contains invalid Subtype entry")
    opacity = dict.get("CA")
    if opacity is not None:
        if num_value(opacity) != 1.0:
            self.__write(label + " contains CA entry which value isn't 1.0")
    if dict.get("F") is not None:
        self.__write(label + " contains F entry")
    if subtype == "Widget":
        self.__validateWidgetAnnotation(dict, id)
    else:
        action = dict.get("A")
        if action is not None:
            self.__validateAction(dict_value(action),
                                  "in annotation " + str(id))
def __init__(self, doc, pageid, attrs):
    """Initialize a page object.

    doc: the owning document.
    pageid: identifier of the page.
    attrs: a dictionary of page attributes.

    'MediaBox' must be present in *attrs*.  A missing 'Resources' entry
    becomes an empty dictionary, 'CropBox' defaults to the media box, and
    'Rotate' is normalised to the range 0..359.  ``contents`` is always a
    list.
    """
    self.doc = doc
    self.pageid = pageid
    self.attrs = dict_value(attrs)
    self.lastmod = resolve1(self.attrs.get('LastModified'))
    # Tolerate pages that lack /Resources instead of raising KeyError.
    self.resources = resolve1(self.attrs.get('Resources', dict()))
    self.mediabox = resolve1(self.attrs['MediaBox'])
    if 'CropBox' in self.attrs:
        self.cropbox = resolve1(self.attrs['CropBox'])
    else:
        self.cropbox = self.mediabox
    # Resolve and normalise the rotation, matching the other page
    # initialisers in this code base; the raw attribute could be an
    # unresolved reference or a negative angle.
    self.rotate = (int_value(self.attrs.get('Rotate', 0)) + 360) % 360
    self.annots = self.attrs.get('Annots')
    self.beads = self.attrs.get('B')
    if 'Contents' in self.attrs:
        contents = resolve1(self.attrs['Contents'])
    else:
        contents = []
    if not isinstance(contents, list):
        contents = [contents]
    self.contents = contents
    return
def __validateField(self, field, id):
    """Report an AA entry in a form field and recurse into its kids.

    field: the field dictionary.
    id: identifier of the field, used in the message.
    """
    # 6.6.1, 6.6.2, 6.9
    if field.get("AA") is not None:
        self.__write("Field dictionary " + str(id) + " contains AA entry")
    for kid in list_value(field.get("Kids")):
        # A directly embedded kid dictionary has no ``objid``; use None as
        # the identifier instead of raising AttributeError.
        self.__validateField(dict_value(kid), getattr(kid, "objid", None))
def __validatePattern(self, dict, id):
    """Validate the ExtGState of a pattern whose PatternType is 2.

    dict: the pattern dictionary.
    id: name of the pattern, used in the message context.

    Patterns of any other type are ignored.
    """
    if num_value(dict.get("PatternType")) != 2:
        return
    ext_gstate = dict_value(dict.get("ExtGState"))
    context = "in " + str(id) + " pattern"
    self.__validateGraphicsStateParameterDictionary(ext_gstate, context)
def do_keyword(self, pos, token):
    """Handle keyword *token* found at position *pos*.

    xref/startxref and endobj flush results from the stack, ``R`` builds an
    indirect object reference, ``stream`` reads the stream body and pushes
    a PDFStream, and any other keyword is pushed back unchanged.
    """
    if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
        self.add_results(*self.pop(1))
    elif token is self.KEYWORD_ENDOBJ:
        self.add_results(*self.pop(4))
    elif token is self.KEYWORD_R:
        # reference to indirect object
        try:
            ((_, objid), (_, genno)) = self.pop(2)
            (objid, genno) = (int(objid), int(genno))
            self.push((pos, PDFObjRef(self.doc, objid, genno)))
        except PSSyntaxError:
            pass
    elif token is self.KEYWORD_STREAM:
        # stream object
        ((_, dic),) = self.pop(1)
        dic = dict_value(dic)
        try:
            objlen = int_value(dic['Length'])
        except KeyError:
            if STRICT:
                raise PDFSyntaxError('/Length is undefined: %r' % dic)
            objlen = 0
        self.seek(pos)
        try:
            (_, line) = self.nextline()  # 'stream'
        except PSEOF:
            if STRICT:
                raise PDFSyntaxError('Unexpected EOF')
            return
        pos += len(line)
        self.fp.seek(pos)
        data = self.fp.read(objlen)
        self.seek(pos + objlen)
        # Keep reading lines until 'endstream' shows up, appending whatever
        # precedes it to the data read so far.
        while True:
            try:
                (_, line) = self.nextline()
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                break
            end = line.find('endstream')
            if 0 <= end:
                objlen += end
                data += line[:end]
                break
            objlen += len(line)
            data += line
        self.seek(pos + objlen)
        if 1 <= self.debug:
            stderr.write(
                'Stream: pos=%d, objlen=%d, dic=%r, data=%r...\n' %
                (pos, objlen, dic, data[:10]))
        self.push((pos, PDFStream(dic, data, self.doc.decipher)))
    else:
        # others
        self.push((pos, token))
    return
def __initializePTree(self, doc):
    # Build the tree rooted at self.__ptree: one child node per page and,
    # under each page, one node per font and per image XObject.
    #
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed source.  In particular, the mask bookkeeping and
    # the final mask fix-up loop are presumed to run once per page --
    # confirm against the original file.
    self.__ptree.label = "Document"
    i = 1
    for p in doc.get_pages():
        child = PTree()
        child.label = "Page " + str(i)
        # Remember which page id belongs to this 1-based page number.
        self.__pagenos.setdefault(i, p.pageid)
        i += 1
        child.data = p.pageid
        self.__ptree.children.append(child)
        child.parent = self.__ptree
        fonts = dict_value(p.resources.get("Font"))
        images = dict_value(p.resources.get("XObject"))
        for (fontid, spec) in fonts.iteritems():
            # TODO: will this always be a reference?
            objid = spec.objid
            spec = dict_value(spec)
            child2 = PTree()
            child2.label = "Font " + str(fontid)
            child2.data = Font.new(spec, None, p.pageid, child2, gui=self.__gui, map=self.__map)
            assert (child2.data.name != None)
            child.children.append(child2)
            child2.parent = child
        # maskMap: object id of a "Mask" stream -> object id of the image
        # that refers to it; masks: tree nodes whose image is an image mask.
        maskMap = {}
        masks = []

        def __isMask(spec):
            # True when the stream's ImageMask entry is present and equals 1.
            spec = stream_value(spec)
            if spec.get("ImageMask") == None:
                return False
            else:
                return num_value(spec.get("ImageMask")) == 1

        def __hasMask(spec):
            # Records the Mask -> image relation in maskMap when the "Mask"
            # entry passes stream_value2; the return value is not used by
            # the caller (that branch returns None implicitly).
            if stream_value(spec).get("Mask") == None:
                return False
            elif stream_value2(stream_value(spec).get("Mask")) != None:
                # TODO: NOTE: pdfminer does not handle genno
                maskMap.setdefault(
                    stream_value(spec).get("Mask").objid, spec.objid)
            else:
                return False

        for (objname, spec) in images.iteritems():
            # TODO: will this always be a reference?
            objid = spec.objid
            isMask = False
            if __isMask(spec):
                isMask = True
            spec = stream_value(spec)
            __hasMask(spec)
            if literal_name(spec.get("Subtype")) == "Image":
                child2 = PTree()
                child2.label = "Image " + str(objname)
                # i was already incremented above, so i - 1 is this page's
                # number.
                child2.data = (spec, i - 1, objid, 0)
                child.children.append(
                    child2)  # TODO: NOTE: pdfminer does not support genno
                child2.parent = child
                if isMask:
                    masks.append(child2)
        # Replace the object id stored in each mask node by the id recorded
        # for it in maskMap, when there is one.
        for mask in masks:
            (a, b, c, d) = mask.data
            objid = maskMap.get(c)
            if objid != None:
                mask.data = (a, b, objid, d)
def getValue(self, props, key):
    """Return the value stored under *key* in the property list *props*.

    *props* is first used as a dictionary.  If that raises AttributeError,
    it is treated as a name and looked up in the "Properties" entry of
    ``self.resources``.
    """
    try:
        # Dictionary given inline in the content stream.
        return props.get(key)
    except AttributeError:
        # Dictionary referenced by name from the resources.
        properties = self.resources.get("Properties")
        named_props = properties.get(literal_name(props))
        return dict_value(named_props).get(key)