class Lexicon(Persistent):
    """
    Implementation of :class:`zope.index.text.interfaces.ILexicon`.

    Maps words to integer word ids (wids) and back, after running the
    input through a configurable processing pipeline.
    """

    def __init__(self, *pipeline):
        """Create an empty lexicon.

        Each positional argument is a pipeline element; elements are
        applied in order and must provide a ``process`` method (and may
        provide ``processGlob``, see :meth:`parseTerms`).
        """
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.wordCount = Length()
        self._pipeline = pipeline

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        # overridden per instance: __init__ (and sourceToWordIds, for
        # instances that lack it) shadows this method with a Length object.
        return len(self._wids)

    def words(self):
        """Return the keys of the word -> wid mapping."""
        return self._wids.keys()

    def wids(self):
        """Return the keys of the wid -> word mapping."""
        return self._words.keys()

    def items(self):
        """Return the (word, wid) items of the word -> wid mapping."""
        return self._wids.items()

    def sourceToWordIds(self, text):
        """Return the wids for ``text``, creating wids for unseen words.

        ``None`` is treated like the empty string.  The text is converted
        by ``_text2list`` and passed through every pipeline element before
        the resulting words are looked up or added.
        """
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        """Return the wids for ``text`` without modifying the lexicon.

        Words that are not in the lexicon are reported as wid 0.
        """
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        """Run ``text`` through the pipeline and return the result.

        An element's ``processGlob`` method is preferred over ``process``
        when the element defines one.
        """
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        """Return true if ``word`` contains a ``*`` or ``?`` character."""
        return "*" in word or "?" in word

    def get_word(self, wid):
        """Return the word for ``wid``; raises ``KeyError`` if unknown."""
        return self._words[wid]

    def get_wid(self, word):
        """Return the wid for ``word``, or 0 if the word is unknown."""
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        """Return the wids of all words matching the glob ``pattern``.

        Implements ``*`` and ``?`` just as in the shell, except that the
        pattern must not start with either of them; such a pattern raises
        ``QueryError``.  A pattern without glob characters is an exact
        lookup.
        """
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        # The prefix is literal text: escape it so that regex
        # metacharacters in it (".", "+", "(", ...) are not interpreted.
        pat = re.escape(prefix)
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            # Keys are sorted, so the first key without the prefix ends
            # the range of candidates.
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        """Return the wid for ``word``, allocating a new one if needed."""
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        """Advance the word counter and return its value as a fresh wid."""
        count = self.wordCount
        count.change(1)
        while count() in self._words:  # just to be safe
            count.change(1)
        return count()
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum: documentId}
    self._unindex = {documentId: datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        """Reset the index to an empty state."""
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()

    def numObjects(self):
        """Return the number of indexed objects.

        Since we have a 1:1 mapping from documents to values, we can reuse
        the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)

        An empty list is returned when ``name`` is given and is not the
        id of this index.
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return []

        if not withLengths:
            return tuple(self._index.keys())
        # We know the length for each value is one
        return [(k, 1) for k in self._index.keys()]

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.

        ``None`` entries are ignored.  If the entry is already mapped to
        a different document, the existing mapping is kept and an error
        is logged.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            # Hand the value to the logger as an argument instead of
            # formatting eagerly: "..." % entry raises TypeError when the
            # entry happens to be a tuple.
            logger.error("A different document with value '%s' already "
                         "exists in the index.'", entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.

        The entry is left untouched when it is mapped to a different
        document: a document whose value was rejected as a duplicate by
        ``insertForwardIndexEntry`` must not remove the rightful owner's
        entry when it is unindexed.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            return
        if old_docid != documentId:
            return
        del self._index[entry]
        self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        """Return the value to index for ``obj``, ignoring acquisition.

        Returns ``_marker`` when the unwrapped object itself does not
        have the attribute.
        """
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
class Indexer(object):
    """Maintain and query a text index over MH mail folders.

    The index and its bookkeeping (docid -> path, docid -> mtime, watched
    folders) live in a ZODB ``FileStorage``; messages are read through
    ``mhlib``.
    """

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        """Open the database at ``datafs`` and load or create the index.

        ``writable`` -- open the storage read-write when true.
        ``trans`` -- commit after this many changes (0 disables, see
        :meth:`maycommit`).
        ``pack`` -- pack after this many commits (0 disables, see
        :meth:`commit`).
        """
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        # The reverse mapping is not stored; rebuild it from docpaths.
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            # max() of an empty sequence: no documents yet
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def _wordfreq(self, index, wid):
        """Return the sum of the values recorded for ``wid`` in
        ``index._wordinfo`` (0 when the wid has no entry)."""
        return sum(index._wordinfo.get(wid, {}).values())

    def dumpfreqs(self):
        """Print every word of the lexicon, highest frequency first."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = self._wordfreq(index, wid)
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        """Print wid, frequency and word in ``lexicon.wids()`` order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = self._wordfreq(index, wid)
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        """Print wid, frequency and word in ``lexicon.words()`` order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = self._wordfreq(index, wid)
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        """Close connection, database and storage; safe to call twice."""
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        """Run an interactive query loop until end of input.

        An empty line shows the next ``nbest`` hits of the previous query;
        a line starting with ``/`` is handed to :meth:`specialcommand`.
        """
        try:
            # Imported only for its side effect on raw_input() editing.
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                # Deliberately broad: report any query failure and keep
                # the interactive session alive.
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        """Show one message of ``results`` with the MH ``show`` command.

        ``line`` is ``/`` optionally followed by a 1-based result number;
        without a number the result at index ``first`` is shown.
        """
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except ValueError:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        # NOTE(review): the command line is built by string interpolation
        # and run through the shell; folder names come from the index
        # database and are assumed to be trustworthy.
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        """Run one query and print up to ``nbest`` formatted results."""
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        """Query the index, print the elapsed times and return
        ``(results, n)`` as delivered by ``self.index.query``."""
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        """Print ``results[lo:hi]`` with headers and matching lines.

        For each hit the rank, score and path are printed, followed by a
        few headers and at most ``maxlines`` body lines that contain one
        of the (non-stopword) query words.
        """
        stopdict = self.stopdict
        words = [w for w in re.findall(r"\w+\*?", text.lower())
                 if w not in stopdict]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*")  # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank: %d Score: %d%% File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as err:
                print("Can't open:", err)
                continue
            # Everything that reads from the message happens inside the
            # try block so that the file is closed again afterwards.
            try:
                msg = mhlib.Message("<folder>", 0, fp)
                for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                    h = msg.getheader(header)
                    if h:
                        print("%-8s %s" % (header+":", h))
                parts = self.getmessagetext(msg)
            finally:
                fp.close()
            if parts:
                print()
                nleft = maxlines
                for part in parts:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        """Index the messages selected by ``args`` and commit.

        An argument starting with ``+`` names the folder (at most one is
        allowed); every other argument is an MH message sequence.  The
        defaults are the current MH folder and the sequence ``all``.
        """
        folder = None
        seqs = []
        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)
        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']
        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return
        # Collect the message numbers of all sequences without duplicates.
        wanted = set()
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            wanted.update(nums)
        msgs = sorted(wanted)
        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        """Prescan the given folders and register their words in the
        lexicon in order of decreasing frequency.

        The (at most) 100 most frequent words are printed.
        """
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        # Slice instead of indexing 0..99 so that fewer than 100 unique
        # words do not raise IndexError.
        for i, entry in enumerate(L[:100]):
            print("%3d. %6d %s" % ((i+1,) + entry))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        """Count the words of messages ``msgs`` of folder ``f`` into the
        dict ``uniqwords`` (word -> number of occurrences)."""
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        """Index all messages of the folders in ``args`` and commit.

        The pseudo folder ``ALL`` is replaced, in place, by the list of
        all MH folders.
        """
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        # Print the summary on a single line.
        print("Indexed", self.index.lexicon._nbytes, "bytes and", end=" ")
        print(self.index.lexicon._nwords, "words;", end=" ")
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        """(Re)index messages ``msgs`` of the open folder ``f`` and
        unindex paths of that folder whose file no longer exists."""
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist.
        # The scan has to start at "<folder>/": starting at "<folder>"
        # would first yield paths of sibling folders such as
        # "<folder>-old/1" ("-" and "." sort before "/"), and the loop
        # would stop before reaching any path of this folder.
        prefix = f.name + "/"
        for path in list(self.path2docid.keys(prefix)):
            if not path.startswith(prefix):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        """Remove ``path`` from the bookkeeping and the index.

        Unknown paths are ignored.
        """
        docid = self.path2docid.get(path)
        if docid is None:
            return
        print("unindexing", docid, path)
        del self.docpaths[docid]
        del self.doctimes[docid]
        del self.path2docid[path]
        try:
            self.index.unindex_doc(docid)
        except KeyError as msg:
            print("KeyError", msg)
        self.maycommit()

    def getmessagetext(self, m, name=None):
        """Return the indexable text of message ``m`` as a list of strings.

        When ``name`` is given, a ``_folder <name>`` pseudo text is added
        first.  A failure while extracting body parts is reported and
        whatever was collected up to then is returned.
        """
        L = []
        if name:
            L.append("_folder " + name)  # To restrict search to a folder
        self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            # Deliberately broad: a malformed message must not abort the
            # indexing run.
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        """Append the text parts of message ``m`` to ``L``, recursing into
        multipart and ``message/rfc822`` parts."""
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        """Append the from/to/cc/bcc/subject header values of ``m`` to
        ``L`` as one newline-joined string (nothing if all are empty)."""
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        """Return the docid for ``path``, allocating one if necessary.

        The recorded modification time of the path is refreshed in both
        cases.
        """
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        """Return the modification time of ``path`` (relative to the MH
        directory) as an int, or 0 if it cannot be stat'ed."""
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        """Count one change; commit once ``trans_limit`` (if positive) is
        reached."""
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        """Commit pending changes, if any; pack once ``pack_limit`` (if
        positive) commits have accumulated."""
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        """Pack the database if anything was committed since the last
        pack."""
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum: documentId}
    self._unindex = {documentId: datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        """Reset the index, including its change counter, to an empty
        state."""
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()
        self._counter = Length()

    def numObjects(self):
        """Return the number of indexed objects.

        Since we have a 1:1 mapping from documents to values, we can reuse
        the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)

        This is a generator.  It yields nothing when ``name`` is given
        and is not the id of this index.
        """
        if name is None:
            name = self.id
        elif name != self.id:
            # A plain return ends the generator.  Raising StopIteration
            # inside a generator body is turned into RuntimeError by
            # PEP 479 (Python 3.7+).
            return

        if not withLengths:
            for key in self._index.keys():
                yield key
        else:
            # We know the length for each value is one
            for key in self._index.keys():
                yield (key, 1)

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.

        ``None`` entries are ignored.  If the entry is already mapped to
        a different document, the existing mapping is kept and an error
        is logged.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            # Hand the value to the logger as an argument instead of
            # formatting eagerly: "..." % entry raises TypeError when the
            # entry happens to be a tuple.
            logger.error("A different document with value '%s' already "
                         "exists in the index.'", entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.

        The entry is left untouched when it is mapped to a different
        document: a document whose value was rejected as a duplicate by
        ``insertForwardIndexEntry`` must not remove the rightful owner's
        entry when it is unindexed.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            return
        if old_docid != documentId:
            return
        del self._index[entry]
        self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        """Return the value to index for ``obj``, ignoring acquisition.

        Returns ``_marker`` when the unwrapped object itself does not
        have the attribute.
        """
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
class Lexicon(Persistent):
    """Map words to integer word ids (wids) and back.

    Text is run through a configurable processing pipeline before it is
    looked up.  Wids are handed out sequentially, starting at 1.
    """

    implements(ILexicon)

    def __init__(self, *pipeline):
        """Create an empty lexicon.

        Each positional argument is a pipeline element; elements are
        applied in order and must provide a ``process`` method (and may
        provide ``processGlob``, see :meth:`parseTerms`).
        """
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self._nextwid = 1
        self._pipeline = pipeline

        # Keep some statistics about indexing
        self._nbytes = 0  # Number of bytes indexed (at start of pipeline)
        self._nwords = 0  # Number of words indexed (after pipeline)

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        return self._nextwid - 1

    def words(self):
        """Return the keys of the word -> wid mapping."""
        return self._wids.keys()

    def wids(self):
        """Return the keys of the wid -> word mapping."""
        return self._words.keys()

    def items(self):
        """Return the (word, wid) items of the word -> wid mapping."""
        return self._wids.items()

    def sourceToWordIds(self, text):
        """Return the list of wids for ``text``, creating wids for unseen
        words, and update the indexing statistics."""
        last = _text2list(text)
        for t in last:
            self._nbytes += len(t)
        for element in self._pipeline:
            last = element.process(last)
        self._nwords += len(last)
        # Build the list eagerly: a bare map() is lazy on Python 3 and
        # would not create any wid until the result is consumed.
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        """Return the wids for ``text`` without modifying the lexicon.

        Words that are not in the lexicon are reported as wid 0.
        """
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        """Run ``text`` through the pipeline and return the result.

        An element's ``processGlob`` method is preferred over ``process``
        when the element defines one.
        """
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        """Return true if ``word`` contains a ``*`` or ``?`` character."""
        return "*" in word or "?" in word

    def get_word(self, wid):
        """Return the word for ``wid``; raises ``KeyError`` if unknown."""
        return self._words[wid]

    def get_wid(self, word):
        """Return the wid for ``word``, or 0 if the word is unknown."""
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        """Return the wids of all words matching the glob ``pattern``.

        Implements ``*`` and ``?`` just as in the shell, except that the
        pattern must not start with either of them; such a pattern raises
        ``QueryError``.  A pattern without glob characters is an exact
        lookup.
        """
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        # The prefix is literal text: escape it so that regex
        # metacharacters in it (".", "+", "(", ...) are not interpreted.
        pat = re.escape(prefix)
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            # Keys are sorted, so the first key without the prefix ends
            # the range of candidates.
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        """Return the wid for ``word``, allocating a new one if needed."""
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        """Return the next unused wid and advance the counter."""
        wid = self._nextwid
        self._nextwid += 1
        return wid
class Lexicon(Persistent):
    """Map words to integer word ids (wids) and back.

    Text is run through a configurable processing pipeline before it is
    looked up.  New wids are drawn from a random starting point and then
    incremented.
    """

    _v_nextid = None
    _wid_length_based = True  # Flag to distinguish new and old lexica

    def __init__(self, *pipeline):
        """Create an empty lexicon.

        Each positional argument is a pipeline element; elements are
        applied in order and must provide a ``process`` method (and may
        provide ``process_post_glob`` / ``processGlob``).
        """
        self.clear()
        self._pipeline = pipeline

    def clear(self):
        """Empty the lexicon.
        """
        self.length = Length()
        self._wid_length_based = False
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.

    def length(self):
        """Return the number of unique terms in the lexicon.
        """
        # Overridden in instances with a BTrees.Length.Length
        raise NotImplementedError

    def words(self):
        """Return the keys of the word -> wid mapping."""
        return self._wids.keys()

    def wids(self):
        """Return the keys of the wid -> word mapping."""
        return self._words.keys()

    def items(self):
        """Return the (word, wid) items of the word -> wid mapping."""
        return self._wids.items()

    def sourceToWordIds(self, text):
        """Return the list of wids for ``text``, creating wids for unseen
        words."""
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        """Return the wids for ``text`` without modifying the lexicon.

        An element's ``process_post_glob`` method is preferred over
        ``process`` when the element defines one.  Words that are not in
        the lexicon are reported as wid 0.
        """
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        """Run ``text`` through the pipeline and return the result.

        An element's ``processGlob`` method is preferred over ``process``
        when the element defines one.
        """
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        """Return true if ``word`` contains a ``*`` or ``?`` character."""
        return "*" in word or "?" in word

    def get_word(self, wid):
        """Return the word for ``wid``; raises ``KeyError`` if unknown."""
        return self._words[wid]

    def get_wid(self, word):
        """Return the wid for ``word``, or 0 if the word is unknown."""
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        """Return the wids of all words matching the glob ``pattern``.

        Implements ``*`` and ``?`` just as in the shell, except that the
        pattern must not start with either of them; such a pattern raises
        ``QueryError``.  A pattern without glob characters is an exact
        lookup.
        """
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        # The prefix is literal text: escape it so that regex
        # metacharacters in it (".", "+", "(", ...) are not interpreted.
        pat = re.escape(prefix)
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            # Keys are sorted, so the first key without the prefix ends
            # the range of candidates.
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        """Return the wid for ``word``, allocating a new one if needed."""
        wid = self._wids.get(word)
        if wid is None:
            # WidCode requires us to use at least 0x4000 as a base number.
            # The algorithm in versions before 2.13 used the length as a base
            # number. So we don't even try to generate numbers below the
            # length as they are likely all taken
            minimum = 0x4000
            if self._wid_length_based:
                minimum = max(self.length(), 0x4000)

            while True:
                if self._v_nextid is None:
                    self._v_nextid = randrange(minimum, 0x10000000)

                wid = self._v_nextid
                self._v_nextid += 1

                if wid not in self._words:
                    break

                # The candidate is taken: forget the current run so that
                # the next iteration draws a new random starting point.
                self._v_nextid = None

            self.length.change(1)
            self._wids[word] = wid
            self._words[wid] = word
        return wid
class ContentTypeScopeManager(BTreeScopeManager):
    """
    A scope manager based on content types.

    This scope manager validates the request using the content type of
    the accessed object and the subpath of the request against a content
    type mapping.  The content type mapping to be used will be one of
    specified by the resource access key, the client key or default, and
    is resolved in this order.

    One more restriction imposed by this scope manager: mappings are
    enforced absolutely for access keys.  This allows clients to request
    new default scopes for themselves at will and/or have site-wide
    default scope changes without compromising the scopes already
    granted by the resource owner referenced by the access key.

    This however does not address the case where additional global
    restrictions that may be placed by the site owner as the focus is
    ultimately on the access keys.  Workaround is to revoke those keys
    and have the content owners issue new ones regardless of changes.

    Pruning of unused scope is not implemented.
    """

    zope.interface.implements(IContentTypeScopeManager)

    default_mapping_id = fieldproperty.FieldProperty(
        IContentTypeScopeManager['default_mapping_id'])

    def __init__(self):
        super(ContentTypeScopeManager, self).__init__()
        self._mappings = IOBTree()

        # Methods permitted to access this mapping with.  Originally
        # I wanted to provide alternative sets of mapping on a per
        # mapping_id basis, however this proved to be complex and
        # complicated due to extra relationships involved.
        self._methods = IOBTree()

        # For metadata related to the above.
        self._mappings_metadata = IOBTree()

        # To ease the usage of scopes, the mappings are referenced by
        # names and are called profiles which add a few useful fields to
        # allow slightly easier usage.  This separates the name from the
        # already active tokens such that once a token is instantiated
        # with a scope, the mapping is stuck until the token is revoked.
        self._named_mappings = OIBTree()  # name to id.

        # To not overburden the named mappings with work-in-progress
        # profiles, instantiate one here also.
        self._edit_mappings = OOBTree()

        self.default_mapping_id = self.addMapping({})

    # Main mapping related management methods.

    def addMapping(self, mapping, methods='GET HEAD OPTIONS', metadata=None):
        """Store ``mapping`` under a new id and return that id.

        ``methods`` is a whitespace separated string of permitted request
        methods.  ``metadata`` is stored only when it is not None.
        """
        key = 0  # default?
        if len(self._mappings) > 0:
            # Can calculate the next key.
            key = self._mappings.maxKey() + 1
        self._mappings[key] = mapping
        self._methods[key] = methods.split()
        if metadata is not None:
            self._mappings_metadata[key] = metadata
        return key

    def getMapping(self, mapping_id, default=_marker):
        """Return the mapping for ``mapping_id``.

        Raises ``KeyError`` when the id is unknown and no ``default`` was
        given.
        """
        result = self._mappings.get(mapping_id, default)
        if result is _marker:
            raise KeyError(mapping_id)
        return result

    def getMappingMetadata(self, mapping_id, default=None):
        """Return the metadata for ``mapping_id``, or ``default``."""
        result = self._mappings_metadata.get(mapping_id, default)
        return result

    def getMappingId(self, name):
        """Return the mapping id for ``name``; raises ``KeyError`` when
        the name is unknown."""
        # Returned ID could potentially not exist, what do?
        return self._named_mappings[name]

    def getMappingMethods(self, mapping_id, default=_marker):
        """Return the list of methods permitted for ``mapping_id``.

        Raises ``KeyError`` when the id is unknown and no ``default`` was
        given.
        """
        result = self._methods.get(mapping_id, default)
        if result is _marker:
            raise KeyError(mapping_id)
        return result

    def checkMethodPermission(self, mapping_id, method):
        """Return whether ``method`` is permitted for ``mapping_id``;
        unknown ids permit nothing."""
        methods = self.getMappingMethods(mapping_id, ())
        return method in methods

    def setMappingNameToId(self, name, mapping_id):
        """Point the profile ``name`` at ``mapping_id``."""
        self._named_mappings[name] = mapping_id

    def delMappingName(self, name):
        """Remove the named mapping and the edit profile for ``name``.

        Returns a ``(mapping_id, edit_profile)`` tuple; an item is None
        when there was nothing to remove.
        """
        saved = self._named_mappings.pop(name, None)
        edits = self._edit_mappings.pop(name, None)
        return (saved, edits)

    def getMappingByName(self, name, default=_marker):
        """Return the mapping registered under ``name``.

        When the name or its mapping is missing, ``default`` is returned
        if one was given; otherwise the ``KeyError`` propagates.
        """
        try:
            mapping_id = self.getMappingId(name)
            mapping = self.getMapping(mapping_id)
        except KeyError:
            # Identity check: the sentinel must not be subject to a
            # custom __eq__ of the supplied default.
            if default is _marker:
                raise
            mapping = default
        return mapping

    def getMappingNames(self):
        """Return the names of all committed profiles."""
        return self._named_mappings.keys()

    # Temporary/edited mapping profiles

    def getEditProfile(self, name, default=None):
        """Return the work-in-progress profile for ``name``, or
        ``default``."""
        return self._edit_mappings.get(name, default)

    def setEditProfile(self, name, value):
        """Store ``value`` (a profile or None) as the work-in-progress
        profile for ``name``."""
        assert IContentTypeScopeProfile.providedBy(value) or value is None
        self._edit_mappings[name] = value

    def commitEditProfile(self, name):
        """Turn the edit profile ``name`` into a new mapping and point the
        name at it.

        Raises ``KeyError`` when there is no edit profile for the name.
        """
        profile = self.getEditProfile(name)
        if not (IContentTypeScopeProfile.providedBy(profile)):
            raise KeyError('edit profile does not exist')
        new_mapping = profile.mapping
        methods = profile.methods
        metadata = {
            'title': profile.title,
            'description': profile.description,
            # Should really not duplicate this there but this is easy
            # shortcut to take for now.
            'methods': methods,
        }
        new_id = self.addMapping(new_mapping, methods=methods,
                                 metadata=metadata)
        self.setMappingNameToId(name, new_id)

    def getEditProfileNames(self):
        """Return the names of all edit profiles."""
        return self._edit_mappings.keys()

    def isProfileModified(self, name):
        """Return whether the edit profile ``name`` differs from the
        committed mapping of the same name.

        A name without committed mapping counts as modified.
        """
        # TODO I would like some way to compare the two profiles in a
        # sane way but only using active types and types that have
        # stuff assigned.  So for now just use this naive method.
        profile = self.getEditProfile(name)
        try:
            mapping_id = self.getMappingId(name)
            mapping = self.getMapping(mapping_id)
            metadata = self.getMappingMetadata(mapping_id, {})
        except KeyError:
            # If profile exists, no associated ID, definitely modified.
            return True

        return not (
            profile.mapping == mapping and
            profile.title == metadata.get('title') and
            profile.description == metadata.get('description') and
            profile.methods == metadata.get('methods')
        )

    # Scope handling.

    def requestScope(self, request_key, raw_scope):
        """
        This manager references scope by ids internally.  Resolve the
        raw scope id by the client into the mapping ids.

        Returns False, without storing anything, as soon as one of the
        requested scopes cannot be resolved.  The default mapping is
        used when no scope was requested.
        """
        raw_scopes = raw_scope.split(',') if raw_scope else []
        result = set()
        for rs in raw_scopes:
            # Ignoring the current site URI and just capture the final
            # fragment.
            name = rs.split('/')[-1]
            try:
                mapping_id = self.getMappingId(name)
                # This verifies the existence of the mapping with id.
                self.getMapping(mapping_id)
            except KeyError:
                # Failed to fulfill the requested scope.
                return False
            result.add(mapping_id)

        if not result:
            result.add(self.default_mapping_id)

        self.setScope(request_key, result)
        return True

    def validate(self, request, client_key, access_key,
                 accessed, container, name, value):
        """
        See IScopeManager.
        """
        mappings = self.resolveMapping(client_key, access_key)
        if mappings is None:
            # resolveMapping asks for a None default: no scope is
            # recorded for this access key, hence nothing is permitted.
            return False

        # multiple rights were requested, check through all of them.
        for mapping_id in mappings:
            mapping = self.getMapping(mapping_id, default={})
            result = self.validateTargetWithMapping(accessed, name, mapping)
            method_allowed = self.checkMethodPermission(mapping_id,
                                                        request.method)
            if result and method_allowed:
                return True

        # no matching mappings.
        return False

    def resolveMapping(self, client_key, access_key):
        """
        See IDefaultScopeManager.
        """
        # As all mappings are referenced by access keys.
        return self.getAccessScope(access_key, None)

    def resolveTarget(self, accessed, name):
        """
        Accessed target resolution.

        Find the type of the container object of the accessed object by
        traversing upwards, and gather the path to resolve into the
        content type id.  Return both these values, or ``(None, None)``
        when no type information could be found.
        """
        logger.debug('resolving %s into types', accessed)
        # use getSite() instead of container?
        pt_tool = getToolByName(accessed, 'portal_types', None)
        if pt_tool is None:
            return None, None

        context = aq_inner(accessed)
        typeinfo = None
        subpath = [name]

        while context is not None:
            typeinfo = pt_tool.getTypeInfo(context)
            if typeinfo:
                # Names were collected innermost first.
                subpath.reverse()
                return typeinfo.id, '/'.join(subpath)
            # It should have a name...
            subpath.append(context.__name__)
            context = aq_parent(context)

        logger.debug('parent of %s failed to resolve into typeinfo',
                     accessed)
        return None, None

    def validateTargetWithMapping(self, accessed, name, mapping):
        """Resolve the accessed target and validate it against
        ``mapping``."""
        atype, subpath = self.resolveTarget(accessed, name)
        return self.validateTypeSubpathMapping(atype, subpath, mapping)

    def validateTypeSubpathMapping(self, accessed_type, subpath, mapping):
        """Return whether ``subpath`` is permitted for ``accessed_type``
        by ``mapping``.

        A scope that ends with ``*`` and contains a ``/`` matches by
        prefix; any other scope has to match exactly.
        """
        # A simple lookup method.
        valid_scopes = mapping.get(accessed_type, {})
        if not valid_scopes:
            logger.debug('out of scope: %s has no mapping', accessed_type)
            return False

        logger.debug('%s got mapping', accessed_type)
        for vs in valid_scopes:
            # XXX ignores second last asterisk, preventing validation
            # against items that have an asterisk in its name for
            # whatever reason...
            if vs.endswith('*') and '/' in vs:
                match = subpath.startswith(vs[:vs.rindex('*')])
            else:
                match = subpath == vs
            if match:
                logger.debug('subpath:%s within scope', subpath)
                return True

        logger.debug('out of scope: %s not a subpath in mapping for %s',
                     subpath, accessed_type)
        return False
class Indexer(object):
    """Maintain and query a ZODB-backed text index of MH mail folders.

    The text index, the docid -> path table, the docid -> mtime table and
    the watched-folder table live in the root of a FileStorage database.
    The reverse path -> docid table (``path2docid``) is rebuilt in memory
    from ``docpaths`` every time an Indexer is created.
    """

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        """Open the database at *datafs* and load (or create) its tables.

        A false *writable* opens the storage read-only.  *trans* is the
        number of maycommit() calls between commits and *pack* the number
        of commits between packs; 0 disables the automatic behaviour.
        """
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        self.index = self._roottable("index", TextIndexWrapper)
        self.docpaths = self._roottable("docpaths", IOBTree)
        self.doctimes = self._roottable("doctimes", IIBTree)
        self.watchfolders = self._roottable("watchfolders", dict)
        self.path2docid = OIBTree()
        for docid, path in self.docpaths.items():
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            # max() of an empty sequence: no documents indexed yet.
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def _roottable(self, key, factory):
        """Return ``self.root[key]``, storing ``factory()`` there if absent."""
        try:
            return self.root[key]
        except KeyError:
            table = self.root[key] = factory()
            return table

    @staticmethod
    def _wordfreq(index, wid):
        """Return the sum of the values ``index._wordinfo`` holds for *wid*."""
        return sum(index._wordinfo.get(wid, {}).values())

    def dumpfreqs(self):
        """Print wid, frequency and word for every wid, most frequent first."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        rows = sorted(
            ((self._wordfreq(index, wid), wid, lexicon.get_word(wid))
             for wid in lexicon.wids()),
            reverse=True)
        for freq, wid, word in rows:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        """Print wid, frequency and word for every wid, in lexicon wid order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = self._wordfreq(index, wid)
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        """Print wid, frequency and word for every word, in lexicon word order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = self._wordfreq(index, wid)
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        """Close connection, database and storage; safe to call repeatedly."""
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        """Run an interactive query loop until EOF on standard input.

        A non-empty line starts a new query.  An empty line shows the next
        *nbest* hits of the current query.  A line starting with "/" is
        handed to specialcommand().
        """
        try:
            import readline  # imported only for its line-editing side effect
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while True:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            elif not text:
                continue
            try:
                results, n = self.timequery(text, top + nbest)
            except Exception:
                # A failing query must not end the session; report it and
                # forget the query.  KeyboardInterrupt still propagates.
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=' ')
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        """Show one message from *results* with the MH ``show`` command.

        *line* is "/" optionally followed by a 1-based rank; a bare "/"
        selects the 0-based index *first*.
        """
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except ValueError:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        msgnum = path[i+1:]
        # NOTE(review): folder and message name are interpolated into a
        # shell command unquoted; they come from the index, not from the
        # query line, but confirm they can never contain shell syntax.
        cmd = "show +%s %s" % (folder, msgnum)
        if os.getenv("DISPLAY"):
            os.system("xterm -e sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        """Run one query and print up to *nbest* formatted results."""
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        """Query the index, print the elapsed times, return (results, n)."""
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        """Print rank, score, headers and matching lines for results[lo:hi].

        At most *maxlines* body lines containing one of the (non-stop)
        query words are printed per message.
        """
        stopdict = self.stopdict
        words = [w for w in re.findall(r"\w+\*?", text.lower())
                 if w not in stopdict]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*")  # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank: %d Score: %d%% File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            # The message reads from fp while it is being parsed, so the
            # file is closed only once its text has been extracted.
            try:
                msg = mhlib.Message("<folder>", 0, fp)
                for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                    h = msg.getheader(header)
                    if h:
                        print("%-8s %s" % (header+":", h))
                parts = self.getmessagetext(msg)
            finally:
                fp.close()
            if parts:
                print()
                nleft = maxlines
                for part in parts:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        """Index the messages selected by *args*, then commit.

        An argument starting with "+" names the folder (only one allowed);
        every other argument is an MH message sequence.  The folder
        defaults to the MH context and the sequences default to 'all'.
        """
        folder = None
        seqs = []
        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)
        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']
        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return
        selected = set()  # sequences may overlap; index each message once
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            selected.update(nums)
        self.updatefolder(f, sorted(selected))
        self.commit()

    def optimize(self, args):
        """Pre-register the words of the given folders in the lexicon.

        The folders are prescanned, up to 100 of the most frequent words
        are printed, and all words are passed to the lexicon in order of
        decreasing frequency.
        """
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        ranked = sorted(((count, word) for word, count in uniqwords.items()),
                        reverse=True)
        # Slice instead of indexing 0..99 so that fewer than 100 distinct
        # words no longer raises IndexError.
        for i, (count, word) in enumerate(ranked[:100]):
            print("%3d. %6d %s" % (i+1, count, word))
        self.index.lexicon.sourceToWordIds([word for (count, word) in ranked])

    def prescan(self, f, msgs, uniqwords):
        """Add the word counts of messages *msgs* of folder *f* to *uniqwords*."""
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        """Index every message of the folders in *args*, then commit.

        The special name "ALL" is replaced by all folders MH knows about.
        """
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        folders = list(args)  # do not modify the caller's list
        if "ALL" in folders:
            i = folders.index("ALL")
            folders[i:i+1] = self.mh.listfolders()
        for folder in folders:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and", end=' ')
        print(self.index.lexicon._nwords, "words;", end=' ')
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        """(Re)index messages *msgs* of folder *f* and drop vanished ones.

        A message whose file modification time equals the recorded one is
        skipped.  Afterwards every indexed path below the folder whose
        file no longer exists is unindexed.
        """
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist.  The scan
        # starts at "<folder>/" rather than "<folder>": a sibling such as
        # "<folder>-old" sorts between the two and would otherwise end the
        # scan before any of this folder's entries had been looked at.
        prefix = f.name + "/"
        for path in list(self.path2docid.keys(prefix)):
            if not path.startswith(prefix):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        """Remove *path* from all tables and the text index, if it is known."""
        if path in self.path2docid:
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        """Return the indexable text of message *m* as a list of strings.

        With *name* given, a "_folder <name>" entry comes first so that a
        search can be restricted to a folder.  If extracting the body
        parts fails, the failure is reported and whatever was collected
        up to that point is returned.
        """
        L = []
        if name:
            L.append("_folder " + name)  # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except Exception:
            # Best effort: one unparsable message must not stop indexing.
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        """Append the text parts of *m* to *L*, recursing into containers.

        *level* is the nesting depth; it only affects the progress output.
        """
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        """Append the address and subject headers of *m* to *L* as one string."""
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        """Return the docid for *path*, allocating one if it is new.

        In both cases the recorded modification time is refreshed.
        """
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        """Return the mtime of *path* (relative to the MH root), or 0 if stat fails."""
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        """Count one change and commit once ``trans_limit`` (if > 0) is reached."""
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        """Commit pending changes, packing once ``pack_limit`` (if > 0) is reached."""
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        """Pack the database if anything was committed since the last pack."""
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
class Lexicon(Persistent):
    """Maps words to word ids.

    ``_forward_idx`` maps word -> wid and ``_inverse_idx`` maps
    wid -> word.  When created with a true ``truncate_left``, a third
    tree, ``_lforward_idx``, maps each *reversed* word to its wid so that
    suffix searches can be done as prefix range searches.
    """

    __implements__ = LexiconInterface

    def __init__(self, truncate_left=0):
        self.truncate_left = truncate_left
        self.clear()

    def clear(self):
        """Reset the lexicon to the empty state."""
        self._nextid = BTrees.Length.Length()
        self._forward_idx = OIBTree()
        self._inverse_idx = IOBTree()
        if self.truncate_left:
            self._lforward_idx = OIBTree()
        else:
            self._lforward_idx = None

    def getWordIdList(self, words):
        """Return a list of wordIds for a list of words.

        Words not yet in the lexicon are added and get the next free id.
        """
        fw_idx = self._forward_idx
        fw_idx_get = fw_idx.get
        rev_idx = self._inverse_idx
        if self.truncate_left:
            lfw_idx = self._lforward_idx
        nextid = self._nextid

        wids = []
        append = wids.append

        for word in words:
            wid = fw_idx_get(word)
            if not wid:
                # The counter is incremented before use, so ids start at 1
                # and a false wid always means "unknown word".
                nextid.change(1)
                wid = nextid()
                fw_idx[word] = wid
                rev_idx[wid] = word
                if self.truncate_left:
                    lfw_idx[word[::-1]] = wid
            append(wid)
        return wids

    def getWordId(self, word, default=None):
        """Return the word id of 'word', or 'default' if it is unknown."""
        return self._forward_idx.get(word, default)

    def getWord(self, wid):
        """Return a word by its wid; raises KeyError for an unknown wid."""
        return self._inverse_idx[wid]

    def _dropReversed(self, word):
        """Remove 'word' from the left-truncation index, if there is one."""
        lfw_idx = self._lforward_idx
        if lfw_idx is not None:
            reversed_word = word[::-1]
            if reversed_word in lfw_idx:
                del lfw_idx[reversed_word]

    def deleteWord(self, word):
        """Remove 'word' from all indexes; raises KeyError if unknown."""
        wid = self._forward_idx[word]
        del self._inverse_idx[wid]
        del self._forward_idx[word]
        # Without this the word would still be returned by
        # getWordsForLeftTruncation() after its deletion.
        self._dropReversed(word)

    def deleteWordId(self, wid):
        """Remove word id 'wid' from all indexes; raises KeyError if unknown."""
        word = self._inverse_idx[wid]
        del self._forward_idx[word]
        del self._inverse_idx[wid]
        self._dropReversed(word)

    def getWordsForRightTruncation(self, prefix):
        """Return the words that start with 'prefix'.

        We use the BTrees range search to perform the search.
        """
        assert isinstance(prefix, unicode)
        return self._forward_idx.keys(prefix, prefix + u'\uffff')

    def getWordsForLeftTruncation(self, suffix):
        """Return a list of the words that end with 'suffix'.

        Uses the reversed-word index, which is only maintained when the
        lexicon was created with a true 'truncate_left'.
        """
        assert isinstance(suffix, unicode)
        reversed_suffix = suffix[::-1]
        matches = self._lforward_idx.keys(reversed_suffix,
                                          reversed_suffix + u'\uffff')
        return [w[::-1] for w in matches]

    def createRegex(self, pattern):
        """Translate a glob PATTERN to an (end-anchored) regular expression.

        '*' becomes '.*' and '?' becomes '.'.  Every other character is
        escaped, so that regex metacharacters occurring in a word (such as
        '.' or '+') are matched literally.
        """
        parts = []
        for char in pattern:
            if char == '*':
                parts.append('.*')
            elif char == '?':
                parts.append('.')
            else:
                parts.append(re.escape(char))
        return '%s$' % ''.join(parts)

    def getSimiliarWords(self, term, threshold=0.75):
        """Return (word, ratio) pairs for the words whose similarity
        ratio to 'term' exceeds 'threshold'.
        """
        similar = []
        for word in self._forward_idx.keys():
            score = ratio(word, term)  # computed once per word
            if score > threshold:
                similar.append((word, score))
        return similar

    def getWordsForPattern(self, pattern):
        """Perform full pattern matching.

        A pattern without '*' or '?' is returned unchanged as a one-item
        list.  A pattern must not start with a globbing character.
        """
        # The literal prefix in front of the first glob character limits
        # the range of words that has to be matched.
        mo = re.search(r'([?*])', pattern)
        if mo is None:
            return [pattern]

        pos = mo.start(1)
        if pos == 0:
            raise QueryParserError(
                'word "%s" should not start with a globbing character'
                % pattern)

        prefix = pattern[:pos]
        words = self._forward_idx.keys(prefix, prefix + u'\uffff')
        regex = re.compile(self.createRegex(pattern))
        return [word for word in words if regex.match(word)]

    def getWordsInRange(self, w1, w2):
        """Return all words within w1...w2."""
        return self._forward_idx.keys(w1, w2)

    def getWordsForSubstring(self, sub):
        """Return all words that contain 'sub'."""
        return [word for word in self._forward_idx.keys() if sub in word]

    def getWordIds(self):
        """Return all wids."""
        return self._inverse_idx.keys()

    def removeWordId(self, wid):
        """Remove word id 'wid' from all indexes; raises KeyError if unknown."""
        word = self._inverse_idx[wid]
        del self._inverse_idx[wid]
        del self._forward_idx[word]
        self._dropReversed(word)

    def __len__(self):
        return len(self._inverse_idx)