class Lexicon(Persistent):

    implements(ILexicon)

    def __init__(self, *pipeline):
        self._wids = OIBTree()   # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary). This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out). Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self._nextwid = 1
        self._pipeline = pipeline

        # Keep some statistics about indexing
        self._nbytes = 0  # Number of bytes indexed (at start of pipeline)
        self._nwords = 0  # Number of words indexed (after pipeline)

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        return self._nextwid - 1

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for t in last:
            self._nbytes += len(t)
        for element in self._pipeline:
            last = element.process(last)
        self._nwords += len(last)
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        wid = self._nextwid
        self._nextwid += 1
        return wid
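The prefix/regex split in globToWordIds above is what keeps glob queries cheap: the literal prefix seeds the BTree key scan, and only the remainder of the pattern is compiled to a regular expression. Below is a minimal standalone sketch of that translation; the helper name glob_to_regex and the sample words are illustrative only, and unlike the method above the prefix is additionally re.escape'd here.

import re

def glob_to_regex(pattern):
    # Split the pattern into a literal prefix (no glob characters) and a
    # compiled regex for the remainder, mapping * -> ".*" and ? -> ".".
    prefix = ""
    while pattern and pattern[0] not in "*?":
        prefix += pattern[0]
        pattern = pattern[1:]
    pat = re.escape(prefix)
    for c in pattern:
        if c == "*":
            pat += ".*"
        elif c == "?":
            pat += "."
        else:
            pat += re.escape(c)
    return prefix, re.compile(pat + "$")

prefix, prog = glob_to_regex("wo*d")
print(prefix)                     # 'wo' -- used to seed the BTree scan
print(bool(prog.match("word")))   # True
print(bool(prog.match("world")))  # True
print(bool(prog.match("wore")))   # False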
class Lexicon(Persistent): """ Implementation of :class:`zope.index.text.interfaces.ILexicon`. """ def __init__(self, *pipeline): self._wids = OIBTree() # word -> wid self._words = IOBTree() # wid -> word # wid 0 is reserved for words that aren't in the lexicon (OOV -- out # of vocabulary). This can happen, e.g., if a query contains a word # we never saw before, and that isn't a known stopword (or otherwise # filtered out). Returning a special wid value for OOV words is a # way to let clients know when an OOV word appears. self.wordCount = Length() self._pipeline = pipeline def wordCount(self): """Return the number of unique terms in the lexicon.""" # overridden per instance return len(self._wids) def words(self): return self._wids.keys() def wids(self): return self._words.keys() def items(self): return self._wids.items() def sourceToWordIds(self, text): if text is None: text = '' last = _text2list(text) for element in self._pipeline: last = element.process(last) if not isinstance(self.wordCount, Length): # Make sure wordCount is overridden with a BTrees.Length.Length self.wordCount = Length(self.wordCount()) # Strategically unload the length value so that we get the most # recent value written to the database to minimize conflicting wids # Because length is independent, this will load the most # recent value stored, regardless of whether MVCC is enabled self.wordCount._p_deactivate() return list(map(self._getWordIdCreate, last)) def termToWordIds(self, text): last = _text2list(text) for element in self._pipeline: last = element.process(last) wids = [] for word in last: wids.append(self._wids.get(word, 0)) return wids def parseTerms(self, text): last = _text2list(text) for element in self._pipeline: process = getattr(element, "processGlob", element.process) last = process(last) return last def isGlob(self, word): return "*" in word or "?" in word def get_word(self, wid): return self._words[wid] def get_wid(self, word): return self._wids.get(word, 0) def globToWordIds(self, pattern): # Implement * and ? just as in the shell, except the pattern # must not start with either of these prefix = "" while pattern and pattern[0] not in "*?": prefix += pattern[0] pattern = pattern[1:] if not pattern: # There were no globbing characters in the pattern wid = self._wids.get(prefix, 0) if wid: return [wid] else: return [] if not prefix: # The pattern starts with a globbing character. # This is too efficient, so we raise an exception. raise QueryError( "pattern %r shouldn't start with glob character" % pattern) pat = prefix for c in pattern: if c == "*": pat += ".*" elif c == "?": pat += "." else: pat += re.escape(c) pat += "$" prog = re.compile(pat) keys = self._wids.keys(prefix) # Keys starting at prefix wids = [] for key in keys: if not key.startswith(prefix): break if prog.match(key): wids.append(self._wids[key]) return wids def _getWordIdCreate(self, word): wid = self._wids.get(word) if wid is None: wid = self._new_wid() self._wids[word] = wid self._words[wid] = word return wid def _new_wid(self): count = self.wordCount count.change(1) while count() in self._words: # just to be safe count.change(1) return count()
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:

        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        committing subtransactions. If 'None', subtransactions are
        disabled.
        """
        # sniff the object for our 'id', the 'document source' of the
        # index is this attribute. If it smells callable, call it.
        try:
            source = getattr(obj, self.id)
            if safe_callable(source):
                source = source()
            if not isinstance(source, UnicodeType):
                source = str(source)
        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id'+'_encoding'
        try:
            encoding = getattr(obj, self.id+'_encoding')
            if safe_callable(encoding):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'

        lexicon = self.getLexicon()
        splitter = lexicon.Splitter

        wordScores = OIBTree()
        last = None

        # Run through the words and score them
        for word in list(splitter(source,encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else:
                if word==last:
                    continue
                last=word
                wordScores[word]=wordScores.get(word,0)+1

        # Convert scores to use wids:
        widScores=IIBucket()
        getWid=lexicon.getWordId
        for word, score in wordScores.items():
            widScores[getWid(word)]=score

        del wordScores

        currentWids=IISet(self._unindex.get(documentId, []))

        # Get rid of document words that are no longer indexed
        self.unindex_objectWids(documentId, difference(currentWids, widScores))

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change. Woo hoo.
        insert=self.insertForwardIndexEntry
        for wid, score in widScores.items():
            insert(wid, documentId, score)

        # Save the unindexing info if it's changed:
        wids=widScores.keys()
        if wids != currentWids.keys():
            self._unindex[documentId]=wids

        return len(wids)
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes. This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.
    """

    # default for older objects
    stop_syn = {}

    def __init__(self, stop_syn=None, useSplitter=None, extra=None):
        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

        self.useSplitter = Splitter.splitterNames[0]
        if useSplitter:
            self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        if (type(self._lexicon) is OIBTree and
                type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon = self._lexicon
        self._lexicon = OIBTree()
        self._lexicon._p_jar = self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex = self._inverseLex
            self._inverseLex = IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex = IOBTree()
            inverseLex = self._inverseLex

        self._inverseLex._p_jar = self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms. Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.
        """
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word' """
        wid = self._lexicon.get(word, None)
        if wid is None:
            wid = self.assignWordId(word)
        return wid

    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        try:
            inverse = self._inverseLex
        except AttributeError:
            # whoops, old lexicon without wids
            inverse = self._inverseLex = IOBTree()
            for word, wid in self._lexicon.items():
                inverse[wid] = word

        wid = randid()
        while not inverse.insert(wid, word):
            wid = randid()

        if isinstance(word, StringType):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid

        return wid

    def get(self, key, default=None):
        """Return the matched word against the key."""
        r = IISet()
        wid = self._lexicon.get(key, default)
        if wid is not None:
            r.insert(wid)
        return r

    def __getitem__(self, key):
        return self.get(key)

    def __len__(self):
        return len(self._lexicon)

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """
        if words is None:
            words = self.stop_syn

        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding,
            )
        except:
            return self.SplitterFunc(astring, words)

    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
class Lexicon(Persistent):

    _v_nextid = None
    _wid_length_based = True  # Flag to distinguish new and old lexica

    def __init__(self, *pipeline):
        self.clear()
        self._pipeline = pipeline

    def clear(self):
        """Empty the lexicon.
        """
        self.length = Length()
        self._wid_length_based = False
        self._wids = OIBTree()   # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary). This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out). Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.

    def length(self):
        """Return the number of unique terms in the lexicon.
        """
        # Overridden in instances with a BTrees.Length.Length
        raise NotImplementedError

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            # WidCode requires us to use at least 0x4000 as a base number.
            # The algorithm in versions before 2.13 used the length as a base
            # number. So we don't even try to generate numbers below the
            # length as they are likely all taken
            minimum = 0x4000
            if self._wid_length_based:
                minimum = max(self.length(), 0x4000)
            while True:
                if self._v_nextid is None:
                    self._v_nextid = randrange(minimum, 0x10000000)
                wid = self._v_nextid
                self._v_nextid += 1
                if wid not in self._words:
                    break
                self._v_nextid = None
            self.length.change(1)
            self._wids[word] = wid
            self._words[wid] = word
        return wid
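The notable change in this variant is wid allocation: instead of a monotonically increasing counter, each writer starts from a random id at or above 0x4000 (the minimum WidCode can encode) and probes forward past any ids already taken, so concurrent indexing runs rarely fight over the same "next id". A plain-dict sketch of that probing idea follows; the function name and the dict standing in for _words are illustrative only, and unlike the real code the sketch does not cache the next candidate between calls the way _v_nextid does.

from random import randrange

def allocate_wid(words, minimum=0x4000, ceiling=0x10000000):
    # words: mapping of already-assigned wids (stands in for self._words).
    # Pick a random base >= minimum, then probe forward until a free id
    # is found, mirroring the while-loop in _getWordIdCreate above.
    wid = randrange(minimum, ceiling)
    while wid in words:
        wid += 1
    return wid

words = {}
for term in ("quick", "brown", "fox"):
    words[allocate_wid(words)] = term
print(sorted(words))  # three ids, all >= 0x4000, very unlikely to collide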
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:

        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        committing subtransactions. If 'None', subtransactions are
        disabled.
        """
        # sniff the object for our 'id', the 'document source' of the
        # index is this attribute. If it smells callable, call it.
        try:
            source = getattr(obj, self.id)
            if safe_callable(source):
                source = source()
            if not isinstance(source, UnicodeType):
                source = str(source)
        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id' + '_encoding'
        try:
            encoding = getattr(obj, self.id + '_encoding')
            if safe_callable(encoding):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'

        lexicon = self.getLexicon()
        splitter = lexicon.Splitter

        wordScores = OIBTree()
        last = None

        # Run through the words and score them
        for word in list(splitter(source, encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else:
                if word == last:
                    continue
                last = word
                wordScores[word] = wordScores.get(word, 0) + 1

        # Convert scores to use wids:
        widScores = IIBucket()
        getWid = lexicon.getWordId
        for word, score in wordScores.items():
            widScores[getWid(word)] = score

        del wordScores

        currentWids = IISet(self._unindex.get(documentId, []))

        # Get rid of document words that are no longer indexed
        self.unindex_objectWids(documentId, difference(currentWids, widScores))

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change. Woo hoo.
        insert = self.insertForwardIndexEntry
        for wid, score in widScores.items():
            insert(wid, documentId, score)

        # Save the unindexing info if it's changed:
        wids = widScores.keys()
        if wids != currentWids.keys():
            self._unindex[documentId] = wids

        return len(wids)
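The scoring loop above counts how often each word occurs in the source, while the `if word == last: continue` guard collapses immediate repetitions so a run of the same word is scored only once. Below is a plain-Python sketch of that counting step, using a dict in place of the OIBTree and ignoring the quoted-phrase branch; the function name and the sample word list are illustrative only.

def score_words(words):
    # Count word frequencies, skipping immediately repeated words,
    # as the loop over splitter output in index_object does.
    scores = {}
    last = None
    for word in words:
        if word == last:
            continue
        last = word
        scores[word] = scores.get(word, 0) + 1
    return scores

# The back-to-back "very very" collapses to one occurrence; the later,
# non-adjacent "very" still counts.
print(score_words(["roses", "are", "very", "very", "red", "very", "red"]))
# {'roses': 1, 'are': 1, 'very': 2, 'red': 2}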