def _mass_add_wordinfo(self, wid2weight, docid):
    dicttype = type({})
    # self._wordinfo: IOBTree mapping wid -> (docid -> weight) mapping
    get_doc2score = self._wordinfo.get
    new_word_count = 0
    # Fill up cache for performance over the network
    wids = wid2weight.keys()
    parallel_traversal(self._wordinfo, wids)
    parallel_traversal(map(get_doc2score, wids), [docid] * len(wids))
    for wid, weight in wid2weight.items():
        doc2score = get_doc2score(wid)
        if doc2score is None:
            doc2score = {}
            new_word_count += 1
        elif (isinstance(doc2score, dicttype) and
              len(doc2score) == self.DICT_CUTOFF):
            doc2score = self.family.IF.BTree(doc2score)
        doc2score[docid] = weight
        self._wordinfo[wid] = doc2score  # not redundant: Persistency!
    try:
        self.wordCount.change(new_word_count)
    except AttributeError:
        # upgrade wordCount to Length object
        self.wordCount = Length(len(self._wordinfo))
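# A minimal, self-contained sketch (not part of this module) of the
# DICT_CUTOFF promotion used above: postings for a word start out as a
# plain dict and are converted to an IF.BTree once they reach the cutoff,
# so small postings stay cheap while large ones scale.  The cutoff value
# of 10, the helper name and the use of family32 are illustrative
# assumptions only.
from BTrees import family32

DICT_CUTOFF = 10

def _add_posting(doc2score, docid, weight, family=family32):
    # Promote the plain dict to a BTree once it grows to the cutoff size.
    if isinstance(doc2score, dict) and len(doc2score) == DICT_CUTOFF:
        doc2score = family.IF.BTree(doc2score)
    doc2score[docid] = weight
    return doc2score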
def index_doc(self, docid, text):
    if docid in self._docwords:
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    # XXX Counter is slow. If it is an issue, need to include C module
    # http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character
    widcnt = Counter(wids)
    widset = widcnt.keys()
    widcode = PersistentWid.encode_wid(wids if self.keep_phrases else widset)
    self._docwords[docid] = widcode
    if widset:
        weights, lengths = self._get_doctrees(widset)
        docscores = self._get_widscores(widcnt, docid)
        parallel_traversal(*zip(*[(weights[w], docscores[w])
                                  for w in widset]))
        prefetch(list(lengths.values()) + [self.documentCount])
        for w in widset:
            weights[w].add(docscores[w])
            lengths[w].change(1)
    self.documentCount.change(1)
    return len(wids)
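# Small standalone illustration of the Counter step above: ``wids`` keeps
# the full (ordered) word-id stream so phrases can be reconstructed when
# keep_phrases is set, while ``widcnt`` holds the per-document term
# frequencies that the scoring code consumes.  The word ids are made up.
from collections import Counter

_wids = [7, 3, 7, 9, 3, 7]     # hypothetical word ids for one document
_widcnt = Counter(_wids)       # Counter({7: 3, 3: 2, 9: 1}) -> term frequencies
_widset = _widcnt.keys()       # the distinct wids that get per-word entries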
def termToWordIds(self, text):
    last = _text2list(text)
    for element in self._pipeline:
        last = element.process(last)
    wids = []
    if len(last) > 1:
        parallel_traversal(self._wids, last)
    for word in last:
        wids.append(self._wids.get(word, 0))
    return wids
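# ``parallel_traversal`` is used throughout as a prefetch helper for
# persistent BTrees.  The stand-in below is purely illustrative (NOT the
# real implementation); it only shows the intended effect of touching all
# requested keys up front so the later sequential ``.get()`` calls hit
# already-loaded objects instead of triggering one storage round trip each.
def _naive_parallel_traversal(tree, keys):
    for key in keys:
        tree.get(key)   # the real helper batches/parallelizes these loads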
def _search_wids(self, wids):
    # Bulk-fetch all the info we want to use
    if len(wids) > 1:
        parallel_traversal(self._wordinfo, wids)
    prefetch_trees([self._wordinfo[wid] for wid in wids])
    docids = list(set(itertools.chain(
        *[self._wordinfo[wid].keys() for wid in wids])))
    if len(docids) > 1:
        parallel_traversal(self._docweight, docids)
    return super(OkapiIndex, self)._search_wids(wids)
def sourceToWordIds(self, text):
    if text is None:
        text = ''
    last = _text2list(text)
    for element in self._pipeline:
        last = element.process(last)
    if not isinstance(self.wordCount, Length):
        # Make sure wordCount is overridden with a BTrees.Length.Length
        self.wordCount = Length(self.wordCount())
    # Strategically unload the length value so that we get the most
    # recent value written to the database, to minimize conflicting wids.
    # Because the length is stored as an independent object, this loads
    # the most recently committed value regardless of whether MVCC is
    # enabled.
    self.wordCount._p_deactivate()
    parallel_traversal(self._wids, last)
    return list(map(self._getWordIdCreate, last))
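# Why a BTrees.Length.Length is used for wordCount: unlike a plain integer
# attribute, Length resolves concurrent increments at conflict-resolution
# time instead of raising a ConflictError.  Minimal illustration using only
# the public Length API:
from BTrees.Length import Length

_word_count = Length(0)
_word_count.change(1)      # concurrent change() calls are merged on conflict
print(_word_count.value)   # -> 1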
def unindex_doc(self, docid):
    if docid not in self._docwords:
        return
    wids = self.get_words(docid)
    ctr = Counter(wids)
    wids = list(ctr)
    weights, lengths = self._get_doctrees(wids)
    scores = self._get_widscores(ctr, docid)
    parallel_traversal(*zip(*[(weights[w], scores[w]) for w in wids]))
    for w in wids:
        lengths[w].change(-1)
        weights[w].remove(scores[w])
        if lengths[w].value == 0:
            del self._wordinfo[w]
    del self._docwords[docid]
    self.documentCount.change(-1)
def _reindex_doc(self, docid, text):
    # Change the per-wid Length counters only for wids whose membership
    # in this document actually changed
    old_wids = self.get_words(docid)
    old_ctr = Counter(old_wids)
    old_widset = set(old_ctr)
    new_wids = self._lexicon.sourceToWordIds(text)
    new_ctr = Counter(new_wids)
    new_widset = set(new_ctr)
    removed_wids = old_widset - new_widset
    added_wids = new_widset - old_widset
    all_wids = list(new_widset | old_widset)
    weights, lengths = self._get_doctrees(all_wids)
    for w in removed_wids:
        lengths[w].change(-1)
    for w in added_wids:
        lengths[w].change(1)
    old_docscores = self._get_widscores(old_ctr, docid)
    new_docscores = self._get_widscores(new_ctr, docid)
    parallel_traversal(*zip(*[
        (weights[w], old_docscores.get(w) or new_docscores.get(w))
        for w in all_wids]))
    # We would only need to update all the weights if
    # len(old_wids) != len(new_wids), but that is generally the case,
    # so we always update them.
    for w in old_widset:
        try:
            weights[w].remove(old_docscores[w])
        except KeyError:
            # This should never happen; if it does, it is a bad sign,
            # but we keep going so the index stays usable.
            logging.error("Old weight-docid pair not found!")
    for w in new_widset:
        weights[w].add(new_docscores[w])
    self._docwords[docid] = PersistentWid.encode_wid(
        new_wids if self.keep_phrases else new_widset)
    return len(new_wids)
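# Tiny illustration of the set arithmetic above that decides which per-wid
# Length counters change on reindex (the word ids are made up):
_old_widset = {1, 2, 3}
_new_widset = {2, 3, 4}
_removed = _old_widset - _new_widset   # {1} -> lengths[1].change(-1)
_added = _new_widset - _old_widset     # {4} -> lengths[4].change(1)
_all = _new_widset | _old_widset       # {1, 2, 3, 4} -> weights refreshed for all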
def _get_doctrees(self, wids):
    """Get the persistent objects used to index ``wids``.

    returns: {wid -> TreeSet((weight, docid))}, {wid -> Length}
    """
    weights = {}
    lengths = {}
    parallel_traversal(self._wordinfo, wids)
    for wid in wids:
        record = self._wordinfo.get(wid)
        if record is None:
            length = Length(0)
            wdocid = self.family.OO.TreeSet()
            self._wordinfo[wid] = (wdocid, length)
            self.wordCount.change(1)
        else:
            wdocid, length = record
        weights[wid] = wdocid
        lengths[wid] = length
    return weights, lengths
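# Standalone sketch of the per-wid record created above: an OO.TreeSet of
# (weight, docid) pairs plus a Length counting the documents that use the
# wid.  Only public BTrees APIs are used; the weights and docids are made up.
from BTrees import family32
from BTrees.Length import Length

_wdocid = family32.OO.TreeSet()
_length = Length(0)
_wdocid.add((0.75, 11))    # (weight, docid) pairs sort by weight first
_wdocid.add((0.40, 12))
_length.change(2)
print(list(_wdocid))       # [(0.4, 12), (0.75, 11)]
print(_length.value)       # 2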
def _remove_oov_wids(self, wids):
    # Prefetch the word info for all distinct wids before filtering out
    # the out-of-vocabulary ones.
    parallel_traversal(self._wordinfo, set(wids))
    return filter(self._wordinfo.has_key, wids)