def _reindex_doc(self, docid, text):
    # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
    old_wids = self.get_words(docid)
    old_wid2w, old_docw = self._get_frequencies(old_wids)

    new_wids = self._lexicon.sourceToWordIds(text)
    new_wid2w, new_docw = self._get_frequencies(new_wids)

    old_widset = IITreeSet(old_wid2w.keys())
    new_widset = IITreeSet(new_wid2w.keys())

    in_both_widset = intersection(old_widset, new_widset)
    only_old_widset = difference(old_widset, in_both_widset)
    only_new_widset = difference(new_widset, in_both_widset)
    del old_widset, new_widset

    for wid in only_old_widset.keys():
        self._del_wordinfo(wid, docid)

    for wid in only_new_widset.keys():
        self._add_wordinfo(wid, new_wid2w[wid], docid)

    for wid in in_both_widset.keys():
        # For the Okapi indexer, the "if" will trigger only for words
        # whose counts have changed.  For the cosine indexer, the "if"
        # may trigger for every wid, since W(d) probably changed and
        # W(d) is divided into every score.
        newscore = new_wid2w[wid]
        if old_wid2w[wid] != newscore:
            self._add_wordinfo(wid, newscore, docid)

    self._docweight[docid] = new_docw
    self._docwords[docid] = WidCode.encode(new_wids)
    return len(new_wids)
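# For reference: a sketch of the module-level imports these methods appear to
# rely on. The module paths below assume the classic Products.ZCTextIndex
# layout of the Zope text index; they are an assumption, not taken from the
# snippets in this section (other forks, e.g. zope.index.text, lay the
# modules out differently).
from BTrees.IIBTree import IIBTree, IITreeSet, intersection, difference
from BTrees.Length import Length
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.SetOps import mass_weightedIntersection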
def index_doc(self, docid, text):
    if self._docwords.has_key(docid):
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    wid2weight, docweight = self._get_frequencies(wids)
    self._mass_add_wordinfo(wid2weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = WidCode.encode(wids)
    return len(wids)
def index_doc(self, docid, text):
    if self._docwords.has_key(docid):
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    wid2weight, docweight = self._get_frequencies(wids)
    self._mass_add_wordinfo(wid2weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = WidCode.encode(wids)
    try:
        self.document_count.change(1)
    except AttributeError:
        # Upgrade document_count to Length object
        self.document_count = Length(self.document_count())
    return len(wids)
def index_doc(self, docid, text):
    if docid in self._docwords:
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    wid2weight, docweight = self._get_frequencies(wids)
    self._mass_add_wordinfo(wid2weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = WidCode.encode(wids)
    try:
        self.document_count.change(1)
    except AttributeError:
        # Upgrade document_count to Length object
        self.document_count = Length(self.document_count())
    return len(wids)
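# The try/except above upgrades an older integer-valued document_count
# attribute to a BTrees Length object, a conflict-reducing persistent counter.
# A minimal standalone sketch of the Length API this relies on (illustrative
# only, outside the index class):
from BTrees.Length import Length

document_count = Length(0)   # start the counter at zero
document_count.change(1)     # increment by a delta when a document is added
document_count.change(-1)    # decrement when a document is removed
current = document_count()   # calling the object returns the current value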
def search_phrase(self, phrase):
    wids = self._lexicon.termToWordIds(phrase)
    cleaned_wids = self._remove_oov_wids(wids)
    if len(wids) != len(cleaned_wids):
        # At least one wid was OOV: can't possibly find the phrase.
        return IIBTree()
    # First find documents containing all of the phrase's words, in any order.
    scores = self._search_wids(wids)
    hits = mass_weightedIntersection(scores)
    if not hits:
        return hits
    # Then keep only the documents where the words appear adjacent and in
    # order, by looking for the encoded wid sequence as a substring of the
    # document's encoded word list.
    code = WidCode.encode(wids)
    result = IIBTree()
    for docid, weight in hits.items():
        docwords = self._docwords[docid]
        if docwords.find(code) >= 0:
            result[docid] = weight
    return result
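# Hypothetical usage sketch of the phrase search. The `index` object and the
# documents below are illustrative assumptions, not taken from the code above;
# `index` is assumed to be an instance of the class these methods belong to.
index.index_doc(1, "the quick brown fox jumps over the lazy dog")
index.index_doc(2, "the quick red fox")
hits = index.search_phrase("quick brown fox")   # IIBTree mapping docid -> score
for docid, score in hits.items():
    print(docid, score)                         # only docid 1 should match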