def index_doc(self, docid, text):
    if docid in self._docwords:
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    # XXX Counter is slow. If it is an issue, need to include C module
    # http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character
    widcnt = Counter(wids)
    widset = widcnt.keys()
    widcode = PersistentWid.encode_wid(wids if self.keep_phrases else widset)
    self._docwords[docid] = widcode

    if widset:
        weights, lengths = self._get_doctrees(widset)
        docscores = self._get_widscores(widcnt, docid)
        parallel_traversal(*zip(*[(weights[w], docscores[w]) for w in widset]))
        prefetch(list(lengths.values()) + [self.documentCount])
        for w in widset:
            weights[w].add(docscores[w])
            lengths[w].change(1)

    self.documentCount.change(1)
    return len(wids)
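# Illustrative sketch, not part of the original source: what the Counter
# step above produces for a small, made-up stream of word ids. Counter
# gives per-wid frequencies in one pass, and its keys() view is the
# distinct-wid set that drives the per-word tree updates.
from collections import Counter

_example_wids = [7, 3, 7, 12, 3, 7]    # word ids from the lexicon, repeats included
_example_cnt = Counter(_example_wids)  # Counter({7: 3, 3: 2, 12: 1})
_example_set = _example_cnt.keys()     # distinct wids; order does not matter here
assert _example_cnt[7] == 3 and set(_example_set) == {3, 7, 12}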
def index_doc(self, docid, text):
    if docid in self._docwords:
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    wid2weight, docweight = self._get_frequencies(wids)
    self._mass_add_wordinfo(wid2weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = PersistentWid.encode_wid(wids)
    try:
        self.documentCount.change(1)
    except AttributeError:
        # upgrade documentCount to Length object
        self.documentCount = Length.Length(len(self._docweight))
    count = len(wids)
    self._change_doc_len(count)
    return count
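# Illustrative sketch, not part of the original source: the BTrees Length
# object behind documentCount above. Length supports conflict-free
# increments under ZODB, which is why a plain attribute is upgraded to it
# in the AttributeError branch. Assumes the BTrees package is available;
# the import style here differs slightly from the module-level import the
# original presumably uses.
from BTrees.Length import Length

_example_count = Length(0)
_example_count.change(1)      # increment without write conflicts
assert _example_count() == 1  # calling a Length returns its current value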
def _reindex_doc(self, docid, text):
    # We should change Length only for new wids used
    old_wids = self.get_words(docid)
    old_ctr = Counter(old_wids)
    old_widset = set(old_ctr)
    new_wids = self._lexicon.sourceToWordIds(text)
    new_ctr = Counter(new_wids)
    new_widset = set(new_ctr)

    removed_wids = old_widset - new_widset
    added_wids = new_widset - old_widset
    all_wids = list(new_widset | old_widset)

    weights, lengths = self._get_doctrees(all_wids)
    for w in removed_wids:
        lengths[w].change(-1)
    for w in added_wids:
        lengths[w].change(1)

    old_docscores = self._get_widscores(old_ctr, docid)
    new_docscores = self._get_widscores(new_ctr, docid)
    parallel_traversal(*zip(*[
        (weights[w], old_docscores.get(w) or new_docscores.get(w))
        for w in all_wids]))

    # We should update all the weights if len(old_wids) != len(new_wids)
    # ...and that is generally the case, so we update always
    for w in old_widset:
        try:
            weights[w].remove(old_docscores[w])
        except KeyError:
            # This should never happen.
            # If it does, it's a bad sign, though we should still work.
            logging.error("Old weight-docid pair not found!")
    for w in new_widset:
        weights[w].add(new_docscores[w])

    self._docwords[docid] = PersistentWid.encode_wid(
        new_wids if self.keep_phrases else new_widset)
    return len(new_wids)
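# Illustrative sketch, not part of the original source: the zip(*...) idiom
# used for parallel_traversal above transposes a list of (tree, key) pairs
# into the separate trees and keys sequences that a bulk-prefetch call can
# consume. Strings stand in for real btree objects here.
_example_pairs = [("btree_for_wid_3", 0.41), ("btree_for_wid_7", 0.27)]
_example_trees, _example_keys = zip(*_example_pairs)
assert _example_trees == ("btree_for_wid_3", "btree_for_wid_7")
assert _example_keys == (0.41, 0.27)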
def _reindex_doc(self, docid, text):
    # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
    self._change_doc_len(-self._docweight[docid])

    old_wids = self.get_words(docid)
    old_wid2w, old_docw = self._get_frequencies(old_wids)

    new_wids = self._lexicon.sourceToWordIds(text)
    new_wid2w, new_docw = self._get_frequencies(new_wids)

    old_widset = self.family.IF.TreeSet(old_wid2w.keys())
    new_widset = self.family.IF.TreeSet(new_wid2w.keys())

    IF = self.family.IF
    in_both_widset = IF.intersection(old_widset, new_widset)
    only_old_widset = IF.difference(old_widset, in_both_widset)
    only_new_widset = IF.difference(new_widset, in_both_widset)
    del old_widset, new_widset

    for wid in only_old_widset.keys():
        self._del_wordinfo(wid, docid)

    for wid in only_new_widset.keys():
        self._add_wordinfo(wid, new_wid2w[wid], docid)

    for wid in in_both_widset.keys():
        # For the Okapi indexer, the "if" will trigger only for words
        # whose counts have changed. For the cosine indexer, the "if"
        # may trigger for every wid, since W(d) probably changed and
        # W(d) is divided into every score.
        newscore = new_wid2w[wid]
        if old_wid2w[wid] != newscore:
            self._add_wordinfo(wid, newscore, docid)

    self._docweight[docid] = new_docw
    self._docwords[docid] = PersistentWid.encode_wid(new_wids)
    return len(new_wids)
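# Illustrative sketch, not part of the original source: the IF set algebra
# used above to split old and new wids into removed / added / kept groups.
# Assumes the BTrees package with its 32-bit integer family; the wid values
# are made up.
import BTrees

_IF = BTrees.family32.IF
_old = _IF.TreeSet([1, 2, 3])
_new = _IF.TreeSet([2, 3, 4])
_in_both = _IF.intersection(_old, _new)     # wids kept:    [2, 3]
_only_old = _IF.difference(_old, _in_both)  # wids removed: [1]
_only_new = _IF.difference(_new, _in_both)  # wids added:   [4]
assert list(_only_old) == [1] and list(_only_new) == [4]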