Example #1
    def index_doc(self, docid, text):
        if docid in self._docwords:
            return self._reindex_doc(docid, text)

        wids = self._lexicon.sourceToWordIds(text)

        # XXX Counter is slow. If it becomes an issue, we may need a C module:
        # http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character
        widcnt = Counter(wids)
        widset = widcnt.keys()
        widcode = PersistentWid.encode_wid(wids if self.keep_phrases else widset)
        self._docwords[docid] = widcode

        if widset:
            weights, lengths = self._get_doctrees(widset)
            docscores = self._get_widscores(widcnt, docid)
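            # Traverse the per-wid weight trees and doc-score objects in
            # parallel and prefetch the Length objects plus the document
            # counter, presumably to batch object loads before the writes below.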
            parallel_traversal(*zip(*[(weights[w], docscores[w]) for w in widset]))
            prefetch(list(lengths.values()) + [self.documentCount])

            for w in widset:
                weights[w].add(docscores[w])
                lengths[w].change(1)

        self.documentCount.change(1)

        return len(wids)
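
The Counter step above is plain standard-library Python: it deduplicates the wid list and counts per-document term frequencies in one pass. A minimal standalone sketch with made-up wids:

    from collections import Counter

    wids = [7, 3, 7, 9, 3, 7]
    widcnt = Counter(wids)  # Counter({7: 3, 3: 2, 9: 1})
    widset = widcnt.keys()  # the deduplicated wids: 7, 3, 9
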
Example #2
    def index_doc(self, docid, text):
        if docid in self._docwords:
            return self._reindex_doc(docid, text)
        wids = self._lexicon.sourceToWordIds(text)
        wid2weight, docweight = self._get_frequencies(wids)
        self._mass_add_wordinfo(wid2weight, docid)
        self._docweight[docid] = docweight
        self._docwords[docid] = PersistentWid.encode_wid(wids)
        try:
            self.documentCount.change(1)
        except AttributeError:
            # upgrade documentCount to a Length object
            self.documentCount = Length.Length(len(self._docweight))
        count = len(wids)
        self._change_doc_len(count)
        return count
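
The try/except here is a lazy schema upgrade: an index created before documentCount became a Length object raises AttributeError on .change(1) and is converted in place. Assuming Length here is BTrees.Length (as in zope.index), a minimal sketch of that counter, which exists to avoid ZODB write conflicts on hot totals:

    from BTrees.Length import Length

    count = Length(0)   # persistent, conflict-resolving counter
    count.change(1)     # mutate by a delta instead of assigning a value
    count.change(1)
    print(count.value)  # 2
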
Example #3
    def _reindex_doc(self, docid, text):
        # Adjust the per-wid Length counters only for wids that were added or removed
        old_wids = self.get_words(docid)
        old_ctr = Counter(old_wids)
        old_widset = set(old_ctr)
        new_wids = self._lexicon.sourceToWordIds(text)
        new_ctr = Counter(new_wids)
        new_widset = set(new_ctr)
        removed_wids = old_widset - new_widset
        added_wids = new_widset - old_widset
        all_wids = list(new_widset | old_widset)

        weights, lengths = self._get_doctrees(all_wids)

        for w in removed_wids:
            lengths[w].change(-1)
        for w in added_wids:
            lengths[w].change(1)

        old_docscores = self._get_widscores(old_ctr, docid)
        new_docscores = self._get_widscores(new_ctr, docid)
        parallel_traversal(*zip(*[
            (weights[w], old_docscores.get(w) or new_docscores.get(w))
            for w in all_wids]))
        # Every weight would need updating whenever len(old_wids) != len(new_wids),
        # and that is almost always the case, so we simply update them all
        for w in old_widset:
            try:
                weights[w].remove(old_docscores[w])
            except KeyError:
                # This should never happen; if it does, it is a bad sign,
                # but we can still proceed
                logging.error("Old weight-docid pair not found!")
        for w in new_widset:
            weights[w].add(new_docscores[w])

        self._docwords[docid] = PersistentWid.encode_wid(new_wids if self.keep_phrases else new_widset)

        return len(new_wids)
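
The Length bookkeeping above reduces to plain set arithmetic over the old and new wid sets. A minimal standalone sketch with made-up wids:

    old_widset = {10, 11, 12}
    new_widset = {11, 12, 13}
    removed_wids = old_widset - new_widset    # {10}: decrement their lengths
    added_wids = new_widset - old_widset      # {13}: increment their lengths
    all_wids = list(new_widset | old_widset)  # every weight that may need touching
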
Example #4
    def _reindex_doc(self, docid, text):
        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
        self._change_doc_len(-self._docweight[docid])

        old_wids = self.get_words(docid)
        old_wid2w, old_docw = self._get_frequencies(old_wids)

        new_wids = self._lexicon.sourceToWordIds(text)
        new_wid2w, new_docw = self._get_frequencies(new_wids)

        old_widset = self.family.IF.TreeSet(old_wid2w.keys())
        new_widset = self.family.IF.TreeSet(new_wid2w.keys())

        IF = self.family.IF
        in_both_widset = IF.intersection(old_widset, new_widset)
        only_old_widset = IF.difference(old_widset, in_both_widset)
        only_new_widset = IF.difference(new_widset, in_both_widset)
        del old_widset, new_widset

        for wid in only_old_widset.keys():
            self._del_wordinfo(wid, docid)

        for wid in only_new_widset.keys():
            self._add_wordinfo(wid, new_wid2w[wid], docid)

        for wid in in_both_widset.keys():
            # For the Okapi indexer, the "if" will trigger only for words
            # whose counts have changed.  For the cosine indexer, the "if"
            # may trigger for every wid, since W(d) probably changed and
            # W(d) is divided into every score.
            newscore = new_wid2w[wid]
            if old_wid2w[wid] != newscore:
                self._add_wordinfo(wid, newscore, docid)

        self._docweight[docid] = new_docw
        self._docwords[docid] = PersistentWid.encode_wid(new_wids)
        return len(new_wids)
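
This variant does the same old/new partitioning with BTrees set operations, which work directly on persistent integer sets. A minimal sketch, assuming the 32-bit family (the real code uses whichever self.family the index was built with) and made-up wids:

    from BTrees import family32

    IF = family32.IF
    old_widset = IF.TreeSet([10, 11, 12])
    new_widset = IF.TreeSet([11, 12, 13])
    in_both = IF.intersection(old_widset, new_widset)  # keys: [11, 12]
    only_old = IF.difference(old_widset, in_both)      # keys: [10] -> _del_wordinfo
    only_new = IF.difference(new_widset, in_both)      # keys: [13] -> _add_wordinfo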