Example #1
    def _python_search_wids(self, wids):
        if not wids:
            return []
        N = float(self.documentCount())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = self.family.IF.Bucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = tf * idf
            L.append((result, 1))
        return L
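
All of these examples rely on inverse_doc_frequency(count, N). In zope.index, where these snippets appear to originate, it lives in zope.index.text.baseindex and implements IDF(q, t) = log(1 + N/f(t)); a minimal standalone sketch, assuming that definition:

    import math

    def inverse_doc_frequency(term_count, num_items):
        # IDF(q, t) = log(1 + N / f(t)): rarer terms score higher.
        # term_count: number of documents containing the term, f(t)
        # num_items:  total number of documents in the collection, N
        return math.log(1.0 + num_items / term_count)
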
Example #2
    def _c_search_wids(self, wids):
        if not wids:
            return []
        N = float(self.documentCount())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = self.family.IF.Bucket()
            items = d2f.items() if PY2 else list(d2f.items())
            score(result, items, docid2len, idf, meandoclen)
            L.append((result, 1))
        return L
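
Example #2 delegates the inner scoring loop to a C helper (zope.index ships one in its okascore extension module, with K1 and B compiled in, which is why the Python-side constants above are commented out). A hypothetical pure-Python stand-in that mirrors the loop from Example #1:

    K1, B = 1.2, 0.75  # assumed defaults; the real values are baked into the C code

    def score(result, items, docid2len, idf, meandoclen):
        # Okapi BM25 term-frequency scoring, one (docid, f) pair at a time.
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B
        for docid, f in items:
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            result[docid] = f * K1_plus1 / (f + K1 * lenweight) * idf
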
Example #3
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(len(self._docweight))  # total # of docs
        meandoclen = self._totaldoclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IFBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = tf * idf
            L.append((result, 1))
        return L
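
To see the length normalization in the TF formula at work, here is a worked computation with made-up values (K1 = 1.2 and B = 0.75, the usual Okapi defaults):

    K1, B, f = 1.2, 0.75, 3                      # term occurs 3 times in the document
    lenweight = (1.0 - B) + B * 100 / 50.0       # doc of 100 words, mean 50 -> 1.75
    tf = f * (K1 + 1.0) / (f + K1 * lenweight)   # ~1.29; bounded above by K1 + 1 = 2.2
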
Example #4
 def query_weight(self, terms):
     wids = []
     for term in terms:
         wids += self._lexicon.termToWordIds(term)
     N = float(len(self._docweight))
     sum = 0.0
     for wid in self._remove_oov_wids(wids):
         wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
         sum += wt ** 2.0
     return math.sqrt(sum)
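
This variant of query_weight returns the Euclidean norm of the query's per-term IDF vector, as used for cosine-style normalization. A toy illustration (N = 1000 documents, terms appearing in 10 and 100 of them):

    import math

    N = 1000.0
    idfs = [math.log(1.0 + N / 10), math.log(1.0 + N / 100)]
    query_weight = math.sqrt(sum(wt ** 2.0 for wt in idfs))
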
Example #5
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(len(self._docweight))
     L = []
     DictType = type({})
     for wid in wids:
         assert wid in self._wordinfo  # caller responsible for OOV
         d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         #print "idf = %.3f" % idf
         if isinstance(d2w, DictType):
             d2w = self.family.IF.Bucket(d2w)
         L.append((d2w, idf))
     return L
Example #6
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(len(self._docweight))
     L = []
     DictType = type({})
     for wid in wids:
          assert wid in self._wordinfo  # caller responsible for OOV
          d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         #print "idf = %.3f" % idf
         if isinstance(d2w, DictType):
             d2w = IFBucket(d2w)
         L.append((d2w, idf))
     return L
Example #7
 def query_weight(self, terms):
     # Get the wids.
     wids = []
     for term in terms:
         termwids = self._lexicon.termToWordIds(term)
         wids.extend(termwids)
     # The max score for term t is the maximum value of
     #     TF(D, t) * IDF(Q, t)
      # We can compute IDF directly, and as the TF formula in the
      # earlier examples shows, TF(D, t) is bounded above by 1+K1.
     N = float(len(self._docweight))
     tfmax = 1.0 + self.K1
     sum = 0
     for t in self._remove_oov_wids(wids):
         idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
         sum += idf * tfmax
     return sum
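
The Okapi variant of query_weight instead returns an upper bound on any single document's score: TF(D, t) is strictly less than K1 + 1, so each term contributes at most (1 + K1) * idf. Dividing a raw score by this bound therefore yields a value in [0, 1); a sketch with toy numbers (default K1 = 1.2 assumed):

    import math

    K1, N = 1.2, 1000.0
    idfs = [math.log(1.0 + N / 10), math.log(1.0 + N / 100)]
    bound = sum((1.0 + K1) * idf for idf in idfs)  # max achievable score
    raw_score = 4.0                                # pretend document score
    normalized = raw_score / bound                 # always in [0, 1)
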
Example #8
    def _search_wids_NOTYET(self, wids):
        if not wids:
            return []
        N = float(len(self._docweight))  # total # of docs
        meandoclen = self._totaldoclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IFBucket()
            score(result, d2f.items(), docid2len, idf, meandoclen)
            L.append((result, 1))
        return L
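
Each _search_wids variant returns a list of (bucket, weight) pairs; the caller merges them into a single docid -> score map. zope.index does this with a weighted union over its IF sets, but a hypothetical plain-dict sketch shows the idea:

    def weighted_union(L):
        # Merge (mapping, weight) pairs: a document's total score is the
        # weighted sum of its per-term scores.
        combined = {}
        for bucket, weight in L:
            for docid, w in bucket.items():
                combined[docid] = combined.get(docid, 0.0) + w * weight
        return combined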