Example #1
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L
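The inner loop above is a direct transcription of the commented formula. A rough standalone sketch of the same computation follows; the `inverse_doc_frequency` and `scaled_int` definitions are assumptions reconstructed from comments elsewhere in this listing (the "log(1 + 3/3) = log(2)" remark in Example #8 and the inlined 1024-scaling in Example #12), and K1 = 1.2, B = 0.75 are assumed defaults rather than values taken from this code.

    import math

    def inverse_doc_frequency(term_count, num_items):
        # assumed form, consistent with "log(1 + 3/3) = log(2)" in Example #8
        return math.log(1.0 + num_items / term_count)

    def scaled_int(f, scale=1024.0):
        # assumed form, consistent with the inlined int(tf * idf * 1024 + 0.5) in Example #12
        return int(f * scale + 0.5)

    def okapi_weight(f, doclen, meandoclen, n_containing, N, K1=1.2, B=0.75):
        # TF(D, t) = f*(K1+1) / (f + K1*((1-B) + B*len(D)/E(len(D)))), then scale TF*IDF
        lenweight = (1.0 - B) + B * doclen / meandoclen
        tf = f * (K1 + 1.0) / (f + K1 * lenweight)
        idf = inverse_doc_frequency(n_containing, float(N))
        return scaled_int(tf * idf)

    # a term occurring once in an average-length doc gets tf == 1.0,
    # so the stored weight is simply the scaled IDF
    print(okapi_weight(f=1, doclen=2, meandoclen=2.0, n_containing=1, N=3))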
Example #2
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            score(result, d2f.items(), docid2len, idf, meandoclen)
            L.append((result, 1))
        return L
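Example #2 differs from Example #1 only in delegating the inner loop to a `score()` helper (Example #12's comment notes this loop "was implemented in C before"). A pure-Python stand-in with the signature taken from the call site might look like the sketch below; the K1 and B defaults, and the fact that `items` is an iterable of (docid, f) pairs, are assumptions.

    def score(result, items, docid2len, idf, meandoclen, K1=1.2, B=0.75):
        # pure-Python stand-in for the score() call above; K1/B defaults are assumed
        idf *= 1024.0                      # fold the scaled_int factor into idf up front
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B
        for docid, f in items:
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            tf = f * K1_plus1 / (f + K1 * lenweight)
            result[docid] = int(tf * idf + 0.5)   # == scaled_int(tf * unscaled idf)

    result = {}   # a plain dict standing in for IIBucket
    score(result, [(1, 2), (2, 1)], {1: 3, 2: 2}, 1.386, 2.0)
    print(result)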
Example #3
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L
Example #4
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            score(result, d2f.items(), docid2len, idf, meandoclen)
            L.append((result, 1))
        return L
Example #5
 def query_weight(self, terms):
     wids = []
     for term in terms:
         wids += self._lexicon.termToWordIds(term)
     N = float(self.document_count())
     sum = 0.0
     for wid in self._remove_oov_wids(wids):
         wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
         sum += wt**2.0
     return scaled_int(math.sqrt(sum))
Example #6
 def query_weight(self, terms):
     wids = []
     for term in terms:
         wids += self._lexicon.termToWordIds(term)
     N = float(self.document_count())
     sum = 0.0
     for wid in self._remove_oov_wids(wids):
         wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
         sum += wt ** 2.0
     return scaled_int(math.sqrt(sum))
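Examples #5 and #6 are the same cosine-style `query_weight` (they differ only in whitespace): the query's own weight is the Euclidean norm of its per-term IDF vector, returned as a scaled integer. A small worked sketch of the arithmetic, reusing the assumed IDF and scaling forms from the notes after Example #1:

    import math

    # two-term query: one term appears in 3 of 3 docs, the other in 1 of 3
    idfs = [math.log(1.0 + 3 / 3), math.log(1.0 + 3 / 1)]
    weight = int(math.sqrt(sum(wt ** 2.0 for wt in idfs)) * 1024.0 + 0.5)
    print(weight)   # scaled Euclidean norm of the query's IDF vector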
Example #7
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(self.document_count())
     L = []
     DictType = type({})
     for wid in wids:
         assert wid in self._wordinfo  # caller responsible for OOV
         d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         if isinstance(d2w, DictType):
             d2w = IIBucket(d2w)
         L.append((d2w, scaled_int(idf)))
     return L
Example #8
    def _checkAbsoluteScores(self):
        self.assertEqual(self.index._totaldoclen(), 6)
        # So the mean doc length is 2.  We use that later.

        r, num = self.zc_index.query('one')
        self.assertEqual(num, 3)
        self.assertEqual(len(r), 3)

        # Because our Okapi's B parameter is > 0, and 'one' only appears
        # once in each doc, the verbosity hypothesis favors shorter docs.
        self.assertEqual([doc for doc, score in r], [1, 2, 3])

        # The way the Okapi math works, a word that appears exactly once in
        # an average (length) doc gets tf score 1.  Our second doc has
        # an average length, so its score should be 1 (tf) times the
        # inverse doc frequency of 'one'.  But 'one' appears in every
        # doc, so its IDF is log(1 + 3/3) = log(2).
        self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))

        # Similarly for 'two'.
        r, num = self.zc_index.query('two')
        self.assertEqual(num, 2)
        self.assertEqual(len(r), 2)
        self.assertEqual([doc for doc, score in r], [2, 3])
        self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))

        # And 'three', except that doesn't appear in an average-size doc, so
        # the math is much more involved.
        r, num = self.zc_index.query('three')
        self.assertEqual(num, 1)
        self.assertEqual(len(r), 1)
        self.assertEqual([doc for doc, score in r], [3])
        idf = inverse_doc_frequency(1, 3)
        meandoclen = 2.0
        lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
        tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
        self.assertEqual(r[0][1], scaled_int(tf * idf))
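The last assertion can be followed through numerically. The test reads K1 and B from `OkapiIndex`, so any values would do; the sketch below assumes the defaults K1 = 1.2 and B = 0.75 and the assumed helper forms from the notes after Example #1.

    import math

    K1, B = 1.2, 0.75             # assumed OkapiIndex defaults
    meandoclen = 2.0              # total doc length 6 across 3 docs
    idf = math.log(1.0 + 3 / 1)   # 'three' appears in 1 of the 3 docs
    lengthweight = 1.0 - B + B * 3 / meandoclen    # doc 3 has length 3
    tf = (1.0 + K1) / (1.0 + K1 * lengthweight)    # f(D, t) == 1
    print(int(tf * idf * 1024.0 + 0.5))            # the expected r[0][1]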
Example #9
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(len(self._docweight))
     L = []
     DictType = type({})
     for wid in wids:
          assert wid in self._wordinfo  # caller responsible for OOV
          d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         #print "idf = %.3f" % idf
         if isinstance(d2w, DictType):
             d2w = IIBucket(d2w)
         L.append((d2w, scaled_int(idf)))
     return L
Example #10
 def query_weight(self, terms):
     # Get the wids.
     wids = []
     for term in terms:
         termwids = self._lexicon.termToWordIds(term)
         wids.extend(termwids)
     # The max score for term t is the maximum value of
     #     TF(D, t) * IDF(Q, t)
     # We can compute IDF directly, and as noted in the comments below
     # TF(D, t) is bounded above by 1+K1.
     N = float(len(self._docweight))
     tfmax = 1.0 + self.K1
     sum = 0
     for t in self._remove_oov_wids(wids):
         idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
         sum += scaled_int(idf * tfmax)
     return sum
Example #11
 def query_weight(self, terms):
     # Get the wids.
     wids = []
     for term in terms:
         termwids = self._lexicon.termToWordIds(term)
         wids.extend(termwids)
     # The max score for term t is the maximum value of
     #     TF(D, t) * IDF(Q, t)
     # We can compute IDF directly, and as noted in the comments below
     # TF(D, t) is bounded above by 1+K1.
     N = float(len(self._docweight))
     tfmax = 1.0 + self.K1
     sum = 0
     for t in self._remove_oov_wids(wids):
         idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
         sum += scaled_int(idf * tfmax)
     return sum
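Examples #10 and #11 are identical copies of the Okapi `query_weight`. The bound used for `tfmax` follows from the TF formula: because the length weight is positive, f*(K1+1)/(f + K1*lenweight) stays below K1+1 and approaches it as f grows. A tiny numeric check, again assuming K1 = 1.2 and B = 0.75:

    K1, B = 1.2, 0.75       # assumed defaults, not taken from this code
    lenweight = 1.0         # an average-length document
    for f in (1, 10, 1000):
        tf = f * (K1 + 1.0) / (f + K1 * lenweight)
        print(f, tf)        # climbs toward, but never reaches, 1 + K1 == 2.2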
Example #12
    def _search_wids(self, wids):
        # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
        # for each wid t in wids. The IIBucket, times the weight, maps D to
        # TF(D,t) * IDF(t) for every docid D containing t.
        # As currently written, the weights are always 1, and the IIBucket maps
        # D to TF(D,t)*IDF(t) directly, where the product is computed
        # as a float but stored as a scaled_int.
        # Cautions: _search_wids hardcodes the scaled_int function.

        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()

            # inner score loop, was implemented in C before
            idf *= 1024.0  # float out part of the scaled_int computation
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = int(tf * idf + 0.5)

            L.append((result, 1))
        return L
Example #13
    def _search_wids(self, wids):
        # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
        # for each wid t in wids. The IIBucket, times the weight, maps D to
        # TF(D,t) * IDF(t) for every docid D containing t.
        # As currently written, the weights are always 1, and the IIBucket maps
        # D to TF(D,t)*IDF(t) directly, where the product is computed
        # as a float but stored as a scaled_int.
        # Cautions: _search_wids hardcodes the scaled_int function.

        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()

            # inner score loop, was implemented in C before
            idf *= 1024.0  # float out part of the scaled_int computation
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = int(tf * idf + 0.5)

            L.append((result, 1))
        return L
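Examples #12 and #13 are identical. The only change from Example #1's loop is that the `scaled_int` call has been split: the 1024 scale factor is multiplied into `idf` once per term, so the per-document work is a single multiply plus rounding. A quick check that the two formulations agree, under the assumed `scaled_int` form used in the notes after Example #1:

    def scaled_int(f, scale=1024.0):
        # assumed form of the helper the inlined code replaces
        return int(f * scale + 0.5)

    idf = 1.386
    idf_prescaled = idf * 1024.0          # hoisted out of the inner loop
    for tf in (0.5, 1.0, 1.83):
        print(scaled_int(tf * idf), int(tf * idf_prescaled + 0.5))   # same values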