def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    K1 = self.K1
    B = self.B
    K1_plus1 = K1 + 1.0
    B_from1 = 1.0 - B

    #                           f(D, t) * (k1 + 1)
    #   TF(D, t) =  -------------------------------------------
    #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        for docid, f in d2f.items():
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            tf = f * K1_plus1 / (f + K1 * lenweight)
            result[docid] = scaled_int(tf * idf)
        L.append((result, 1))
    return L

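# A minimal worked example (separate from the index) of the TF formula
# above.  The values here are illustrative assumptions, not taken from the
# package: K1, B, f, doclen, and meandoclen are all made up.
K1, B = 1.2, 0.75
f = 2                              # f(D, t): occurrences of t in D
doclen, meandoclen = 3.0, 2.0      # len(D) and E(len(D))
lenweight = (1.0 - B) + B * doclen / meandoclen
tf = f * (K1 + 1.0) / (f + K1 * lenweight)
print(tf)  # saturates toward K1 + 1 as f grows; exactly 1.0 when f == 1
           # and doclen == meandoclen
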
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    #K1 = self.K1
    #B = self.B
    #K1_plus1 = K1 + 1.0
    #B_from1 = 1.0 - B

    #                           f(D, t) * (k1 + 1)
    #   TF(D, t) =  -------------------------------------------
    #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        score(result, d2f.items(), docid2len, idf, meandoclen)
        L.append((result, 1))
    return L

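# The score() call above replaces the inlined inner loop; per the comment
# in the last revision below, that loop was implemented in C.  A
# hypothetical pure-Python stand-in, reconstructed from that inlined loop
# (the 1024.0 scale factor comes from the revision below; the K1/B default
# arguments are assumed stand-ins for the index's class attributes):
def score(result, d2f_items, docid2len, idf, meandoclen, K1=1.2, B=0.75):
    K1_plus1 = K1 + 1.0
    B_from1 = 1.0 - B
    idf *= 1024.0  # fold the scaled_int scale factor in up front
    for docid, f in d2f_items:
        lenweight = B_from1 + B * docid2len[docid] / meandoclen
        tf = f * K1_plus1 / (f + K1 * lenweight)
        result[docid] = int(tf * idf + 0.5)
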
def query_weight(self, terms):
    wids = []
    for term in terms:
        wids += self._lexicon.termToWordIds(term)
    N = float(self.document_count())
    sum = 0.0
    for wid in self._remove_oov_wids(wids):
        wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
        sum += wt**2.0
    return scaled_int(math.sqrt(sum))

def query_weight(self, terms):
    wids = []
    for term in terms:
        wids += self._lexicon.termToWordIds(term)
    N = float(self.document_count())
    sum = 0.0
    for wid in self._remove_oov_wids(wids):
        wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
        sum += wt ** 2.0
    return scaled_int(math.sqrt(sum))

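# query_weight here is the Euclidean norm of the query's IDF vector.  A
# self-contained numeric sketch: the document counts are made up, and
# inverse_doc_frequency is written out under the assumption that it is the
# log(1 + N/f) form used in the test below (log(1 + 3/3) = log(2)).
import math

def inverse_doc_frequency(term_count, num_items):
    return math.log(1.0 + num_items / term_count)

N = 4.0
doc_counts = [2, 1]        # hypothetical per-wid document counts
total = sum(inverse_doc_frequency(c, N) ** 2.0 for c in doc_counts)
print(int(math.sqrt(total) * 1024.0 + 0.5))  # scaled_int with scale 1024
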
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())
    L = []
    DictType = type({})
    for wid in wids:
        assert wid in self._wordinfo  # caller responsible for OOV
        d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
        idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
        if isinstance(d2w, DictType):
            d2w = IIBucket(d2w)
        L.append((d2w, scaled_int(idf)))
    return L

def _checkAbsoluteScores(self):
    self.assertEqual(self.index._totaldoclen(), 6)
    # So the mean doc length is 2.  We use that later.

    r, num = self.zc_index.query('one')
    self.assertEqual(num, 3)
    self.assertEqual(len(r), 3)

    # Because our Okapi's B parameter is > 0, and 'one' only appears
    # once in each doc, the verbosity hypothesis favors shorter docs.
    self.assertEqual([doc for doc, score in r], [1, 2, 3])

    # The way the Okapi math works, a word that appears exactly once in
    # an average (length) doc gets tf score 1.  Our second doc has
    # an average length, so its score should be 1 (tf) times the
    # inverse doc frequency of 'one'.  But 'one' appears in every
    # doc, so its IDF is log(1 + 3/3) = log(2).
    self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))

    # Similarly for 'two'.
    r, num = self.zc_index.query('two')
    self.assertEqual(num, 2)
    self.assertEqual(len(r), 2)
    self.assertEqual([doc for doc, score in r], [2, 3])
    self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))

    # And 'three', except that doesn't appear in an average-size doc, so
    # the math is much more involved.
    r, num = self.zc_index.query('three')
    self.assertEqual(num, 1)
    self.assertEqual(len(r), 1)
    self.assertEqual([doc for doc, score in r], [3])
    idf = inverse_doc_frequency(1, 3)
    meandoclen = 2.0
    lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
    tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
    self.assertEqual(r[0][1], scaled_int(tf * idf))

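# Working the 'three' numbers by hand as a standalone check.  K1 = 1.2 and
# B = 0.75 are assumed values for OkapiIndex's class attributes (the test
# reads them off the class, so substitute the real defaults if different).
import math

K1, B = 1.2, 0.75                      # assumed OkapiIndex defaults
idf = math.log(1.0 + 3.0 / 1.0)        # 'three' is in 1 of 3 docs
lengthweight = 1.0 - B + B * 3 / 2.0   # doc 3 has length 3, mean is 2
tf = (1.0 + K1) / (1.0 + K1 * lengthweight)
print(int(tf * idf * 1024.0 + 0.5))    # the scaled_int the test compares
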
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(len(self._docweight))
    L = []
    DictType = type({})
    for wid in wids:
        assert self._wordinfo.has_key(wid)  # caller responsible for OOV
        d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
        idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
        #print "idf = %.3f" % idf
        if isinstance(d2w, DictType):
            d2w = IIBucket(d2w)
        L.append((d2w, scaled_int(idf)))
    return L

def query_weight(self, terms):
    # Get the wids.
    wids = []
    for term in terms:
        termwids = self._lexicon.termToWordIds(term)
        wids.extend(termwids)

    # The max score for term t is the maximum value of
    #     TF(D, t) * IDF(Q, t)
    # We can compute IDF directly, and as noted in the comments below,
    # TF(D, t) is bounded above by 1+K1.
    N = float(len(self._docweight))
    tfmax = 1.0 + self.K1
    sum = 0
    for t in self._remove_oov_wids(wids):
        idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
        sum += scaled_int(idf * tfmax)
    return sum

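# The 1+K1 bound used for tfmax follows because
# f*(K1+1)/(f + K1*lenweight) increases toward K1+1 as f grows, given
# lenweight > 0.  A quick numeric check with illustrative values:
K1, lenweight = 1.2, 1.0   # made-up; lenweight is 1.0 for an average doc
for f in (1, 10, 100, 1000):
    print(f * (K1 + 1.0) / (f + K1 * lenweight))  # climbs toward K1 + 1
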
def _search_wids(self, wids):
    # The workhorse.  Return a list of (IIBucket, weight) pairs, one
    # pair for each wid t in wids.  The IIBucket, times the weight,
    # maps D to TF(D,t) * IDF(t) for every docid D containing t.
    # As currently written, the weights are always 1, and the IIBucket
    # maps D to TF(D,t)*IDF(t) directly, where the product is computed
    # as a float but stored as a scaled_int.
    # Caution:  _search_wids hardcodes the scaled_int function.
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    K1 = self.K1
    B = self.B
    K1_plus1 = K1 + 1.0
    B_from1 = 1.0 - B

    #                           f(D, t) * (k1 + 1)
    #   TF(D, t) =  -------------------------------------------
    #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        # Inner score loop; this was implemented in C before.
        idf *= 1024.0  # float out part of the scaled_int computation
        for docid, f in d2f.items():
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            tf = f * K1_plus1 / (f + K1 * lenweight)
            result[docid] = int(tf * idf + 0.5)
        L.append((result, 1))
    return L

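# For reference, the scaled_int hardcoded above appears, from the inlined
# int(tf * idf + 0.5) after idf is pre-multiplied by 1024.0, to be
# equivalent to the sketch below.  The SCALE_FACTOR name and the
# default-argument form are assumptions, not the package's actual source.
SCALE_FACTOR = 1024.0

def scaled_int(f, scale=SCALE_FACTOR):
    # Round a float to the nearest integer after scaling, matching the
    # inlined computation in the loop above.
    return int(f * scale + 0.5)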