def _get_frequencies(self, wids):
    d = {}
    dget = d.get
    for wid in wids:
        d[wid] = dget(wid, 0) + 1
    Wsquares = 0.0
    for wid, count in d.items():
        w = doc_term_weight(count)
        Wsquares += w * w
        d[wid] = w
    W = math.sqrt(Wsquares)
    for wid, weight in d.items():
        d[wid] = scaled_int(weight / W)
    return d, scaled_int(W)
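
# A minimal standalone sketch of the same cosine normalization, assuming
# (as elsewhere in this module) that doc_term_weight(count) is
# 1 + log(count) and that scaled_int(x) rounds x * SCALE_FACTOR to an int
# with SCALE_FACTOR = 1024; both names here are illustrative stand-ins.

def _example_cosine_weights():
    import math
    SCALE = 1024.0                    # assumed fixed-point scale
    counts = {10: 2, 20: 1}           # wid -> raw frequency in one doc
    weights = {w: 1.0 + math.log(c) for w, c in counts.items()}
    W = math.sqrt(sum(v * v for v in weights.values()))  # Euclidean norm
    # each stored weight is the normalized component, in fixed point
    return {w: int(v / W * SCALE + 0.5) for w, v in weights.items()}
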
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    K1 = self.K1
    B = self.B
    K1_plus1 = K1 + 1.0
    B_from1 = 1.0 - B

    #                   f(D, t) * (k1 + 1)
    #   TF(D, t) = -------------------------------------------
    #              f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        for docid, f in d2f.items():
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            tf = f * K1_plus1 / (f + K1 * lenweight)
            result[docid] = scaled_int(tf * idf)
        L.append((result, 1))
    return L
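
# For concreteness, the TF expression in the comment above evaluated
# standalone, with the usual Okapi defaults K1 = 1.2 and B = 0.75 assumed
# for illustration:

def _example_okapi_tf(f=3, doclen=120.0, meandoclen=100.0, K1=1.2, B=0.75):
    lenweight = (1.0 - B) + B * doclen / meandoclen
    return f * (K1 + 1.0) / (f + K1 * lenweight)

# _example_okapi_tf(f=1, doclen=100.0) == 1.0: a term occurring exactly
# once in an average-length document scores tf = 1, the anchor case the
# absolute-score test below relies on.
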
def _ranking_idf(self):
    word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
    idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
    for i in range(len(self.words)):
        word = self.words[i]
        eq(word_freqs[i], self.index._get_ft(word))
        eq(scaled_int(idfs[i]), self.index._get_wt(word))
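
# Cross-check of the expected idfs above: with N = 6 documents and
# IDF = log(1 + N/ft), as used elsewhere in this module:
import math
assert abs(math.log(1 + 6 / 1.0) - 1.95) < 0.005  # ft = 1 -> log(7)
assert abs(math.log(1 + 6 / 2.0) - 1.39) < 0.005  # ft = 2 -> log(4)
assert abs(math.log(1 + 6 / 3.0) - 1.10) < 0.005  # ft = 3 -> log(3)
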
def query_weight(self, terms):
    wids = []
    for term in terms:
        wids += self._lexicon.termToWordIds(term)
    N = float(self.document_count())
    sum = 0.0
    for wid in self._remove_oov_wids(wids):
        wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
        sum += wt ** 2.0
    return scaled_int(math.sqrt(sum))
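
# query_weight above returns the scaled Euclidean norm of the query's IDF
# vector.  A standalone sketch of the same computation, assuming
# inverse_doc_frequency(ft, N) is log(1 + N/ft) as in the rest of this
# module; the argument names here are illustrative:

def _example_query_weight(doc_counts=(3, 1), N=6.0):
    import math
    idfs = [math.log(1.0 + N / ft) for ft in doc_counts]
    return math.sqrt(sum(w * w for w in idfs))  # unscaled float
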
def _ranking_tf(self):
    # matrix of term weights: the rows are docids and the columns are
    # indexes into self.words
    l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
             (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
             (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
             (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
             (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
             (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
    l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
    for i in range(len(l_Wd)):
        docid = i + 1
        scaled_Wd = scaled_int(l_Wd[i])
        eq(scaled_Wd, self.index._get_Wd(docid))
        wdts = [scaled_int(t) for t in l_wdt[i]]
        for j in range(len(wdts)):
            wdt = self.index._get_wdt(docid, self.words[j])
            eq(wdts[j], wdt)
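
# Cross-check of the l_Wd column: each Wd is the Euclidean norm of the
# corresponding row of term weights, e.g. for docid 1:
import math
_row1 = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0)
assert abs(math.sqrt(sum(w * w for w in _row1)) - 2.78) < 0.01  # ~ 2.79
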
def _ranking_queries(self):
    queries = ['eat', 'porridge', 'hot OR porridge',
               'eat OR nine OR day OR old OR porridge']
    wqs = [1.95, 1.10, 1.77, 3.55]
    results = [[(6, 0.71)],
               [(1, 0.61), (2, 0.58), (5, 0.71)],
               [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
               [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
    for i in range(len(queries)):
        raw = queries[i]
        q = QueryParser(self.lexicon).parseQuery(raw)
        wq = self.index.query_weight(q.terms())
        eq(wq, scaled_int(wqs[i]))
        r, n = self.zc_index.query(raw)
        self.assertEqual(len(r), len(results[i]))
        # convert the expected results to a dict for easy checking
        d = {}
        for doc, score in results[i]:
            d[doc] = scaled_int(score)
        for doc, score in r:
            score = scaled_int(float(score / SCALE_FACTOR) / wq)
            self.assertTrue(0 <= score <= SCALE_FACTOR)
            eq(d[doc], score)
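
# Cross-check of the wqs column: e.g. 'hot OR porridge' combines the idfs
# of hot (ft = 2) and porridge (ft = 3) over N = 6 docs:
import math
_wq = math.sqrt(math.log(1 + 6 / 2.0) ** 2 + math.log(1 + 6 / 3.0) ** 2)
assert abs(_wq - 1.77) < 0.005  # ~ 1.7688
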
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())
    L = []
    for wid in wids:
        assert wid in self._wordinfo  # caller responsible for OOV
        d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
        idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
        if isinstance(d2w, dict):
            d2w = IIBucket(d2w)
        L.append((d2w, scaled_int(idf)))
    return L
def _checkAbsoluteScores(self):
    self.assertEqual(self.index._totaldoclen(), 6)
    # So the mean doc length is 2.  We use that later.

    r, num = self.zc_index.query('one')
    self.assertEqual(num, 3)
    self.assertEqual(len(r), 3)

    # Because our Okapi's B parameter is > 0, and 'one' only appears
    # once in each doc, the verbosity hypothesis favors shorter docs.
    self.assertEqual([doc for doc, score in r], [1, 2, 3])

    # The way the Okapi math works, a word that appears exactly once in
    # an average (length) doc gets tf score 1.  Our second doc has
    # an average length, so its score should be 1 (tf) times the
    # inverse doc frequency of 'one'.  But 'one' appears in every
    # doc, so its IDF is log(1 + 3/3) = log(2).
    self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))

    # Similarly for 'two'.
    r, num = self.zc_index.query('two')
    self.assertEqual(num, 2)
    self.assertEqual(len(r), 2)
    self.assertEqual([doc for doc, score in r], [2, 3])
    self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))

    # And 'three', except that doesn't appear in an average-size doc, so
    # the math is much more involved.
    r, num = self.zc_index.query('three')
    self.assertEqual(num, 1)
    self.assertEqual(len(r), 1)
    self.assertEqual([doc for doc, score in r], [3])
    idf = inverse_doc_frequency(1, 3)
    meandoclen = 2.0
    lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
    tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
    self.assertEqual(r[0][1], scaled_int(tf * idf))
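
# The 'three' case above as standalone arithmetic, with the OkapiIndex
# defaults K1 = 1.2 and B = 0.75 assumed:
import math
_idf = math.log(1.0 + 3 / 1.0)                     # ~ 1.386
_lengthweight = 1.0 - 0.75 + 0.75 * 3 / 2.0        # doc len 3, mean len 2
_tf = (1.0 + 1.2) / (1.0 + 1.2 * _lengthweight)    # ~ 0.830
# the expected score is scaled_int(_tf * _idf) ~ scaled_int(1.151)
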
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(len(self._docweight))
    L = []
    for wid in wids:
        assert wid in self._wordinfo  # caller responsible for OOV
        d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
        idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
        if isinstance(d2w, dict):
            d2w = IIBucket(d2w)
        L.append((d2w, scaled_int(idf)))
    return L
def query_weight(self, terms):
    # Get the wids.
    wids = []
    for term in terms:
        termwids = self._lexicon.termToWordIds(term)
        wids.extend(termwids)
    # The max score for term t is the maximum value of
    #     TF(D, t) * IDF(Q, t)
    # We can compute IDF directly, and as noted in the comments in
    # _search_wids above, TF(D, t) is bounded above by 1 + K1.
    N = float(len(self._docweight))
    tfmax = 1.0 + self.K1
    sum = 0
    for t in self._remove_oov_wids(wids):
        idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
        sum += scaled_int(idf * tfmax)
    return sum
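
# The tfmax bound: f * (K1+1) / (f + K1 * lenweight) increases with f and
# approaches K1 + 1 from below.  A quick numeric check, with the K1
# default of 1.2 assumed and lenweight = 1 (an average-length doc):
_K1, _lenweight = 1.2, 1.0
for _f in (1, 10, 1000):
    _tf = _f * (_K1 + 1.0) / (_f + _K1 * _lenweight)
    assert _tf < _K1 + 1.0
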
def _get_wt(self, t):
    wid, = self._lexicon.termToWordIds(t)
    map = self._wordinfo[wid]
    return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
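
# _get_wt recomputes the IDF from first principles: with N documents and
# a word present in ft of them, wt = log(1 + N/ft).  For the 6-doc test
# corpus, a word in 3 docs gets scaled_int(log(3)) ~ scaled_int(1.10),
# matching the expected values in _ranking_idf above.
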
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    if abs(scaled1 - scaled2) > epsilon:
        raise AssertionError('{0} != {1}'.format(scaled1, scaled2))
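
# Usage sketch for eq, assuming the module's scaled_int with a 1024
# fixed-point scale, so epsilon is about 10 scaled units:
#
#     eq(scaled_int(1.10), scaled_int(1.0986))   # within tolerance: passes
#     eq(scaled_int(1.10), scaled_int(1.20))     # raises AssertionError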