def _python_search_wids(self, wids):
    """Score the documents of each word id using pure-Python arithmetic.

    For every wid in ``wids`` a fresh ``self.family.IF.Bucket()`` is
    filled with ``docid -> TF(D, t) * idf`` where

                                f(D, t) * (k1 + 1)
        TF(D, t) =  -------------------------------------------
                    f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    ``f(D, t)`` comes from ``self._wordinfo[t]``, ``len(D)`` from
    ``self._docweight`` and ``E(len(D))`` is the total document length
    divided by ``self.documentCount()``.

    Returns a list of ``(bucket, 1)`` pairs, one per wid and in the same
    order as ``wids``; an empty ``wids`` yields ``[]``.
    """
    if not wids:
        return []

    doc_count = float(self.documentCount())
    try:
        total_len = self._totaldoclen()
    except TypeError:
        # The call failed, so use the attribute itself as the total
        # (_totaldoclen has not yet been upgraded).
        total_len = self._totaldoclen
    mean_len = total_len / doc_count

    k1 = self.K1
    b = self.B
    k1_plus_one = k1 + 1.0
    one_minus_b = 1.0 - b
    doc_lengths = self._docweight

    scored = []
    for wid in wids:
        freqs = self._wordinfo[wid]  # {docid -> f(docid, wid)}
        idf = inverse_doc_frequency(len(freqs), doc_count)
        bucket = self.family.IF.Bucket()
        for docid, freq in freqs.items():
            norm = one_minus_b + b * doc_lengths[docid] / mean_len
            bucket[docid] = freq * k1_plus_one / (freq + k1 * norm) * idf
        scored.append((bucket, 1))
    return scored
def _c_search_wids(self, wids):
    """Score the documents of each word id by delegating to ``score``.

    For every wid in ``wids`` a fresh ``self.family.IF.Bucket()`` is
    handed to ``score`` together with the wid's ``(docid, frequency)``
    items, the docid -> length mapping, the wid's inverse document
    frequency and the mean document length.  The return value of
    ``score`` is ignored; the bucket passed in is what gets collected.

    The formula being evaluated is documented as

                                f(D, t) * (k1 + 1)
        TF(D, t) =  -------------------------------------------
                    f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    NOTE(review): K1 and B are not passed to ``score`` here; presumably
    it uses its own constants -- confirm against its implementation.

    Returns a list of ``(bucket, 1)`` pairs, one per wid and in the same
    order as ``wids``; an empty ``wids`` yields ``[]``.
    """
    if not wids:
        return []

    doc_count = float(self.documentCount())
    try:
        total_len = self._totaldoclen()
    except TypeError:
        # The call failed, so use the attribute itself as the total
        # (_totaldoclen has not yet been upgraded).
        total_len = self._totaldoclen
    mean_len = total_len / doc_count

    doc_lengths = self._docweight
    scored = []
    for wid in wids:
        freqs = self._wordinfo[wid]  # {docid -> f(docid, wid)}
        idf = inverse_doc_frequency(len(freqs), doc_count)
        bucket = self.family.IF.Bucket()
        if PY2:
            pairs = freqs.items()
        else:
            # When PY2 is false the items are materialized into a list
            # before being passed on.
            pairs = list(freqs.items())
        score(bucket, pairs, doc_lengths, idf, mean_len)
        scored.append((bucket, 1))
    return scored
def _search_wids(self, wids):
    """Score the documents of each word id with the Okapi TF formula.

    For every wid in ``wids`` a fresh ``IFBucket`` is filled with
    ``docid -> TF(D, t) * idf`` where

                                f(D, t) * (k1 + 1)
        TF(D, t) =  -------------------------------------------
                    f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    The number of documents is ``len(self._docweight)`` and the mean
    document length is ``self._totaldoclen`` divided by that count.

    Returns a list of ``(bucket, 1)`` pairs, one per wid and in the same
    order as ``wids``; an empty ``wids`` yields ``[]``.
    """
    if not wids:
        return []

    doc_count = float(len(self._docweight))
    mean_len = self._totaldoclen / doc_count

    k1 = self.K1
    b = self.B
    k1_plus_one = k1 + 1.0
    one_minus_b = 1.0 - b
    doc_lengths = self._docweight

    per_word = []
    for wid in wids:
        freqs = self._wordinfo[wid]  # {docid -> f(docid, wid)}
        idf = inverse_doc_frequency(len(freqs), doc_count)
        bucket = IFBucket()
        for docid, freq in freqs.items():
            length_norm = one_minus_b + b * doc_lengths[docid] / mean_len
            term_freq = freq * k1_plus_one / (freq + k1 * length_norm)
            bucket[docid] = term_freq * idf
        per_word.append((bucket, 1))
    return per_word
def query_weight(self, terms):
    """Return the square root of the summed squared IDFs of the query words.

    Each term is expanded to word ids through
    ``self._lexicon.termToWordIds``; the ids kept by
    ``self._remove_oov_wids`` each contribute the square of their inverse
    document frequency.  With no contributing ids the result is ``0.0``.
    """
    wids = [
        wid
        for term in terms
        for wid in self._lexicon.termToWordIds(term)
    ]
    doc_count = float(len(self._docweight))

    # Accumulate explicitly, in order, starting from a float zero.
    squared_total = 0.0
    for wid in self._remove_oov_wids(wids):
        idf = inverse_doc_frequency(len(self._wordinfo[wid]), doc_count)
        squared_total += idf ** 2.0
    return math.sqrt(squared_total)
def query_weight(self, terms):
    """Compute the Euclidean norm of the query words' IDF values.

    Terms are turned into word ids with ``self._lexicon.termToWordIds``
    and filtered through ``self._remove_oov_wids``.  The inverse document
    frequency of every remaining id is squared and added up, and the
    square root of that total is returned (``0.0`` when nothing remains).
    """
    wids = []
    for term in terms:
        wids.extend(self._lexicon.termToWordIds(term))

    n_docs = float(len(self._docweight))
    acc = 0.0
    for wid in self._remove_oov_wids(wids):
        postings = self._wordinfo[wid]
        acc += inverse_doc_frequency(len(postings), n_docs) ** 2.0
    return math.sqrt(acc)
def _search_wids(self, wids):
    """Pair each word id's docid -> weight mapping with its IDF.

    For every wid in ``wids`` the mapping stored in ``self._wordinfo``
    (docid to w(docid, wid)) is looked up.  A plain ``dict`` is converted
    with ``self.family.IF.Bucket``; any other mapping is passed through
    unchanged.

    Returns a list of ``(mapping, idf)`` pairs in the same order as
    ``wids``; an empty ``wids`` yields ``[]``.
    """
    if not wids:
        return []

    doc_count = float(len(self._docweight))
    pairs = []
    for wid in wids:
        # Out-of-vocabulary ids must be filtered out by the caller.
        assert wid in self._wordinfo
        weights = self._wordinfo[wid]
        idf = inverse_doc_frequency(len(weights), doc_count)
        if isinstance(weights, dict):
            weights = self.family.IF.Bucket(weights)
        pairs.append((weights, idf))
    return pairs
def _search_wids(self, wids):
    """Pair each word id's docid -> weight mapping with its IDF.

    For every wid in ``wids`` the mapping stored in ``self._wordinfo``
    (docid to w(docid, wid)) is looked up.  A plain ``dict`` is converted
    to an ``IFBucket``; any other mapping is passed through unchanged.

    Args:
        wids: sequence of word ids; every id must already be present in
            ``self._wordinfo`` (the caller is responsible for removing
            out-of-vocabulary ids).

    Returns:
        A list of ``(mapping, idf)`` pairs in the same order as ``wids``,
        where ``idf`` is ``inverse_doc_frequency`` of the number of
        entries in the mapping and the number of entries in
        ``self._docweight``.  An empty ``wids`` yields ``[]``.
    """
    if not wids:
        return []

    N = float(len(self._docweight))
    L = []
    for wid in wids:
        # Membership test via ``in`` rather than has_key(): has_key() is
        # not available on Python 3 dicts, while ``in`` works everywhere.
        assert wid in self._wordinfo  # caller responsible for OOV
        d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
        idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
        if isinstance(d2w, dict):
            d2w = IFBucket(d2w)
        L.append((d2w, idf))
    return L
def query_weight(self, terms):
    """Return the sum over the query's word ids of ``idf * (1 + K1)``.

    Each term is expanded to word ids through
    ``self._lexicon.termToWordIds``; only the ids kept by
    ``self._remove_oov_wids`` contribute.

    The score of a term t is TF(D, t) * IDF(Q, t).  IDF is computed
    directly, and TF(D, t) is taken at its stated upper bound of
    ``1 + K1``, so the result is the maximum score the query can reach.
    With no contributing ids the integer ``0`` is returned.
    """
    wids = []
    for term in terms:
        wids += self._lexicon.termToWordIds(term)

    doc_count = float(len(self._docweight))
    tf_upper_bound = 1.0 + self.K1

    total = 0
    for wid in self._remove_oov_wids(wids):
        postings = self._wordinfo[wid]
        total += inverse_doc_frequency(len(postings), doc_count) * tf_upper_bound
    return total
def _search_wids_NOTYET(self, wids):
    """Score the documents of each word id by delegating to ``score``.

    For every wid in ``wids`` a fresh ``IFBucket`` is handed to ``score``
    together with the wid's ``(docid, frequency)`` items, the
    docid -> length mapping, the wid's inverse document frequency and the
    mean document length (``self._totaldoclen`` divided by the number of
    entries in ``self._docweight``).  The return value of ``score`` is
    ignored; the bucket passed in is what gets collected.

    The formula being evaluated is documented as

                                f(D, t) * (k1 + 1)
        TF(D, t) =  -------------------------------------------
                    f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    NOTE(review): K1 and B are not passed to ``score`` here; presumably
    it uses its own constants -- confirm against its implementation.

    Returns a list of ``(bucket, 1)`` pairs, one per wid and in the same
    order as ``wids``; an empty ``wids`` yields ``[]``.
    """
    if not wids:
        return []

    doc_count = float(len(self._docweight))
    mean_len = self._totaldoclen / doc_count
    doc_lengths = self._docweight

    collected = []
    for wid in wids:
        freqs = self._wordinfo[wid]  # {docid -> f(docid, wid)}
        idf = inverse_doc_frequency(len(freqs), doc_count)
        bucket = IFBucket()
        score(bucket, freqs.items(), doc_lengths, idf, mean_len)
        collected.append((bucket, 1))
    return collected