def testMany(self):
    import random
    inputs = [(-i, i) for i in range(50)]
    reversed_inputs = inputs[:]
    reversed_inputs.reverse()
    # Test the N-best for a variety of n (1, 6, 11, ..., 46).
    for n in range(1, len(inputs) + 1, 5):
        expected = inputs[-n:]
        expected.reverse()
        random_inputs = inputs[:]
        random.shuffle(random_inputs)
        for source in inputs, reversed_inputs, random_inputs:
            # Try feeding them one at a time.
            nb = NBest(n)
            for item, score in source:
                nb.add(item, score)
            self.assertEqual(len(nb), n)
            self.assertEqual(nb.capacity(), n)
            self.assertEqual(nb.getbest(), expected)

            # And again in one gulp.
            nb = NBest(n)
            nb.addmany(source)
            self.assertEqual(len(nb), n)
            self.assertEqual(nb.capacity(), n)
            self.assertEqual(nb.getbest(), expected)

            for i in range(1, n + 1):
                self.assertEqual(nb.pop_smallest(), expected[-i])
            self.assertRaises(IndexError, nb.pop_smallest)
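# The tests above exercise an NBest class that is not defined in this
# excerpt.  What follows is a minimal, hypothetical stand-in (here called
# NBestSketch) that satisfies only the API the tests rely on -- add(),
# addmany(), capacity(), len(), getbest() returning (item, score) pairs
# best-first, and pop_smallest() raising IndexError when empty.  It is an
# illustrative sketch, not the real implementation.

from bisect import bisect_right


class NBestSketch:
    """Retain the `capacity` highest-scoring (item, score) pairs seen."""

    def __init__(self, capacity):
        self._capacity = capacity
        self._scores = []  # retained scores, ascending
        self._items = []   # retained items, parallel to _scores

    def capacity(self):
        return self._capacity

    def __len__(self):
        return len(self._scores)

    def add(self, item, score):
        self.addmany([(item, score)])

    def addmany(self, sequence):
        for item, score in sequence:
            if len(self._scores) >= self._capacity and score <= self._scores[0]:
                continue  # cannot displace anything already retained
            i = bisect_right(self._scores, score)
            self._scores.insert(i, score)
            self._items.insert(i, item)
            if len(self._scores) > self._capacity:
                # Drop the current smallest to stay within capacity.
                del self._scores[0]
                del self._items[0]

    def getbest(self):
        # Highest score first, as (item, score) pairs.
        return list(zip(reversed(self._items), reversed(self._scores)))

    def pop_smallest(self):
        if not self._scores:
            raise IndexError("pop_smallest() on an empty NBest")
        return self._items.pop(0), self._scores.pop(0)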
def mass_weightedUnion(L):
    "A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
    if len(L) < 2:
        return _trivial(L)
    # Balance unions as closely as possible, smallest to largest.
    merge = NBest(len(L))
    for x, weight in L:
        merge.add((x, weight), len(x))
    while len(merge) > 1:
        # Merge the two smallest so far, and add back to the queue.
        (x, wx), dummy = merge.pop_smallest()
        (y, wy), dummy = merge.pop_smallest()
        dummy, z = weightedUnion(x, y, wx, wy)
        merge.add((z, 1), len(z))
    (result, weight), dummy = merge.pop_smallest()
    return result
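# A hedged usage sketch for mass_weightedUnion above.  It assumes the
# function lives in a module that also has NBest, _trivial and weightedUnion
# available (weightedUnion and IIBucket really exist in BTrees.IIBTree); the
# docids and scores below are made up purely for illustration.

from BTrees.IIBTree import IIBucket


def _mass_union_example():
    # Three posting maps (docid -> score), each paired with a query weight.
    a = IIBucket()
    a[1] = 10
    a[2] = 20
    b = IIBucket()
    b[2] = 5
    b[3] = 7
    c = IIBucket()
    c[1] = 1

    # mass_weightedUnion repeatedly pops the two smallest mappings off the
    # NBest queue, unions them with weightedUnion (which applies their
    # weights to the scores), and pushes the result back with weight 1.
    # Merging smallest-to-largest keeps the intermediate unions as small
    # as possible, much like building a Huffman tree.
    result = mass_weightedUnion([(a, 1), (b, 2), (c, 1)])
    for docid, score in result.items():
        print(docid, score)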
def testOne(self):
    nb = NBest(1)
    nb.add('a', 0)
    self.assertEqual(nb.getbest(), [('a', 0)])

    nb.add('b', 1)
    self.assertEqual(len(nb), 1)
    self.assertEqual(nb.capacity(), 1)
    self.assertEqual(nb.getbest(), [('b', 1)])

    nb.add('c', -1)
    self.assertEqual(len(nb), 1)
    self.assertEqual(nb.capacity(), 1)
    self.assertEqual(nb.getbest(), [('b', 1)])

    nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
    self.assertEqual(len(nb), 1)
    self.assertEqual(nb.capacity(), 1)
    self.assertEqual(nb.getbest(), [('f', 5)])
def cosine_ranking(self, index, hits=250):
    """Calculate the ranking of the documents based on the cosine rule."""

    IDF = {}        # mapping term -> inverse document frequency
    cache = {}      # mapping term -> found docids
    wid_cache = {}  # mapping term -> wid
    N = len(index)  # length of collection
    nbest = NBest(hits)

    for term in self.words().keys():
        wid_cache[term] = wid = index.getLexicon().getWordId(term)
        docids = index.getStorage().getDocumentIdsForWordId(wid)
        cache[term] = docids

        # term frequency = number of documents the term appears in
        tf = len(docids)

        # calculate and store the inverse document frequency,
        # given as log(1 + N/TF)
        if tf == 0:
            IDF[term] = 0
        else:
            IDF[term] = log(1.0 + N / tf)

    terms = list(self.words().keys())
    num_terms = len(terms)
    get_frequency = index.getStorage().getWordFrequency

    # iterate over all found documents
    for docid in self.docIds():
        rank = 0.0       # ranking
        total_dwt = 0.0  # document weight

        for term in terms:
            if docid not in cache[term]:
                continue

            # document term frequency = the number of times a term
            # appears within a particular document
            try:
                dtf = get_frequency(docid, wid_cache[term])
            except KeyError:
                continue

            # document term weight = the weight of a term within a
            # document, calculated as:
            dtw = (1.0 + log(dtf)) * IDF[term]

            # query term frequency and query max frequency are set
            # to 1 by default
            qtf = qmf = 1

            # query term weight is the weight given to each term in
            # the query, calculated as:
            qtw = (0.5 + (0.5 * qtf / qmf)) * IDF[term] * self.words()[term]

            # add this term's contribution to the ranking
            rank += qtw * dtw
            total_dwt += dtw * dtw
            # print 'q:%12d/%10s: dtf=%8.5f dtw=%8.5f rank=%8.5f totaldtw=%8.5f' % (docid, term.encode('iso-8859-15'), dtf, dtw, rank, total_dwt)

        total_dwt = sqrt(total_dwt)

        if total_dwt == 0:
            rank = 0
        else:
            # print "\t", rank, total_dwt, rank / total_dwt
            # rank = rank / total_dwt  # normalization
            rank = rank / num_terms
            rank = int(rank * 1000 + 0.5)  # scale rank to be an integer

        nbest.add(docid, rank)

    self._result = IIBTree()
    for docid, score in nbest.getbest():
        self._result[docid] = score
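# A worked toy example of the term weights computed in cosine_ranking, with
# made-up numbers, showing how IDF, the document term weight (dtw) and the
# query term weight (qtw) combine into the final integer rank.  It re-derives
# the same formulas inline instead of going through the index classes.

from math import log, sqrt


def _rank_example():
    N = 1000                          # documents in the collection
    tf = 50                           # documents containing the term
    idf = log(1.0 + N / tf)           # inverse document frequency

    dtf = 3                           # occurrences of the term in one document
    dtw = (1.0 + log(dtf)) * idf      # document term weight

    qtf = qmf = 1                     # query term / max frequency defaults
    query_weight = 1                  # stands in for self.words()[term]
    qtw = (0.5 + 0.5 * qtf / qmf) * idf * query_weight

    rank = qtw * dtw
    total_dwt = sqrt(dtw * dtw)       # only checked against zero above;
                                      # the normalization by it is commented out
    num_terms = 1
    return int(rank / num_terms * 1000 + 0.5)   # same scaling as above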