def testMany(self):
    """Feed NBest items in ascending, descending, and shuffled order
    for a range of capacities and verify the winners each time."""
    import random
    pairs = [(-i, i) for i in range(50)]
    descending = pairs[:]
    descending.reverse()
    # Try capacities 1, 6, 11, ... 50.
    for capacity in range(1, len(pairs) + 1, 5):
        best = pairs[-capacity:]
        best.reverse()
        shuffled = pairs[:]
        random.shuffle(shuffled)
        for feed in pairs, descending, shuffled:
            # Feed the items one at a time.
            nb = NBest(capacity)
            for item, score in feed:
                nb.add(item, score)
            self.assertEqual(len(nb), capacity)
            self.assertEqual(nb.capacity(), capacity)
            self.assertEqual(nb.getbest(), best)
            # Feed them all at once via addmany.
            nb = NBest(capacity)
            nb.addmany(feed)
            self.assertEqual(len(nb), capacity)
            self.assertEqual(nb.capacity(), capacity)
            self.assertEqual(nb.getbest(), best)
            # pop_smallest drains the kept items smallest-score first,
            # then raises IndexError once empty.
            for k in range(1, capacity + 1):
                self.assertEqual(nb.pop_smallest(), best[-k])
            self.assertRaises(IndexError, nb.pop_smallest)
def testMany(self):
    """Stress NBest over several capacities and three input orderings,
    checking the kept items and the pop_smallest drain order."""
    import random
    base = [(-i, i) for i in range(50)]
    backwards = base[:]
    backwards.reverse()
    # Capacities 1, 6, 11, ... 50.
    for n in range(1, len(base) + 1, 5):
        winners = base[-n:]
        winners.reverse()
        jumbled = base[:]
        random.shuffle(jumbled)
        for stream in base, backwards, jumbled:
            # Incremental adds.
            keeper = NBest(n)
            for item, score in stream:
                keeper.add(item, score)
            self.assertEqual(len(keeper), n)
            self.assertEqual(keeper.capacity(), n)
            self.assertEqual(keeper.getbest(), winners)
            # Bulk add via addmany must agree.
            keeper = NBest(n)
            keeper.addmany(stream)
            self.assertEqual(len(keeper), n)
            self.assertEqual(keeper.capacity(), n)
            self.assertEqual(keeper.getbest(), winners)
            # Drain from the worst kept score upward; one more pop
            # must raise IndexError.
            for j in range(1, n + 1):
                self.assertEqual(keeper.pop_smallest(), winners[-j])
            self.assertRaises(IndexError, keeper.pop_smallest)
def query(self, query, nbest=10):
    """Run *query*; return (list of nbest (docid, score) pairs, total hit count)."""
    tree = QueryParser(self.lexicon).parseQuery(query)
    hits = tree.executeQuery(self.index)
    if hits is None:
        # Nothing matched at all.
        return [], 0
    picker = NBest(nbest)
    picker.addmany(hits.items())
    return picker.getbest(), len(hits)
def testOne(self):
    """A capacity-1 NBest retains only the single best-scoring item."""
    nb = NBest(1)

    def check(winner):
        # Invariants that must hold after every operation on nb.
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [winner])

    nb.add('a', 0)
    self.assertEqual(nb.getbest(), [('a', 0)])
    nb.add('b', 1)    # higher score displaces 'a'
    check(('b', 1))
    nb.add('c', -1)   # lower score is discarded
    check(('b', 1))
    # addmany keeps only the best of the whole batch.
    nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
    check(('f', 5))
def main(rt): index = rt["index"] files = rt["files"] times = {} ITERS = range(50) for i in range(11): for q in QUERIES: terms = q.split() for c in " OR ", " AND ": query = c.join(terms) t0 = clock() if TEXTINDEX: if c == " OR ": op = Or else: op = And _q = " ".join(terms) for _ in ITERS: b = index.query(_q, op).bucket() num = len(b) chooser = NBest(10) chooser.addmany(b.items()) results = chooser.getbest() else: try: for _ in ITERS: results, num = index.query(query) except: continue t1 = clock() print "<p>Query: \"%s\"" % query print "<br>Num results: %d" % num print "<br>time.clock(): %s" % (t1 - t0) key = query if i == 0: print "<ol>" for docid, score in results: url = path2url(files[docid]) fmt = '<li><a href="%s">%s</A> score = %s' print fmt % (url, url, score) print "</ol>" continue l = times.setdefault(key, []) l.append(t1 - t0) l = times.keys() l.sort() print "<hr>" for k in l: v = times[k] print "<p>Query: \"%s\"" % k print "<br>Min time: %s" % min(v) print "<br>All times: %s" % " ".join(map(str, v))
def query(self, query, nbest=10):
    """Return pair (mapping from docids to scores, num results).

    The num results is the total number of results before trimming
    to the nbest results.
    """
    parser = QueryParser(self.getLexicon())
    tree = parser.parseQuery(query)
    matches = tree.executeQuery(self.index)
    if matches is None:
        return [], 0
    best = NBest(nbest)
    best.addmany(matches.items())
    return best.getbest(), len(matches)
def cosine_ranking(self, index, hits=250): """ Calculate the ranking of the document based on the cosine rule. """ IDF = {} # mapping term -> inverse document frequency cache = {} # mapping term -> found docids wid_cache = {} # mapping term -> wid N = len(index) # length of collection nbest = NBest(hits) for term in self.words().keys(): wid_cache[term] = wid = index.getLexicon().getWordId(term) docids = index.getStorage().getDocumentIdsForWordId(wid) cache[term] = docids # term frequence = number of documents a term appears in tf = len(docids) # calc and store the inverse document frequency given as # log(1+N/TF) if tf == 0: IDF[term] = 0 else: IDF[term] = log(1.0 + N / tf) terms = list(self.words().keys()) num_terms = len(terms) get_frequency = index.getStorage().getWordFrequency for docid in self.docIds(): # iterate over all found documents rank = 0.0 # ranking total_dwt = 0.0 # document weight for term in terms: if not docid in cache[term]: continue # document term frequency = the number of times a term # appears within a particular document try: dtf = get_frequency(docid, wid_cache[term]) except KeyError: continue # document term weight = the weight of a term within a # document and is calculated as: dtw = (1.0 + log(dtf)) * IDF[term] # query term frequency and query max frequency are set # to 1 by default qtf = qmf = 1 # query term weight is the weight given to each term in the # query and is calculated as: qtw = (0.5 + (0.5 * qtf/qmf)) * IDF[term] * self.words()[term] # add this stuff to the ranking rank += (qtw * dtw) total_dwt += (dtw * dtw) # print 'q:%12d/%10s: dtf=%8.5f dtw=%8.5f rank=%8.5f totaldtw=%8.5f' % (docid, term.encode('iso-8859-15'),dtf, dtw,rank, total_dwt) total_dwt = sqrt(total_dwt) if total_dwt == 0: rank = 0 else: # print "\t",rank, total_dwt, rank/total_dwt # rank = rank / total_dwt # normalization rank = rank / num_terms rank = int(rank * 1000 + 0.5) # scale rank to be an integer nbest.add(docid, rank) self._result = IIBTree() for docid, 
score in nbest.getbest(): self._result[docid] = score