freq = int(p.payload) doclen = index.get_doc_meta(doc_id)['length'] wtd = tfidf(index, t, freq, doclen) scores[doc_id] += wtq * wtd doc_norm[doc_id] += wtd * wtd # `scores` at this points holds the counter of the cosine formula # we need to perform normslization dividing by sqrt(q_norm * doc_norm) for doc_id, score in scores.iteritems(): scores[doc_id] = scores[doc_id] / math.sqrt(q_norm * doc_norm[doc_id]) return scores if __name__ == "__main__": # Load index index = Index() index.load_from_file("../data/index.txt", "../data/meta.txt") # Input query query = "financial japan world news" # Retrieve documents using the vector space model res = retrieve_vsm(index, query) # Print relevance scores and document titles for the top 10 results for doc_id in sorted(res, key=res.get, reverse=True)[:10]: docmeta = index.get_doc_meta(doc_id) print res[doc_id], docmeta['title']