# Root of the query hierarchy; every concrete query type derives from it.
class Query(object):
    """Base query class (parent of all query classes)."""

    def __init__(self):
        """Create a query; the base class keeps no state of its own."""

    def get_matches(self, index):
        """Placeholder for matching against ``index``.

        Not implemented yet: the method currently returns None.
        """
        # TODO, return all document IDs (as a set) from the index
        return None


class TermQuery(Query):
    """Query containing a single search term."""

    def __init__(self, term):
        """Store ``term`` on the instance as ``self.term``."""
        self.term = term

    def get_matches(self, index):
        """Placeholder for matching the stored term against ``index``.

        Not implemented yet: the method currently returns None.
        """
        # TODO, return all document IDs (as a set) that contain the
        # search term
        return None


# Load index (runs at import time; Index is defined outside this section)
index = Index()
index.load_from_file("data/index.txt")

# TODO, construct the following (or similar) queries and get results
# - "states"
# - "NOT washington"
# - "united AND states"
# - "(us OR (united AND states)) AND NOT washington"
# NOTE(review): this fragment continues a function whose header and loop
# headers lie above the visible source. The nesting depth of the first
# statements is reconstructed from the loop variables they use (`t`, `p`,
# `doc_id`) -- confirm against the full file.
            # Posting payload parsed as an integer (presumably the frequency
            # of term `t` in this document -- TODO confirm).
            freq = int(p.payload)
            # 'length' field of the document's metadata record.
            doclen = index.get_doc_meta(doc_id)['length']
            # Presumably the weight of term `t` in the document; tfidf() is
            # defined elsewhere.
            wtd = tfidf(index, t, freq, doclen)
            # Accumulate the product of query-side and document-side weights.
            scores[doc_id] += wtq * wtd
            # Accumulate the squared document-side weights for normalization.
            doc_norm[doc_id] += wtd * wtd

    # `scores` at this point holds the numerator of the cosine formula;
    # we need to perform normalization, dividing by sqrt(q_norm * doc_norm).
    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated. The loop variable `score` is unused.
    # NOTE(review): this raises ZeroDivisionError if
    # q_norm * doc_norm[doc_id] is 0 -- confirm the weights are never zero.
    for doc_id, score in scores.iteritems():
        scores[doc_id] = scores[doc_id] / math.sqrt(q_norm * doc_norm[doc_id])

    return scores


# Script entry point: run one fixed example query and print the top results.
if __name__ == "__main__":
    # Load index (two files are passed: index data and document metadata)
    index = Index()
    index.load_from_file("../data/index.txt", "../data/meta.txt")

    # Input query
    query = "financial japan world news"

    # Retrieve documents using the vector space model
    res = retrieve_vsm(index, query)

    # Print relevance scores and document titles for the top 10 results:
    # document IDs are sorted by their score in `res`, highest first.
    for doc_id in sorted(res, key=res.get, reverse=True)[:10]:
        docmeta = index.get_doc_meta(doc_id)
        # Python 2 print statement: score and title separated by a space.
        print res[doc_id], docmeta['title']