Exemple #1
0
            freq = int(p.payload)
            doclen = index.get_doc_meta(doc_id)['length']
            wtd = tfidf(index, t, freq, doclen)
            scores[doc_id] += wtq * wtd
            doc_norm[doc_id] += wtd * wtd

    # `scores` at this point holds the numerator of the cosine formula;
    # we need to normalize it by dividing by sqrt(q_norm * doc_norm)
    for doc_id, score in scores.iteritems():
        scores[doc_id] = scores[doc_id] / math.sqrt(q_norm * doc_norm[doc_id])

    return scores


if __name__ == "__main__":
    # Demo entry point: run one hard-coded query against the stored index
    # and print the ten best-scoring documents.

    # Load the index and the document metadata from their data files
    index = Index()
    index.load_from_file("../data/index.txt", "../data/meta.txt")

    # Input query
    query = "financial japan world news"

    # Retrieve documents using the vector space model; `res` is keyed by
    # doc_id and holds each document's relevance score
    res = retrieve_vsm(index, query)

    # Print relevance scores and document titles for the top 10 results,
    # ordered from highest to lowest score
    for doc_id in sorted(res, key=res.get, reverse=True)[:10]:
        docmeta = index.get_doc_meta(doc_id)
        # Format "<score> <title>" as one string so the line is valid (and
        # prints the same text) with both the Python 2 print statement and
        # the Python 3 print function.
        print("%s %s" % (res[doc_id], docmeta['title']))