예제 #1
0
def compute_psr(query, query_id, path, k=5, n=4):
    """Run BM25 with pseudo-relevance feedback and write the second-pass results.

    The query is first run against the CACM index. The first ``k`` documents
    of that result each contribute words via ``get_top_words(..., n)``, and
    the query extended with those words is run a second time.

    Args:
        query: Query text. Must be a string, because the expansion terms are
            appended to it with a separating space.
        query_id: Identifier forwarded to ``write_dictionary``.
        path: Output location forwarded to ``write_dictionary``.
        k: Number of leading first-pass documents used as feedback.
        n: Word count passed to ``get_top_words`` for each feedback document.
    """
    first_pass = bm25_retrieval(query, cacm_corpus, cacm_index)

    # Assumes bm25_retrieval returns a mapping already ordered best-first, so
    # its first k keys are the top-k documents -- TODO confirm against callee.
    feedback_docs = list(first_pass)[:k]

    words_per_doc = [
        get_top_words(cacm_corpus[doc_id], n) for doc_id in feedback_docs
    ]
    expansion_terms = make_flat_list(words_per_doc)

    expanded = query + " " + " ".join(expansion_terms)
    new_results = bm25_retrieval(expanded, cacm_corpus, cacm_index)

    write_dictionary(new_results, query_id, path, "cacm_bm25_pseudo_relevance")
def retrieve_stemmed_cacm_bm25():
    """Run BM25 for every stemmed query against the stemmed CACM corpus.

    Each result is handed to ``write_dictionary`` together with the query id,
    the path ``CACM_STEMMED_BM25/<query id>.txt`` and the string
    ``cacm_stemmed_bm25``.
    """
    stemmed_queries = get_stemmed_queries()

    for query_id, query_text in stemmed_queries.items():
        print("Processing query: " + str(query_id))
        ranking = bm25_retrieval(
            query_text, cacm_stemmed_corpus, cacm_stemmed_index)
        output_path = "CACM_STEMMED_BM25/{}.txt".format(query_id)
        write_dictionary(ranking, query_id, output_path, "cacm_stemmed_bm25")
def retrieve_stopped_cacm_bm25():
    """Run BM25 over the CACM corpus for every query with stop words removed.

    Each query from ``get_queries`` is split on single spaces and terms found
    in ``get_stopwords`` are dropped. If no term survives, the original query
    text is used unchanged. Each result is handed to ``write_dictionary``
    together with the query id, the path ``CACM_STOPPED_BM25/<query id>.txt``
    and the string ``cacm_stopped_bm25``.
    """
    queries = get_queries()
    stop_words = get_stopwords()

    for query_id, query_text in queries.items():
        # Build the stopped query from this query's terms only. A single
        # accumulator shared across iterations would leak every earlier
        # query's terms into all later queries.
        kept_terms = [
            term for term in query_text.split(' ') if term not in stop_words
        ]
        stopped_query = " ".join(kept_terms) if kept_terms else query_text

        print("Processing query: " + str(query_id))
        write_dictionary(
            bm25_retrieval(stopped_query, cacm_corpus, cacm_index),
            query_id,
            "CACM_STOPPED_BM25/" + str(query_id) + ".txt",
            "cacm_stopped_bm25")
def retrieve_cacm_bm25():
    """Run BM25 for every unmodified query against the CACM corpus.

    Each result is handed to ``write_dictionary`` together with the query id,
    the path ``SNIPPET_BM25/<query id>.txt``, the string ``snippet_bm25`` and
    the raw query text as a fifth argument.
    """
    # NOTE(review): despite the function name, the output path and tag say
    # "snippet"; presumably the query text is passed for snippet generation
    # -- confirm against write_dictionary.
    all_queries = get_queries()

    for query_id, query_text in all_queries.items():
        print("Processing query: " + str(query_id))
        ranking = bm25_retrieval(query_text, cacm_corpus, cacm_index)
        output_path = "SNIPPET_BM25/{}.txt".format(query_id)
        write_dictionary(
            ranking, query_id, output_path, "snippet_bm25", query_text)