def compute_psr(query, query_id, path, k=5, n=4):
    """Run BM25 with pseudo-relevance-feedback expansion and save the result.

    The query is first run against the CACM corpus. The top ``n`` words of
    each of the first ``k`` returned documents are appended to the query,
    the expanded query is run again, and that second result is written out
    under the "cacm_bm25_pseudo_relevance" tag.

    Args:
        query: Query text; must be a string, because the expansion terms
            are concatenated onto it.
        query_id: Query identifier, forwarded unchanged to
            ``write_dictionary``.
        path: Output location, forwarded unchanged to ``write_dictionary``.
        k: Number of leading first-pass documents to take expansion terms
            from. Defaults to 5, the value previously hard-coded.
        n: Number of top words requested from each of those documents.
            Defaults to 4, the value previously hard-coded.
    """
    first_pass = bm25_retrieval(query, cacm_corpus, cacm_index)

    # NOTE(review): taking the first k items only selects the best documents
    # if bm25_retrieval returns its results ordered best-first -- confirm.
    feedback_docs = [doc_id for doc_id, _ in list(first_pass.items())[:k]]

    # One get_top_words result per feedback document; make_flat_list merges
    # them into the single sequence of terms joined onto the query below.
    words_per_doc = [get_top_words(cacm_corpus[doc_id], n)
                     for doc_id in feedback_docs]
    expanded = query + " " + " ".join(make_flat_list(words_per_doc))

    second_pass = bm25_retrieval(expanded, cacm_corpus, cacm_index)
    write_dictionary(second_pass, query_id, path,
                     "cacm_bm25_pseudo_relevance")
def retrieve_stemmed_cacm_bm25():
    """Run BM25 for every stemmed query over the stemmed CACM corpus.

    For each query returned by ``get_stemmed_queries`` a progress line is
    printed, the query is scored against the stemmed corpus and index, and
    the result is handed to ``write_dictionary`` with the output path
    "CACM_STEMMED_BM25/<query id>.txt" and the tag "cacm_stemmed_bm25".
    """
    stemmed_queries = get_stemmed_queries()
    # Iterate over a snapshot of the items, as the original code did.
    for query_id, query_text in list(stemmed_queries.items()):
        id_text = str(query_id)
        print(("Processing query: " + id_text))
        ranking = bm25_retrieval(query_text, cacm_stemmed_corpus,
                                 cacm_stemmed_index)
        output_path = "CACM_STEMMED_BM25/" + id_text + ".txt"
        write_dictionary(ranking, query_id, output_path, "cacm_stemmed_bm25")
def retrieve_stopped_cacm_bm25():
    """Run BM25 for every query after removing its stop words.

    Each query from ``get_queries`` is split on single spaces and the terms
    found in ``get_stopwords()`` are dropped. If nothing survives, the
    original query text is kept. Every query is then scored against the
    CACM corpus and the result is handed to ``write_dictionary`` with the
    output path "CACM_STOPPED_BM25/<query id>.txt" and the tag
    "cacm_stopped_bm25".
    """
    queries = get_queries()
    stop_words = get_stopwords()

    # Overwriting values of existing keys while iterating is safe: the set
    # of keys does not change.
    for query_id in queries:
        # Build the stopped text from scratch for each query. The previous
        # code used one accumulator shared by all queries, so every query
        # also carried the kept terms of all queries processed before it.
        # Each kept term is followed by a single space, as before.
        stopped_query = "".join(
            term + " "
            for term in queries[query_id].split(' ')
            if term not in stop_words
        )
        # Keep the original text when every term was a stop word.
        if stopped_query:
            queries[query_id] = stopped_query

    for query_id, query_text in list(queries.items()):
        print(("Processing query: " + str(query_id)))
        write_dictionary(bm25_retrieval(query_text, cacm_corpus, cacm_index),
                         query_id,
                         "CACM_STOPPED_BM25/" + str(query_id) + ".txt",
                         "cacm_stopped_bm25")
def retrieve_cacm_bm25():
    """Run BM25 for every query over the CACM corpus and save the results.

    For each query returned by ``get_queries`` a progress line is printed,
    the query is scored against the CACM corpus and index, and the result
    is handed to ``write_dictionary`` with the output path
    "SNIPPET_BM25/<query id>.txt", the tag "snippet_bm25", and the query
    text as a trailing fifth argument.
    """
    # NOTE(review): the output directory and tag say "snippet" although the
    # function name does not -- confirm this destination is intended.
    all_queries = get_queries()
    # Iterate over a snapshot of the items, as the original code did.
    for query_id, query_text in list(all_queries.items()):
        id_text = str(query_id)
        print(("Processing query: " + id_text))
        ranking = bm25_retrieval(query_text, cacm_corpus, cacm_index)
        output_path = "SNIPPET_BM25/" + id_text + ".txt"
        write_dictionary(ranking, query_id, output_path, "snippet_bm25",
                         query_text)