def main():
    """Drive one cosine-similarity run.

    Builds the inverted index, computes the idf table, clears any stale
    result CSV from a previous run, then scores every parsed query
    (after stopword removal) against the collection.

    NOTE(review): ``file`` and ``dest_folder`` are read here but not
    defined in this block — presumably module-level globals; confirm
    they exist (in Python 2, ``file`` is also a builtin type, so a
    missing global would make the path concatenation raise TypeError).
    """
    unigram = build_inverted_indexer()
    dict_idf = store_tfidfs(unigram)
    # Remove the previous run's output so fresh scores are not appended
    # onto stale ones.
    results_csv = dest_folder + file + ".csv"
    if os.path.exists(results_csv):
        os.remove(results_csv)
    queries = get_parsed_queries()
    for query_id, raw_query in queries.items():
        compute_cosine_similarity(query_id.strip(), query_stopping(raw_query), dict_idf)
def main():
    """Drive one cosine-similarity run with an explicit output name.

    Builds the inverted index, computes the idf table, clears any stale
    result CSV from a previous run, then scores every parsed query
    against the collection, tagging results with the system name.

    NOTE(review): ``dest_folder`` is read but not defined in this
    block — presumably a module-level global; confirm it exists.
    """
    # Stem of the output CSV file; the spelling ("similiarity") is
    # preserved because downstream readers open this exact filename.
    output_stem = "cosine_similiarity"
    system_name = "COSINE_SIM"
    unigram = build_inverted_indexer()
    dict_idf = store_tfidfs(unigram)
    # Remove the previous run's output so fresh scores are not appended
    # onto stale ones.
    results_csv = dest_folder + output_stem + ".csv"
    if os.path.exists(results_csv):
        os.remove(results_csv)
    queries = get_parsed_queries()
    for query_id, raw_query in queries.items():
        compute_cosine_similarity(query_id.strip(), raw_query, dict_idf, output_stem, system_name)
# --- Module-level state shared by the cosine-similarity post-processing ---
top_kwords = 20  # number of top expansion terms (not read in this block)
top_kreldocs_query = OrderedDict()  # query_id -> appended slices of top-k [doc_id, score] pairs
sorted_scores_query = OrderedDict()  # NOTE(review): shadowed by a local of the same name below — confirm intent
vocabulary = OrderedDict()
doc_vectors = OrderedDict()
#relevant_vectors=list()
#nonrelevant_vectors=list()
query_vector = OrderedDict()
folder = "cosine_results/"
filename = "cosine_similiarity_task1.csv"
queries = get_parsed_queries()          # parsed at import time (module-level side effect)
all_counters = counters_all_docs()      # per-document term counters, built at import time
all_docs = all_counters.keys()


def read_cosine_similarity_results():
    """Load the cosine-similarity CSV and keep the top-k docs per query.

    Groups CSV rows by the value in column 0, sorts the groups by that
    key, and appends the first ``kdocs`` entries of each group to the
    module-level ``top_kreldocs_query`` mapping.

    NOTE(review): ``kdocs`` is not defined anywhere in this block —
    presumably a module-level constant defined elsewhere; confirm,
    otherwise this raises NameError on first call.
    NOTE(review): ``sorted_scores_query`` below is a *local* list that
    shadows the module-level OrderedDict of the same name — confirm the
    module-level one is not expected to be updated here.
    """
    with open(folder + filename, "r") as fd:
        reader = csv.reader(fd)
        scores_query = OrderedDict()
        for rows in reader:
            # Columns assumed: rows[0] = query id, rows[2] = doc id,
            # rows[3] = score — TODO confirm against the CSV writer.
            scores_query.setdefault(rows[0], []).append([rows[2], rows[3]])
        sorted_scores_query = sorted(scores_query.items())
        for key, value in sorted_scores_query:
            top_kreldocs_query.setdefault(key, []).append(value[:kdocs])
def main():
    """Drive one BM25 ranking run.

    Parses the queries, builds per-document term counters, derives the
    collection size, and scores every query with BM25 using the
    available relevance judgments.
    """
    parsed_queries = get_parsed_queries()
    doc_counters = create_counters()
    corpus_size = len(doc_counters)
    query_counters = create_query_counters(parsed_queries)
    relevance = get_relevance_data()
    compute_bm25(doc_counters, corpus_size, query_counters, relevance)