def phase2(model):
    """Run the stopped and expanded queries with ``model`` (task id "phase2").

    Steps, in order:
      1. build the stopped inverted index and corpus through ``Stopper``;
      2. read the queries and relevance data through ``FileAccess``;
      3. stop the queries, then expand them with ``QueryExpander``, which is
         given the path ``<cwd>/task3a/task3a_cosine_stopped.txt``;
      4. hand everything to ``Retriever.run_all_queries``.

    :param model: forwarded unchanged as the ``model`` argument of
        ``run_all_queries``.
    """
    stopper = Stopper()
    index_and_corpus = stopper.build_stopped_inverted_index()
    stopped_index = index_and_corpus[0]
    stopped_documents = index_and_corpus[1]

    # File handed to QueryExpander; presumably the cosine results produced
    # by task3a -- confirm against QueryExpander.
    feedback_path = '/'.join((
        os.path.join(os.getcwd(), 'task3a'),
        "task3a_cosine_stopped.txt",
    ))

    retriever = Retriever()
    file_access = FileAccess()
    judgments = file_access.get_relevance_data()
    raw_queries = file_access.read_queries()

    stopped_queries = stopper.get_stopped_queries(raw_queries)
    expander = QueryExpander(
        query_dict=stopped_queries,
        filename=feedback_path,
        clean=False,
    )
    expanded_queries = expander.get_expanded_queries()

    retriever.run_all_queries(
        inverted_index=stopped_index,
        total_corpus=stopped_documents,
        relevance_data=judgments,
        query_dict=expanded_queries,
        model=model,
        task_id="phase2",
        notes="stopped_expanded",
        store_queries='stopped_expanded',
    )
def task1(notes=''):
    """Run the unmodified queries once for every entry of ``models``.

    The corpus is obtained with ``Retriever.get_corpus(True)`` and the queries
    and relevance data come from ``FileAccess``. The same inputs are reused
    for each model; only the ``model`` argument changes between runs.

    :param notes: forwarded unchanged as the ``notes`` argument of
        ``run_all_queries``.
    """
    retriever = Retriever()
    file_access = FileAccess()
    queries = file_access.read_queries()

    index_and_corpus = retriever.get_corpus(True)
    index = index_and_corpus[0]
    documents = index_and_corpus[1]
    judgments = file_access.get_relevance_data()

    # Everything except the model is identical across runs.
    shared_args = dict(
        inverted_index=index,
        total_corpus=documents,
        relevance_data=judgments,
        query_dict=queries,
        task_id="1",
        notes=notes,
    )
    # ``models`` is a module-level name defined outside this function.
    for model in models:
        retriever.run_all_queries(model=model, **shared_args)
def task3a(model):
    """Run the stopped queries against the stopped index with ``model``.

    The stopped inverted index and corpus come from
    ``Stopper.build_stopped_inverted_index`` and the queries are passed
    through ``Stopper.get_stopped_queries`` before retrieval. The run is
    tagged with task id "3a".

    :param model: forwarded unchanged as the ``model`` argument of
        ``run_all_queries``.
    """
    stopper = Stopper()
    index_and_corpus = stopper.build_stopped_inverted_index()
    stopped_index = index_and_corpus[0]
    stopped_documents = index_and_corpus[1]

    file_access = FileAccess()
    retriever = Retriever()
    raw_queries = file_access.read_queries()
    judgments = file_access.get_relevance_data()
    queries_without_stopwords = stopper.get_stopped_queries(raw_queries)

    retriever.run_all_queries(
        inverted_index=stopped_index,
        total_corpus=stopped_documents,
        relevance_data=judgments,
        query_dict=queries_without_stopwords,
        model=model,
        task_id="3a",
        notes="stopped",
        store_queries='stopped',
    )
def snippet_generation():
    """Prompt for a query id and print a snippet for each of the top results.

    Reads a query id from standard input, looks the query up at position
    ``query_id - 1`` of the object returned by ``FileAccess.read_queries``,
    runs the cosine model through ``Retriever.run_all_queries(..., ret=True)``
    and prints a generated snippet for the first four returned results.

    Input that is not an integer, or that lies outside 1..64, prints an error
    message and returns without running any retrieval.

    Returns:
        None. All output goes to standard output.
    """
    retriever = Retriever()
    file_access = FileAccess()
    query_dict = file_access.read_queries()

    raw_id = raw_input('Enter the query_id: \n')
    try:
        # Parse once; previously the text was re-parsed for every use and a
        # non-numeric entry aborted the program with an uncaught ValueError.
        query_id = int(raw_id)
    except ValueError:
        query_id = None
    if query_id is None or query_id > 64 or query_id < 1:
        print('No Query exists, please enter between 1 to 64')
        return

    # Query ids are 1-based on the prompt, 0-based in query_dict.
    query = query_dict[query_id - 1]
    print('Query: ' + query)

    # The same FileAccess instance serves both reads, as in the task runners.
    relevance_data = file_access.get_relevance_data()
    corpus = retriever.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]

    # NOTE(review): the full query_dict is passed here, not only the selected
    # query -- confirm that the returned ranking belongs to the chosen query.
    results = retriever.run_all_queries(
        inverted_index=inverted_index,
        total_corpus=total_corpus,
        relevance_data=relevance_data,
        query_dict=query_dict,
        model='cosine',
        task_id="1",
        notes='',
        ret=True,
    )
    top_results = results[0:4]

    # Generate every snippet before printing anything, keyed by document id
    # (the second field of each result entry).
    snippet_dictionary = {}
    for entry in top_results:
        doc_id = entry[1]
        document_text = " ".join(total_corpus[doc_id])
        # One generator per document, as before; presumably stateless and
        # safe to share -- verify against SnippetGenerator before hoisting.
        generator = SnippetGenerator()
        snippet_dictionary[doc_id] = generator.generate_snippet(
            document_text, query)

    print('\n')
    for entry in top_results:
        print('Doc-Id: ' + entry[1])
        print(snippet_dictionary[entry[1]])
        print('\n')
def task2(model):
    """Run the expanded queries (task id "2") using the cosine model.

    The queries read by ``FileAccess`` are expanded by ``QueryExpander``,
    which is given the path ``<cwd>/task1/task1_<model>_.txt``. The expanded
    queries are then run against the corpus from ``Retriever.get_corpus(True)``.

    :param model: used only to build the name of the file handed to
        ``QueryExpander``.
    """
    file_access = FileAccess()
    retriever = Retriever()
    queries = file_access.read_queries()

    index_and_corpus = retriever.get_corpus(True)
    index = index_and_corpus[0]
    documents = index_and_corpus[1]
    judgments = file_access.get_relevance_data()

    feedback_path = '/'.join((
        os.path.join(os.getcwd(), 'task1'),
        "".join(("task1_", model, "_.txt")),
    ))
    expander = QueryExpander(
        query_dict=queries,
        filename=feedback_path,
        clean=True,
    )
    expanded_queries = expander.get_expanded_queries()

    # NOTE(review): retrieval is fixed to 'cosine' regardless of ``model``;
    # confirm this is intended rather than ``model=model``.
    retriever.run_all_queries(
        inverted_index=index,
        total_corpus=documents,
        relevance_data=judgments,
        query_dict=expanded_queries,
        model='cosine',
        task_id="2",
        notes="expanded",
        store_queries='expanded',
    )