def task3a(model, raw_corpus_directory):
    """Run retrieval over a stopped corpus and save the ranked results.

    The raw corpus is parsed with stopping enabled, indexed with unigrams,
    and every query found in ``cacm.query.txt`` is scored with ``model``.
    Up to 100 results per query are written to
    ``output/task3a_<model>.txt`` (relative to the working directory at
    call time), one per line, in the form
    ``<query id> Q0 <doc> <rank> <score> <model>``.

    Args:
        model: Name of the retrieval model.  It is passed to
            ``Retriever.get_scores_for_docs``, embedded in the output file
            name and written as the last column of every result line.
        raw_corpus_directory: Directory handed to ``Parser.build_corpus``.
            ``cacm.query.txt`` and ``common_words.txt`` are read from its
            parent directory.

    Side effects:
        This function changes the process working directory to the parent
        of ``raw_corpus_directory`` and does not change it back.
    """
    project_directory = os.getcwd()
    output_directory = os.path.join(project_directory, "output")

    # Parse the raw corpus with stopping, then build unigram indexes.
    parser = Indexer.Parser()
    corpus_directory = parser.build_corpus(raw_corpus_directory, stopped=True)
    indexer = Indexer.InvertedIndexer(corpus_directory)
    indexer.ngram_indexer(1)

    # The Retriever exposes the different retrieval models.
    retriever = Retriever.Retriever(corpus_directory, indexer,
                                    project_directory)

    # The query and stop-word files sit next to the raw corpus directory.
    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)
    with open('cacm.query.txt', 'r') as query_file:
        soup = BeautifulSoup(query_file.read(), 'html.parser')
    with open('common_words.txt', 'r') as stop_words_file:
        stop_words = [line.strip() for line in stop_words_file]

    # Opening the results file fails if the directory is missing.
    if not os.path.exists(output_directory):
        os.mkdir(output_directory, 0o755)

    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    with open(file_name, 'w') as results:
        # Consume queries until the file is exhausted rather than assuming
        # a fixed number of them.
        while True:
            docno_tag = soup.find('docno')
            doc_tag = soup.find('doc')
            if docno_tag is None or doc_tag is None:
                break

            # int() tolerates whitespace around the number in the tag text;
            # writing the integer keeps that padding out of the results.
            query_no = int(docno_tag.text)
            # Remove <docno> first so the <doc> text is only the query.
            docno_tag.decompose()
            query = doc_tag.text.encode('utf-8')
            doc_tag.decompose()

            retriever.process_query(query, stopped=True,
                                    stopwords=stop_words)
            docs_and_scores = retriever.get_scores_for_docs(model, query_no)
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]

            # A model may return fewer than 100 documents for a query.
            for rank in range(min(100, len(docs), len(scores))):
                results.write(" ".join([str(query_no),
                                        "Q0",
                                        str(docs[rank]),
                                        str(rank + 1),
                                        str(scores[rank]),
                                        model]) + "\n")
reverse=True) expanded_query_list = [i for i, j in top_terms][:len(query_terms) + n] expanded_query = " ".join(expanded_query_list) return expanded_query print("Select a task number\n\t1. Task - 1 \n\t2. Task - 2") task = int(raw_input()) model = raw_input("Enter the model: ") raw_corpus_directory = raw_input( "Enter the raw corpus directory (html files): ") project_directory = os.getcwd() output_directory = os.path.join(project_directory, "output") # Parser (to process the raw corpus (no stopping)) p = Indexer.Parser() corpus_directory = p.build_corpus(raw_corpus_directory) # Indexer - Builds the inverted indexes for the processed corpus I = Indexer.InvertedIndexer(corpus_directory) I.ngram_indexer(1) # builds a unigram indexes for each word # Retriever - based on the model specified, this object can be # used to get the results. r = Retriever.Retriever(corpus_directory, I, project_directory) # Get the queries from the given file query_dic = {} # stores the queries; key - query ID, token - query os.chdir(project_directory) f = open('cacm.query.txt', 'r') soup = BeautifulSoup(f.read(), 'html.parser')
def _read_queries(query_path):
    """Parse a ``<doc>``/``<docno>`` query file into ``{query id: text}``."""
    with open(query_path, 'r') as query_file:
        soup = BeautifulSoup(query_file.read(), 'html.parser')

    queries = {}
    # Consume queries until the file is exhausted rather than assuming a
    # fixed number of them.
    while True:
        docno_tag = soup.find('docno')
        doc_tag = soup.find('doc')
        if docno_tag is None or doc_tag is None:
            break
        query_no = int(docno_tag.text)
        # Remove <docno> first so the <doc> text is only the query.
        docno_tag.decompose()
        queries[query_no] = doc_tag.text.encode('utf-8')
        doc_tag.decompose()
    return queries


def _write_ranked_results(out, query_no, docs, scores, model, limit=100):
    """Write at most ``limit`` ``<id> Q0 <doc> <rank> <score> <model>`` lines."""
    # A model may return fewer than ``limit`` documents for a query.
    for rank in range(min(limit, len(docs), len(scores))):
        out.write(" ".join([str(query_no),
                            "Q0",
                            str(docs[rank]),
                            str(rank + 1),
                            str(scores[rank]),
                            model]) + "\n")


def run_task(task, model, raw_corpus_directory):
    """Run task 1 (baseline retrieval) or task 2 (expanded-query retrieval).

    The raw corpus is parsed without stopping and indexed with unigrams.
    Queries are read from ``cacm.query.txt`` in the working directory at
    call time (the "project directory").  All result files are written to
    the ``output`` directory below it.

    * ``task == 1``: score every query with ``model`` and write up to 100
      results per query to ``task1_<model>.txt``.
    * ``task == 2``: read ``task1_<model>.txt``, expand each query with
      ``Query_Expander`` using the task 1 documents, the corpus text and the
      stop words from ``common_words.txt``, write the expanded queries to
      ``expanded_queries_<model>.txt``, then score them and write the
      results to ``task2_<model>.txt``.

    Any other ``task`` value only prints the closing message.

    Args:
        task: Task number, ``1`` or ``2``.
        model: Name of the retrieval model; passed to
            ``Retriever.get_scores_for_docs`` and used in file names and as
            the last column of every result line.
        raw_corpus_directory: Directory handed to ``Parser.build_corpus``.

    Raises:
        SystemExit: If task 2 is requested but the task 1 output file
            cannot be opened.

    Side effects:
        The working directory is left in the output directory (task 1) or
        the corpus directory (task 2).
    """
    project_directory = os.getcwd()
    output_directory = os.path.join(project_directory, "output")

    # Parser (to process the raw corpus (no stopping))
    parser = Indexer.Parser()
    corpus_directory = parser.build_corpus(raw_corpus_directory)

    # Indexer - builds the unigram inverted indexes for the processed corpus
    indexer = Indexer.InvertedIndexer(corpus_directory)
    indexer.ngram_indexer(1)

    # Retriever - based on the model specified, this object can be used to
    # get the results.
    retriever = Retriever.Retriever(corpus_directory, indexer,
                                    project_directory)

    # Return to the project directory: the query file is opened by a
    # relative name.
    os.chdir(project_directory)
    query_dic = _read_queries('cacm.query.txt')

    # task 1
    if task == 1:
        if not os.path.exists(output_directory):
            os.mkdir(output_directory, 0o755)
        os.chdir(output_directory)
        with open('task1_' + model + '.txt', 'w') as results:
            # Sorted ids keep the output ordered without requiring the ids
            # to be exactly 1..N.
            for query_no in sorted(query_dic):
                retriever.process_query(query_dic[query_no])
                docs_and_scores = retriever.get_scores_for_docs(
                    model, query_no)
                _write_ranked_results(results, query_no,
                                      docs_and_scores[0],
                                      docs_and_scores[1], model)

    # task 2
    if task == 2:
        # Read the output file from task 1.
        task1_path = os.path.join(output_directory,
                                  'task1_' + model + '.txt')
        try:
            task1_file = open(task1_path, 'r')
        except IOError:
            print("Run Task - 1 before Task - 2")
            raise SystemExit()

        # key = query ID (number), value = list of documents from task 1
        task1_output = {}
        with task1_file:
            for each_line in task1_file:
                fields = each_line.split()
                if not fields:
                    continue  # tolerate blank lines
                task1_output.setdefault(int(fields[0]), []).append(fields[2])

        # Stop words are read relative to the project directory.
        with open('common_words.txt', 'r') as stop_words_file:
            stop_words = [line.strip() for line in stop_words_file]

        # Load the corpus as {document name: list of tokens}.
        os.chdir(corpus_directory)
        corpus = {}
        for each_file in glob.glob('*.html'):
            doc_name = each_file[:-5]  # strip the ".html" suffix
            with open(each_file) as doc_file:
                corpus[doc_name] = doc_file.read().split()

        expanded_query_dic = {}
        expanded_path = os.path.join(output_directory,
                                     'expanded_queries_' + model + '.txt')
        with open(expanded_path, 'w') as expanded_file:
            for query_no in sorted(query_dic):
                processed_query = retriever.process_query(
                    query_dic[query_no], True)
                expanded_query_dic[query_no] = Query_Expander(
                    processed_query,
                    task1_output[query_no],
                    corpus,
                    stopwords=stop_words)
                expanded_file.write(str(query_no) + " " +
                                    expanded_query_dic[query_no] + "\n")

        task2_path = os.path.join(output_directory,
                                  'task2_' + model + '.txt')
        with open(task2_path, 'w') as results:
            for query_no in sorted(expanded_query_dic):
                retriever.process_query(expanded_query_dic[query_no])
                docs_and_scores = retriever.get_scores_for_docs(
                    model, query_no)
                _write_ranked_results(results, query_no,
                                      docs_and_scores[0],
                                      docs_and_scores[1], model)

    print("Results stored in " + output_directory + " directory")