def main():
    """Drive the full retrieval pipeline: parse the corpus, build the unigram
    index, run BM25 / tf-idf / query-likelihood ranking, pseudo-relevance
    feedback, stemmed and stopped variants, and finally snippet generation."""
    # File_path is a class that stores the file paths for required documents.
    f = File_path()
    f.declare_paths()

    # Parse the raw corpus into cleaned per-document files.
    corpus = Create_corpus(f.raw_files_folder, True, True)
    corpus.parse_files(f.raw_files_folder, f.parsed_file_folder, True, True)

    # Build the unigram inverted index and collect document statistics.
    indexer = Indexer()
    indexer.create_unigram_index(f.parsed_file_folder, f.index_file_path)
    c = Context()
    index = c.read_inverted_index(f.index_file_path)
    DL = c.calculate_document_length(f.parsed_file_folder)
    AvDL = c.calculate_avg_doc_length(f.parsed_file_folder)

    # Parse the raw queries, then load them as {query_id: query_text}.
    qp = Query_Parser()
    qp.parse_queries(f.query_file_path, f.parsed_query_file_path)
    query = {}
    # Fixed: the file handle was opened and never closed; `with` guarantees it.
    with open(f.parsed_query_file_path, "r") as query_file:
        for line in query_file:
            # Fixed: split(":") truncated query text containing a colon;
            # partition keeps everything after the first ":".
            qid, _, text = line.partition(":")
            query[qid] = text.strip()

    # Task 1 - baseline runs: BM25 (with relevance), tf-idf, query likelihood.
    bm = BM25WithRelevance("BM25WithRelevance")
    bm.retrieve_bm25_scores(query, f.parsed_file_folder, AvDL, DL, index,
                            f.relevance_file_path, f.output_folder_path)
    tf = Tf_idf("TfIdfRanking")
    tf.retrieve_tfidf_scores(DL, query, index, f.output_folder_path)
    # Renamed from `q` so the QL model no longer clobbers the parser binding.
    ql = QueryLikelihood("QLModel")
    ql.retrieve_QL_scores(DL, query, index, f.output_folder_path)

    # Task 2 - pseudo relevance feedback.
    pr = PseudoRelFeedback()
    pr.PRmain(f.parsed_file_folder, f.index_file_path, f.parsed_query_file_path,
              f.relevance_file_path, f.stop_file_path, f.output_folder_path)

    # Task 3 - stemmed queries and stopped-corpus runs.
    t = Task3()
    t.driver_stemmed(f)
    t.ranking_with_stopwords(f)

    # Phase 2 - snippet generation from the BM25 ranked list.
    sg = SnippetGeneration(f.raw_files_folder)
    sg.get_queries(f.parsed_query_file_path)
    output_file_path = f.output_folder_path + "/" + "BM25WithRelevance" + ".txt"
    sg.get_ranklist(output_file_path)
    sg.generate_snippet(f.snippet_file)
def driver_stemmed(self, f):
    """Run BM25 / tf-idf / query-likelihood ranking over the pre-stemmed
    corpus and stemmed queries referenced by the File_path object *f*.

    Results are written under f.output_folder_path, tagged by model name.
    """
    # Materialize the stemmed corpus as individual document files.
    s = Stemmed_parser(f.stemmed_file, f.stemmed_query_file)
    # Fixed: this local was named `dict`, shadowing the builtin.
    stem_docs = s.get_stem_documents()
    s.create_files_from_dictionary(stem_docs, f.stemmed_folder_path)
    query = s.create_queryList_stemmed_query(f.stemmed_query_file)

    # Index the stemmed corpus and gather document statistics.
    indexer = Indexer()
    indexer.create_unigram_index(f.stemmed_folder_path, f.stemmed_index_file_path)
    c = Context()
    index = c.read_inverted_index(f.stemmed_index_file_path)
    DL = c.calculate_document_length(f.stemmed_folder_path)
    AvDL = c.calculate_avg_doc_length(f.stemmed_folder_path)

    # Rank with the three models; the constructor string tags the output file.
    bm = BM25WithRelevance("BM25WithStemming")
    # NOTE(review): the original bound calculate_K's result to an unused local.
    # The call is kept in case it has side effects on `bm` — confirm and drop.
    bm.calculate_K(f.stemmed_folder_path, AvDL, DL)
    bm.retrieve_bm25_scores(query, f.stemmed_folder_path, AvDL, DL, index,
                            f.relevance_file_path, f.output_folder_path)
    tf = Tf_idf("TfidfWithStemming")
    tf.retrieve_tfidf_scores(DL, query, index, f.output_folder_path)
    ql = QueryLikelihood("QLWithStemming")
    ql.retrieve_QL_scores(DL, query, index, f.output_folder_path)
def ranking_with_stopwords(self, f):
    """Build a stopword-filtered corpus and run BM25 / tf-idf /
    query-likelihood ranking on it, using paths from the File_path *f*.

    Results are written under f.output_folder_path, tagged by model name.
    """
    # Produce the stopped corpus (and stopped queries) on disk.
    s = StoppedCorpus()
    s.stop_corpus(f.stop_corpus_folder_path, f.parsed_file_folder,
                  f.parsed_query_file_path, f.stop_query_path, f.stop_file_path)

    # Index the stopped corpus and collect document statistics.
    indexer = Indexer()
    indexer.create_unigram_index(f.stop_corpus_folder_path, f.stop_index_file_path)
    c = Context()
    index = c.read_inverted_index(f.stop_index_file_path)
    DL = c.calculate_document_length(f.stop_corpus_folder_path)
    AvDL = c.calculate_avg_doc_length(f.stop_corpus_folder_path)

    # Load queries as {query_id: query_text}.
    # NOTE(review): this reads the *unstopped* parsed queries rather than
    # f.stop_query_path — confirm that is intended.
    query_stopped = {}
    # Fixed: the file handle was opened and never closed; `with` guarantees it.
    with open(f.parsed_query_file_path, "r") as query_file:
        for line in query_file:
            # Fixed: split(":") truncated query text containing a colon.
            qid, _, text = line.partition(":")
            query_stopped[qid] = text.strip()

    bm1 = BM25WithRelevance("BM25WithStopping")
    bm1.retrieve_bm25_scores(query_stopped, f.stop_corpus_folder_path, AvDL, DL,
                             index, f.relevance_file_path, f.output_folder_path)
    tf1 = Tf_idf("TfIdfWithStopping")
    tf1.retrieve_tfidf_scores(DL, query_stopped, index, f.output_folder_path)
    ql1 = QueryLikelihood("QLModelWithStopping")
    ql1.retrieve_QL_scores(DL, query_stopped, index, f.output_folder_path)