Example #1
def main():
    # File_path is a class that stores the file paths for required documents.
    f = File_path()
    f.declare_paths()

    # Build the corpus and parse the raw documents into the parsed-files folder.
    corpus = Create_corpus(f.raw_files_folder, True, True)
    corpus.parse_files(f.raw_files_folder, f.parsed_file_folder, True, True)

    # Build the unigram inverted index over the parsed corpus.
    a = Indexer()
    a.create_unigram_index(f.parsed_file_folder, f.index_file_path)

    # Load the index and the corpus statistics needed by the ranking models.
    c = Context()
    index = c.read_inverted_index(f.index_file_path)
    DL = c.calculate_document_length(f.parsed_file_folder)
    AvDL = c.calculate_avg_doc_length(f.parsed_file_folder)

    # Parse the raw queries into the parsed-query file.
    q = Query_Parser()
    q.parse_queries(f.query_file_path, f.parsed_query_file_path)

    # Read the parsed queries into a dictionary of query_id -> query_text.
    query = dict()
    with open(f.parsed_query_file_path, "r") as query_file:
        for line in query_file:
            parts = line.split(":")
            query[parts[0]] = parts[1].strip()

    # Rank documents with BM25 using the supplied relevance judgements.
    bm = BM25WithRelevance("BM25WithRelevance")
    bm.retrieve_bm25_scores(query, f.parsed_file_folder, AvDL, DL, index, f.relevance_file_path, f.output_folder_path)

    # Rank documents with TF-IDF.
    tf = Tf_idf("TfIdfRanking")
    tf.retrieve_tfidf_scores(DL, query, index, f.output_folder_path)

    # Rank documents with the query likelihood model.
    ql = QueryLikelihood("QLModel")
    ql.retrieve_QL_scores(DL, query, index, f.output_folder_path)

    # Task 2 - pseudo relevance feedback
    pr = PseudoRelFeedback()
    pr.PRmain(f.parsed_file_folder, f.index_file_path, f.parsed_query_file_path, f.relevance_file_path, f.stop_file_path, f.output_folder_path)

    # Task 3 - runs on stemmed queries and on the stopped corpus
    t = Task3()
    t.driver_stemmed(f)
    t.ranking_with_stopwords(f)

    # Phase 2 - snippet generation for the BM25 run
    sg = SnippetGeneration(f.raw_files_folder)
    sg.get_queries(f.parsed_query_file_path)
    output_file_path = f.output_folder_path + "/BM25WithRelevance.txt"
    sg.get_ranklist(output_file_path)
    sg.generate_snippet(f.snippet_file)
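
The driver above delegates the actual ranking to BM25WithRelevance.retrieve_bm25_scores, whose body is not shown in these examples. As a reference point, here is a minimal sketch of relevance-weighted BM25 scoring under stated assumptions: the function name bm25_score, the parameter values k1 = 1.2, b = 0.75, k2 = 100, and the index layout {term: {doc_id: term_frequency}} are all illustrative, not the project's actual implementation.

import math

# A minimal sketch of BM25 scoring with relevance information, assuming an
# inverted index of the form {term: {doc_id: term_frequency}}. The parameter
# values and helper shape are illustrative assumptions, not the project's
# actual retrieve_bm25_scores implementation.
def bm25_score(query_terms, doc_id, index, DL, AvDL, N,
               rel_docs=None, k1=1.2, b=0.75, k2=100):
    rel_docs = rel_docs or set()          # judged-relevant doc ids for this query
    R = len(rel_docs)                     # number of known relevant documents
    dl = DL[doc_id]                       # length of this document
    K = k1 * ((1 - b) + b * dl / AvDL)    # length-normalisation component
    score = 0.0
    for term in set(query_terms):         # query_terms is a tokenised query (list of terms)
        postings = index.get(term, {})
        if doc_id not in postings:
            continue
        f_i = postings[doc_id]                            # term frequency in the document
        qf_i = query_terms.count(term)                    # term frequency in the query
        n_i = len(postings)                               # documents containing the term
        r_i = sum(1 for d in postings if d in rel_docs)   # relevant documents containing the term
        idf = math.log(((r_i + 0.5) / (R - r_i + 0.5)) /
                       ((n_i - r_i + 0.5) / (N - n_i - R + r_i + 0.5)))
        score += idf * ((k1 + 1) * f_i / (K + f_i)) * ((k2 + 1) * qf_i / (k2 + qf_i))
    return score

When no relevance judgements are available, rel_docs is empty and the weight reduces to the usual BM25 idf term.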
Example #2
    def driver_stemmed(self, f):
        # Parse the pre-stemmed corpus and queries into individual document files.
        s = Stemmed_parser(f.stemmed_file, f.stemmed_query_file)
        stemmed_docs = s.get_stem_documents()
        s.create_files_from_dictionary(stemmed_docs, f.stemmed_folder_path)
        query = s.create_queryList_stemmed_query(f.stemmed_query_file)

        # Index the stemmed corpus and collect its document-length statistics.
        a = Indexer()
        a.create_unigram_index(f.stemmed_folder_path, f.stemmed_index_file_path)
        c = Context()
        index = c.read_inverted_index(f.stemmed_index_file_path)
        DL = c.calculate_document_length(f.stemmed_folder_path)
        AvDL = c.calculate_avg_doc_length(f.stemmed_folder_path)

        # Run the three ranking models on the stemmed corpus.
        bm = BM25WithRelevance("BM25WithStemming")
        K = bm.calculate_K(f.stemmed_folder_path, AvDL, DL)
        bm.retrieve_bm25_scores(query, f.stemmed_folder_path, AvDL, DL, index, f.relevance_file_path, f.output_folder_path)
        tf = Tf_idf("TfidfWithStemming")
        tf.retrieve_tfidf_scores(DL, query, index, f.output_folder_path)
        ql = QueryLikelihood("QLWithStemming")
        ql.retrieve_QL_scores(DL, query, index, f.output_folder_path)
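
retrieve_QL_scores is likewise a black box in these snippets. Below is a minimal sketch of a query likelihood score with Jelinek-Mercer smoothing; the smoothing method, the lambda value of 0.35, and the assumed index layout {term: {doc_id: term_frequency}} are illustrative assumptions and may differ from what the QueryLikelihood class actually does.

import math

# A minimal sketch of query likelihood scoring with Jelinek-Mercer smoothing,
# assuming an inverted index of the form {term: {doc_id: term_frequency}}.
# The smoothing choice and lambda value are illustrative assumptions.
def query_likelihood_score(query_terms, doc_id, index, DL, lam=0.35):
    dl = DL[doc_id]                                   # document length
    cl = sum(DL.values())                             # total collection length
    score = 0.0
    for term in query_terms:                          # query_terms is a tokenised query
        postings = index.get(term, {})
        tf = postings.get(doc_id, 0)                  # term frequency in the document
        cf = sum(postings.values())                   # term frequency in the collection
        p = (1 - lam) * (tf / dl) + lam * (cf / cl)   # smoothed term probability
        if p > 0:
            score += math.log(p)                      # sum log-probabilities to avoid underflow
    return score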
Example #3
    def ranking_with_stopwords(self, f):
        # Remove stopwords from the parsed corpus and queries to build the stopped corpus.
        s = StoppedCorpus()
        s.stop_corpus(f.stop_corpus_folder_path, f.parsed_file_folder, f.parsed_query_file_path, f.stop_query_path, f.stop_file_path)

        # Index the stopped corpus and collect its document-length statistics.
        a = Indexer()
        a.create_unigram_index(f.stop_corpus_folder_path, f.stop_index_file_path)
        c = Context()
        index = c.read_inverted_index(f.stop_index_file_path)
        DL = c.calculate_document_length(f.stop_corpus_folder_path)
        AvDL = c.calculate_avg_doc_length(f.stop_corpus_folder_path)

        # Read the parsed queries into a dictionary of query_id -> query_text.
        query_stopped = dict()
        with open(f.parsed_query_file_path, "r") as query_file:
            for line in query_file:
                parts = line.split(":")
                query_stopped[parts[0]] = parts[1].strip()

        # Run the three ranking models on the stopped corpus.
        bm1 = BM25WithRelevance("BM25WithStopping")
        bm1.retrieve_bm25_scores(query_stopped, f.stop_corpus_folder_path, AvDL, DL, index, f.relevance_file_path, f.output_folder_path)

        tf1 = Tf_idf("TfIdfWithStopping")
        tf1.retrieve_tfidf_scores(DL, query_stopped, index, f.output_folder_path)

        q1 = QueryLikelihood("QLModelWithStopping")
        q1.retrieve_QL_scores(DL, query_stopped, index, f.output_folder_path)
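
StoppedCorpus.stop_corpus is also not shown; conceptually it rewrites the parsed documents and queries with stoplisted terms removed before they are re-indexed above. A minimal sketch of that filtering step follows, assuming a stoplist file with one stopword per line and whitespace-tokenised document text; the project's class may tokenise and write its output differently.

# A minimal sketch of stopword filtering, assuming the stoplist file holds one
# stopword per line and documents are plain whitespace-separated tokens.
def load_stopwords(stop_file_path):
    with open(stop_file_path, "r") as f:
        return {line.strip().lower() for line in f if line.strip()}

def remove_stopwords(text, stopwords):
    # Keep only the tokens that do not appear in the stoplist.
    return " ".join(tok for tok in text.split() if tok.lower() not in stopwords)

The same filter would be applied to each parsed query before the stopped-corpus runs above.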