Example #1
0
def phase2(model):
    """Run stopped, expanded queries against the stopped index.

    Builds the stopped inverted index, removes stop words from the queries,
    expands them with ``QueryExpander`` using the result file
    ``task3a/task3a_cosine_stopped.txt`` (relative to the current working
    directory), and hands everything to ``Retriever.run_all_queries``.

    :param model: retrieval model identifier forwarded unchanged to
        ``run_all_queries``.
    """
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]

    # NOTE(review): the expansion source is always the cosine run, whatever
    # ``model`` is -- presumably intentional; confirm with the task spec.
    # A single os.path.join keeps the path separator consistent instead of
    # gluing '/' onto an already-joined path.
    result_file = os.path.join(os.getcwd(), 'task3a',
                               "task3a_cosine_stopped.txt")

    r = Retriever()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    query_dict = fa.read_queries()

    stopped_queries = stop.get_stopped_queries(query_dict)
    qe = QueryExpander(query_dict=stopped_queries,
                       filename=result_file,
                       clean=False)
    expanded_stopped_queries = qe.get_expanded_queries()

    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_stopped_queries,
                      model=model,
                      task_id="phase2",
                      notes="stopped_expanded",
                      store_queries='stopped_expanded')
Example #2
0
def task1(notes=''):
    """Run the unmodified queries once per entry in the global ``models``.

    :param notes: free-form label forwarded to ``run_all_queries`` for every
        model.
    """
    retriever = Retriever()
    file_access = FileAccess()

    queries = file_access.read_queries()
    corpus_parts = retriever.get_corpus(True)
    index = corpus_parts[0]
    documents = corpus_parts[1]
    relevance = file_access.get_relevance_data()

    # Everything except the model is identical across runs.
    shared_args = dict(inverted_index=index,
                       total_corpus=documents,
                       relevance_data=relevance,
                       query_dict=queries,
                       task_id="1",
                       notes=notes)

    for current_model in models:
        retriever.run_all_queries(model=current_model, **shared_args)
Example #3
0
def task3a(model):
    """Run the stop-word-filtered queries against the stopped index.

    :param model: retrieval model identifier forwarded to
        ``run_all_queries``.
    """
    stopper = Stopper()
    stopped_parts = stopper.build_stopped_inverted_index()

    file_access = FileAccess()
    retriever = Retriever()

    raw_queries = file_access.read_queries()
    relevance = file_access.get_relevance_data()
    filtered_queries = stopper.get_stopped_queries(raw_queries)

    # Index and corpus come back together: position 0 is the inverted index,
    # position 1 the corpus passed as ``total_corpus``.
    retriever.run_all_queries(
        inverted_index=stopped_parts[0],
        total_corpus=stopped_parts[1],
        relevance_data=relevance,
        query_dict=filtered_queries,
        model=model,
        task_id="3a",
        notes="stopped",
        store_queries='stopped',
    )
Example #4
0
def snippet_generation():
    """Prompt for a query id and print snippets for the top four results.

    Reads a query id from standard input, rejects anything that is not an
    integer between 1 and 64, runs the cosine model through
    ``Retriever.run_all_queries`` with ``ret=True``, and prints a snippet
    (built by ``SnippetGenerator.generate_snippet``) for each of the first
    four returned results.

    Returns ``None``; all output goes to standard output.
    """
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()

    raw_id = raw_input('Enter the query_id: \n')
    try:
        query_id = int(raw_id)
    except ValueError:
        # Non-numeric input used to crash with a traceback; treat it like
        # any other invalid id.
        query_id = None
    if query_id is None or query_id > 64 or query_id < 1:
        # Single-argument parenthesized print emits the same text under the
        # Python 2 print statement and also parses on Python 3.
        print('No Query exists, please enter between 1 to 64')
        return

    # Ids are 1-based; the lookup below uses ``query_id - 1``.
    query = query_dict[query_id - 1]
    print('Query: ' + query)

    relevance_data = fa.get_relevance_data()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]

    # NOTE(review): the whole query_dict is passed here, not just the
    # selected query -- confirm that the returned list really belongs to
    # the query chosen above.
    results = r.run_all_queries(inverted_index=inverted_index,
                                total_corpus=total_corpus,
                                relevance_data=relevance_data,
                                query_dict=query_dict,
                                model='cosine',
                                task_id="1",
                                notes='',
                                ret=True)

    top_results = results[0:4]
    snippet_dictionary = {}

    # Generate every snippet before printing so output stays grouped.
    for each in top_results:
        docid = each[1]
        data = " ".join(total_corpus[docid])
        sg = SnippetGenerator()
        snippet_dictionary[docid] = sg.generate_snippet(data, query)

    print('\n')
    for each in top_results:
        print('Doc-Id: ' + each[1])
        print(snippet_dictionary[each[1]])
        print('\n')
Example #5
0
def task2(model):
    """Expand the queries from a task-1 result file and re-run them.

    The expansion source is ``task1/task1_<model>_.txt`` under the current
    working directory. The expanded queries are then run through
    ``Retriever.run_all_queries``.

    :param model: name used to pick the task-1 result file that feeds
        ``QueryExpander``.
    """
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()

    # A single os.path.join keeps the path separator consistent instead of
    # gluing '/' onto an already-joined path.
    result_file = os.path.join(os.getcwd(), 'task1',
                               "task1_" + model + "_.txt")

    qe = QueryExpander(query_dict=query_dict, filename=result_file, clean=True)
    expanded_queries = qe.get_expanded_queries()

    # NOTE(review): the run below always uses 'cosine', while ``model`` only
    # selects the input file -- confirm this is intended and not a leftover
    # that should read ``model=model``.
    r.run_all_queries(inverted_index=inverted_index,
                      total_corpus=total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_queries,
                      model='cosine',
                      task_id="2",
                      notes="expanded",
                      store_queries='expanded')