Example #1
    def generate_snippet(self, doc, query):
        fa = FileAccess()
        stop_words = fa.get_stop_words()
        query = query.split()
        # Remove stop words from the query before matching against the document
        stopped_content = [x for x in query if x not in stop_words]
        final_query = " ".join(stopped_content)

        fq_list = final_query.split()
        doc_list = doc.split()
        intr = list(set(doc_list).intersection(fq_list))

        # Record the position of the first occurrence of each matched term
        positions = []
        for each in intr:
            positions.append(doc_list.index(each))
        # Highlight the matched query terms by wrapping them in quotes
        final_doc = ''
        for i, each in enumerate(doc_list):
            if i in positions:
                final_doc += '"' + each + '" '
            else:
                final_doc += each + ' '

        return final_doc
Example #2
def phase2(model):
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]
    task3a_folder = os.path.join(os.getcwd(), 'task3a')
    file_name = "task3a_cosine_stopped.txt"
    r = Retriever()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    query_dict = fa.read_queries()
    result_file = os.path.join(task3a_folder, file_name)
    stopped_queries = stop.get_stopped_queries(query_dict)
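    # Expand the stopped queries using the results stored in the task3a result file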
    qe = QueryExpander(query_dict=stopped_queries,
                       filename=result_file,
                       clean=False)
    expanded_stopped_queries = qe.get_expanded_queries()
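    # Re-run retrieval with the expanded, stopped queries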
    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_stopped_queries,
                      model=model,
                      task_id="phase2",
                      notes="stopped_expanded",
                      store_queries='stopped_expanded')
Example #3
    def build_stopped_corpus(self):
        cwd = os.getcwd()
        clean_cacm = os.path.join(cwd, 'clean_cacm')
        stopped_cacm = os.path.join(cwd, 'stopped_cacm')
        fa = FileAccess()

        if not os.path.exists(clean_cacm):
            print "Clean corpus doesn't exist. It is created now. " \
                  "PLease put cleaned files inside the corpus folder"
            os.makedirs(clean_cacm, 0755)
            return
        if not os.path.exists(stopped_cacm):
            os.makedirs(stopped_cacm, 0755)

        stop_words = fa.get_stop_words()
        os.chdir(clean_cacm)

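        # Strip stop words from every cleaned file and write the result to stopped_cacm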
        for eachfile in glob.glob('*.html'):
            print eachfile
            content = open(eachfile).read()
            content = content.split()
            stopped_content = [x for x in content if x not in stop_words]
            final_content = " ".join(stopped_content)

            clean_file = open(os.path.join(stopped_cacm, eachfile), 'w')
            clean_file.write(final_content)
            clean_file.close()
Example #4
def evaluation():
    p_k = [5, 20]
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    base_dir = os.getcwd()
    all_runs = os.path.join(base_dir, 'all_runs')
    os.chdir(all_runs)
    e = Evaluation()

    for eachfile in glob.glob('*.txt'):
        e.evaluate(eachfile, p_k, base_dir, relevance_data)
Example #5
def snippet_generation():
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()
    query_id = raw_input('Enter the query_id: \n')
    if int(query_id) > 64 or int(query_id) < 1:
        print 'No such query exists; please enter an id between 1 and 64'
        return
    query = query_dict[int(query_id) - 1]
    print 'Query: ' + query
    relevance_data = fa.get_relevance_data()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]

    results = r.run_all_queries(inverted_index=inverted_index,
                                total_corpus=total_corpus,
                                relevance_data=relevance_data,
                                query_dict=query_dict,
                                model='cosine',
                                task_id="1",
                                notes='',
                                ret=True)

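    # Generate snippets for the top four ranked documents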
    results = results[0:4]
    snippet_dictionary = {}

    for each in results:
        docid = each[1]
        data = total_corpus[docid]
        data = " ".join(data)
        sg = SnippetGenerator()
        snippet = sg.generate_snippet(data, query)
        snippet_dictionary[docid] = snippet

    print '\n'
    for each in results:
        print 'Doc-Id: ' + each[1]
        print snippet_dictionary[each[1]]
        print '\n'
Example #6
    def __init__(self, filename, query_dict, top_k=12, n=5, clean=True):
        r = Retriever()
        # Load the stopped corpus when clean is False, otherwise the clean corpus
        if not clean:
            self.total_corpus = r.get_total_corpus(folder='stopped')
        else:
            self.total_corpus = r.get_total_corpus(folder='clean')
        self.k = top_k
        self.n = n
        fa = FileAccess()
        self.query_dict = query_dict
        self.results = fa.read_result_file(filename=filename)
Example #7
    def get_stopped_queries(self, query_dict):
        fa = FileAccess()
        stop_words = fa.get_stop_words()
        stopped_queries = {}
        for each in query_dict:
            query = query_dict[each]
            query_list = query.split()
            stopped_query = [x for x in query_list if x not in stop_words]
            stopped_query = " ".join(stopped_query)
            stopped_queries[each] = stopped_query

        return stopped_queries
Example #8
def task3b(model):
    stem = Stemmer()
    r = Retriever()
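    # Build the stemmed corpus and its inverted index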
    stem_total_corpus = stem.build_stemmed_data()
    stem_inv_index = stem.build_stemmed_index()
    fa = FileAccess()
    relevance_data = fa.get_relevance_data()
    stemmed_queries = fa.get_stem_queries()

    r.run_all_queries(inverted_index=stem_inv_index,
                      total_corpus=stem_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=stemmed_queries,
                      model=model,
                      task_id="3b",
                      notes="stemmed",
                      store_queries='stemmed')
Example #9
def task1(notes=''):
    r = Retriever()
    fa = FileAccess()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()

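    # Run every retrieval model over the full query set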
    for model in models:
        r.run_all_queries(inverted_index=inverted_index,
                          total_corpus=total_corpus,
                          relevance_data=relevance_data,
                          query_dict=query_dict,
                          model=model,
                          task_id="1",
                          notes=notes)
Example #10
def task3a(model):
    stop = Stopper()
    stopped_corpus = stop.build_stopped_inverted_index()
    stop_inv_index = stopped_corpus[0]
    stop_total_corpus = stopped_corpus[1]
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    relevance_data = fa.get_relevance_data()
    stopped_queries = stop.get_stopped_queries(query_dict)
    r.run_all_queries(inverted_index=stop_inv_index,
                      total_corpus=stop_total_corpus,
                      relevance_data=relevance_data,
                      query_dict=stopped_queries,
                      model=model,
                      task_id="3a",
                      notes="stopped",
                      store_queries='stopped')
Example #11
def main(arg=None):

    if arg is None:
        PlaylistUpdater(user="******").update()
        sys.exit(0)

    if arg != "update":
        update_all()
        sys.exit(0)

    fa = FileAccess()

    print_intro_prompt()

    while True:
        print("\nEnter a command: ")
        command = input().lower()

        ## Handle Command
        if re.search("^he*", command):
            print_help()
        elif re.search("^up*", command):
            update_all()
        elif re.search("^ne*", command):
            add_user(fa)
        elif re.search("^re*", command):
            print("Remove User")
        elif re.search("^in*", command):
            print_info(fa)
        elif re.search("^se*", command):
            schedule_cron()
        elif re.search("^pr*", command):
            print_cron()
        elif re.search("^qu*", command):
            print("Goodbye!")
            sys.exit(0)
        else:
            print("ERROR: Invalid Command")
            print("Enter 'HELP' for list of commands")

    return 0
Example #12
def task2(model):
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()
    task1_folder = os.path.join(os.getcwd(), 'task1')
    file_name = "task1_" + model + "_.txt"
    result_file = os.path.join(task1_folder, file_name)
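    # Expand the original queries using the stored task1 results for this model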
    qe = QueryExpander(query_dict=query_dict, filename=result_file, clean=True)
    expanded_queries = qe.get_expanded_queries()
    r.run_all_queries(inverted_index=inverted_index,
                      total_corpus=total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_queries,
                      model='cosine',
                      task_id="2",
                      notes="expanded",
                      store_queries='expanded')
Example #13
if __name__ == "__main__":

    print(
        'Please wait for around 30 minutes! It takes a while to fetch data from the OMDB API...'
    )
    print(
        'If you want to run the program with less data to avoid the delay, '
        'trim the contents of the inputs folder and re-run.\n')

    show = Show()
    unwatched_movie_dic = {}

    try:
        # Read the input text files listing all movies and the watched movies
        fa = FileAccess()
        movie_list = fa.read_file(fa.movie_list_txt)
        watched_movie_list = fa.read_file(fa.watched_movie_list_txt)
        unwatched_movie_list = set(movie_list) - set(watched_movie_list)
        # Print the movie, watched and unwatched movie ids (left disabled below)
        '''
        print('-----------> Movie Id: ')
        show.print_input_list(movie_list)
        print('-----------> Watched Movie Id: ')
        show.print_input_list(watched_movie_list)
        print('-----------> Unwatched Movie Id: ')
        show.print_input_list(unwatched_movie_list)
        '''
        # Call the OMDB API and load movie-related data
        api = OMDBApi()
        movie_dic = api.call_omdb_api(movie_list)
Example #14
    def evaluate(self, file_name, rank_list, base_dir, relevant_data):
        fa = FileAccess()
        scores = fa.read_score_file(file_name)
        pr_results = []
        ap_results = []
        mrr = []
        p_at_k = {}
        for each in rank_list:
            p_at_k[each] = []
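        # Walk each query's ranked list, accumulating precision, recall, MRR and average precision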
        for each in scores:
            ap = 0
            if each in relevant_data:
                relevant_files = relevant_data[each]
            else:
                continue
            data = scores[each]
            total_retrieved = 1
            total_relevant_retrieved = 0
            for rank, eachdata in enumerate(data, 1):
                qid = each
                docid = eachdata[0]
                doc_score = eachdata[1]
                if docid in relevant_files:
                    if total_relevant_retrieved == 0:
                        mrr.append(1.0/rank)
                    total_relevant_retrieved += 1
                relevance = 1 if docid in relevant_files else 0
                precision = float(total_relevant_retrieved)/total_retrieved
                if rank in rank_list:
                    tup = (qid, precision)
                    p_at_k[rank].append(tup)
                if relevance:
                    ap += precision
                recall = float(total_relevant_retrieved)/len(relevant_files)
                total_retrieved += 1
                tup = (qid, rank, docid, doc_score, str(relevance), precision, recall)
                pr_results.append(tup)
            if total_relevant_retrieved != 0:
                avg_p = float(ap)/total_relevant_retrieved
            else:
                avg_p = 0
            ap_results.append(avg_p)

        mean_avg_pr = sum(ap_results)/len(ap_results)
        mean_rr = sum(mrr)/len(mrr)

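        # Write the P@K, MRR, MAP and precision-recall results under evaluation_phase2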
        phase2_evaluation = os.path.join(base_dir, 'evaluation_phase2')

        if not os.path.exists(phase2_evaluation):
            os.makedirs(phase2_evaluation, 0755)

        pre_file = file_name.split('.')[0]
        for each in p_at_k:
            pk_file_name = pre_file + '_p@k'+str(each)+'.txt'
            pk_file = open(os.path.join(phase2_evaluation, pk_file_name), 'w')
            for e in p_at_k[each]:
                pk_file.write('{} {}\n'.format(e[0], e[1]))
            pk_file.close()

        mrr_filename = pre_file + '_mrr.txt'
        pr_filename = pre_file + '_precision_recall.txt'
        map_filename = pre_file + '_map_results.txt'

        mrr_file = open(os.path.join(phase2_evaluation, mrr_filename), 'w')
        mrr_file.write(str(mean_rr))
        mrr_file.close()

        map_file = open(os.path.join(phase2_evaluation, map_filename), 'w')
        map_file.write(str(mean_avg_pr))
        map_file.close()

        pr_file = open(os.path.join(phase2_evaluation, pr_filename), 'w')
        for e in pr_results:
            pr_file.write("{} {} {} {} {} {} {}\n".format(e[0], e[1], e[2], e[3], e[4], round(e[5], 3), round(e[6], 3)))
        pr_file.close()

        return