Esempio n. 1
0
def search_and_rank_query(query, inverted_index,num_docs_to_retrieve):
    """Run a single query end to end and return its top-ranked documents.

    The query is tokenized with ``Parse.tokenSplit``, the posting entry of
    every query term is read from the map-reduce store under
    ``MapReduceData/``, and a ``Searcher`` built on ``inverted_index``
    selects and ranks the matching documents. Progress is printed to stdout.

    :param query: query text handed to ``Parse.tokenSplit``.
    :param inverted_index: index passed to the ``Searcher`` constructor.
    :param num_docs_to_retrieve: forwarded to both the ranking step and
        ``retrieve_top_k``.
    :return: the result of ``searcher.ranker.retrieve_top_k``.
    """
    separator = '-------------------------------------'

    # tokenSplit fills the dict in place; its keys are the query terms.
    query_terms = {}
    Parse().tokenSplit(query, query_terms)
    term_list = list(query_terms)
    engine = Searcher(inverted_index)

    print(separator)
    print('Start import mapReduce')
    store = MapReduce.import_map_reduce('MapReduceData/')
    print('Done importing mapReduce')

    print(separator)
    print('Start build posting file')
    # One store read per query term, in query order.
    posting = {term: store.read_from(term) for term in term_list}
    print('Done building posting file')

    print(separator)
    print('Get relevant Doc')
    candidates = engine.relevant_docs_from_posting(term_list, posting)
    print('Done getting relevant Doc')

    print(separator)
    print('Start ranking docs')
    ranked = engine.ranker.rank_relevant_doc(
        candidates, query_terms, posting, num_docs_to_retrieve)
    print('Done ranking docs')

    return engine.ranker.retrieve_top_k(ranked, num_docs_to_retrieve)
Esempio n. 2
0
 def create_c_of_doc(top_relevant_docs, dictFromQuery, posting):
     """Build the term co-occurrence matrix of the given documents.

     For every document id in ``top_relevant_docs`` (the ``'META-DATA'`` key
     is skipped) the document entry is read from the map-reduce store under
     ``MapReduceData/`` and its term frequencies are accumulated as::

         c[t1][t2] += freq(t1, doc) * freq(t2, doc)

     A full row is only built for terms that appear in the query; every
     other term only receives its diagonal entry ``c[t][t]``.

     :param top_relevant_docs: mapping keyed by document id; only the keys
         are used here. (Per the original note the values look like
         ``[score, doc_tuple, {index}]`` -- not relied upon.)
     :param dictFromQuery: mapping whose keys are the query terms.
     :param posting: unused; kept so existing callers keep working.
     :return: ``{term: {other_term: value}}``.
     """
     map_reduce = MapReduce.import_map_reduce('MapReduceData/')
     c_matrix = {}  # {term: {'other term' : value}}
     for doc_id in top_relevant_docs:
         if doc_id == 'META-DATA':
             continue
         info_list = map_reduce.read_from(('Document', doc_id))
         # Test for a missing/empty entry BEFORE indexing into it: the
         # previous code read info_list[1] first, so an empty result raised
         # IndexError instead of being skipped.
         if not info_list:
             continue
         # Element 0 is the document's {term: frequency} dict.
         doc_term_freq = info_list[0]
         for term1, freq1 in doc_term_freq.items():
             row = c_matrix.setdefault(term1, {})
             if term1 in dictFromQuery:
                 # Query term: correlate it with every term of the doc
                 # (Cij, including Cii).
                 for term2, freq2 in doc_term_freq.items():
                     row[term2] = row.get(term2, 0) + freq1 * freq2
             else:
                 # Non-query term: only the diagonal Cii is needed, so
                 # there is no reason to scan all the other terms.
                 row[term1] = row.get(term1, 0) + freq1 * freq1
     return c_matrix
Esempio n. 3
0
def search_and_rank_query(query, inverted_index, num_docs_to_retrieve):
    """Run a single query against the letter-partitioned map-reduce stores.

    The query is tokenized with ``Parse.tokenSplit``; each term's posting
    entry is read from the store selected by the term's first letter
    (a-g, h-q, r-z, anything else -> "Others"); a ``Searcher`` built on
    ``inverted_index`` then selects and ranks the matching documents.
    Progress is printed to stdout.

    :param query: query text handed to ``Parse.tokenSplit``.
    :param inverted_index: index passed to the ``Searcher`` constructor.
    :param num_docs_to_retrieve: forwarded to both the ranking step and
        ``retrieve_top_k``.
    :return: the result of ``searcher.ranker.retrieve_top_k``.
    """
    p = Parse()
    dictFromQuery = {}
    # tokenSplit fills the dict in place; its keys are the query terms.
    p.tokenSplit(query, dictFromQuery)
    query_as_list = [*dictFromQuery]
    searcher = Searcher(inverted_index)

    # The stores are loaded between the two progress messages so that the
    # messages actually bracket the work they announce.
    print('-------------------------------------')
    print('Start import mapReduce')
    map_reduce_ag = MapReduce.import_map_reduce('MapReduceData/AG/')
    map_reduce_hq = MapReduce.import_map_reduce('MapReduceData/HQ/')
    # NOTE(review): the sibling directories are all-caps ('AG', 'HQ');
    # confirm that 'Rz' matches the on-disk name on case-sensitive
    # filesystems.
    map_reduce_rz = MapReduce.import_map_reduce('MapReduceData/Rz/')
    map_reduce_other = MapReduce.import_map_reduce('MapReduceData/Others/')
    # NOTE(review): the Document store is loaded but never used in this
    # function; kept in case the load itself is relied upon -- confirm and
    # drop if not.
    map_reduce_doc = MapReduce.import_map_reduce('MapReduceData/Document/')
    print('Done importing mapReduce')

    print('-------------------------------------')
    print('Start build posting file')
    posting = {}
    query_as_list.sort(key=str.lower)
    for term in query_as_list:
        # Slice instead of term[0]: an empty token yields '' and falls
        # through to the "Others" store rather than raising IndexError.
        lower_letter = term[:1].lower()
        if 'a' <= lower_letter <= 'g':
            current_map = map_reduce_ag
        elif 'h' <= lower_letter <= 'q':
            current_map = map_reduce_hq
        elif 'r' <= lower_letter <= 'z':
            current_map = map_reduce_rz
        else:
            current_map = map_reduce_other
        # The lookup uses the lower-cased term; the posting dict stays
        # keyed by the term as it appears in the query.
        posting[term] = current_map.read_from(term.lower())
    print('Done building posting file')

    print('-------------------------------------')
    print('Get relevant Doc')
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list, posting)
    print('Done getting relevant Doc')

    print('-------------------------------------')
    print('Start ranking docs')
    ranked_docs = searcher.ranker.rank_relevant_doc(
        relevant_docs, dictFromQuery, posting, map_reduce_ag, map_reduce_hq,
        map_reduce_rz, map_reduce_other, num_docs_to_retrieve)
    print('Done ranking docs')
    return searcher.ranker.retrieve_top_k(ranked_docs, num_docs_to_retrieve)
Esempio n. 4
0
 def __init__(self):
     """Load the AG, HQ, RZ and Others map-reduce stores onto the instance."""
     load = MapReduce.import_map_reduce
     self.map_reduce_ag = load('MapReduceData/AG/')
     self.map_reduce_hq = load('MapReduceData/HQ/')
     self.map_reduce_rz = load('MapReduceData/RZ/')
     self.map_reduce_other = load('MapReduceData/Others/')