def search_and_rank_query(query, inverted_index, num_docs_to_retrieve):
    """Run a query against the stored index and return the top results.

    The query is tokenized with ``Parse.tokenSplit`` into a dict whose keys
    are used as the query terms. A posting entry is read for each term from
    the map-reduce store under ``'MapReduceData/'``, the searcher picks the
    relevant documents, and the searcher's ranker scores and trims them.

    Args:
        query: Raw query, passed unchanged to ``Parse.tokenSplit``.
        inverted_index: Passed unchanged to the ``Searcher`` constructor.
        num_docs_to_retrieve: Forwarded to both ``rank_relevant_doc`` and
            ``retrieve_top_k``.

    Returns:
        Whatever ``searcher.ranker.retrieve_top_k`` returns for the ranked
        documents.
    """
    separator = '-------------------------------------'

    # tokenSplit receives an empty dict; its keys afterwards are the terms.
    dictFromQuery = {}
    Parse().tokenSplit(query, dictFromQuery)
    query_terms = list(dictFromQuery)
    searcher = Searcher(inverted_index)

    print(separator)
    print('Start import mapReduce')
    index_store = MapReduce.import_map_reduce('MapReduceData/')
    print('Done importing mapReduce')

    print(separator)
    print('Start build posting file')
    # One store read per query term, in the order the terms appear in the dict.
    posting = {term: index_store.read_from(term) for term in query_terms}
    print('Done building posting file')

    print(separator)
    print('Get relevant Doc')
    candidates = searcher.relevant_docs_from_posting(query_terms, posting)
    print('Done getting relevant Doc')

    print(separator)
    print('Start ranking docs')
    ranker = searcher.ranker
    ranked = ranker.rank_relevant_doc(
        candidates, dictFromQuery, posting, num_docs_to_retrieve)
    print('Done ranking docs')

    return ranker.retrieve_top_k(ranked, num_docs_to_retrieve)
def create_c_of_doc(top_relevant_docs, dictFromQuery, posting):
    """Build a term co-occurrence matrix over the given documents.

    For each document id in ``top_relevant_docs`` (the ``'META-DATA'`` key is
    skipped) the record stored under ``('Document', doc_id)`` is read from
    the map-reduce store in ``'MapReduceData/'``. Element 0 of that record is
    used as the document's ``{term: frequency}`` mapping.

    The result accumulates, summed over all documents::

        c[t1][t2] += freq(t1, doc) * freq(t2, doc)

    A full row (every ``t2`` of the document) is kept only when ``t1`` is a
    query term. Every other term keeps just its diagonal entry ``c[t1][t1]``.

    Args:
        top_relevant_docs: Mapping whose keys are document ids; only the keys
            are used here.
        dictFromQuery: Container of query terms; used for membership tests
            only.
        posting: Unused; kept so existing callers keep working.

    Returns:
        dict of the form ``{term: {other_term: value}}``. Empty when there
        are no document ids to process.
    """
    doc_ids = [doc_id for doc_id in top_relevant_docs
               if doc_id != 'META-DATA']
    if not doc_ids:
        # Nothing to correlate, so skip loading the store from disk.
        return {}

    map_reduce = MapReduce.import_map_reduce('MapReduceData/')
    c_matrix = {}  # {term: {'other term': value}}

    for doc_id in doc_ids:
        info_list = map_reduce.read_from(('Document', doc_id))
        # Check for a missing/empty record BEFORE indexing into it.
        if not info_list:
            continue
        doc_term_freq = info_list[0]

        for term1, freq1 in doc_term_freq.items():
            row = c_matrix.setdefault(term1, {})
            if term1 in dictFromQuery:
                # Query term: correlate with every term of this document.
                for term2, freq2 in doc_term_freq.items():
                    row[term2] = row.get(term2, 0) + freq1 * freq2
            else:
                # Non-query term: only the diagonal (C_ii) is needed, so
                # there is no reason to scan the whole document again.
                row[term1] = row.get(term1, 0) + freq1 * freq1

    return c_matrix
def search_and_rank_query(query, inverted_index, num_docs_to_retrieve):
    """Run a query against the partitioned index and return the top results.

    The query is tokenized with ``Parse.tokenSplit`` into a dict whose keys
    are used as the query terms. Four map-reduce stores are loaded (``AG``,
    ``HQ``, ``RZ`` and ``Others``). Each term's posting entry is read from
    the store chosen by the term's lower-cased first character, using the
    lower-cased term as the lookup key. The searcher then selects relevant
    documents and its ranker scores and trims them.

    Args:
        query: Raw query, passed unchanged to ``Parse.tokenSplit``.
        inverted_index: Passed unchanged to the ``Searcher`` constructor.
        num_docs_to_retrieve: Forwarded to both ``rank_relevant_doc`` and
            ``retrieve_top_k``.

    Returns:
        Whatever ``searcher.ranker.retrieve_top_k`` returns for the ranked
        documents.
    """
    separator = '-------------------------------------'

    p = Parse()
    dictFromQuery = {}
    p.tokenSplit(query, dictFromQuery)
    query_as_list = [*dictFromQuery]
    searcher = Searcher(inverted_index)

    print(separator)
    print('Start import mapReduce')
    map_reduce_ag = MapReduce.import_map_reduce('MapReduceData/AG/')
    map_reduce_hq = MapReduce.import_map_reduce('MapReduceData/HQ/')
    # Directory name is upper-case like its siblings; the exact case matters
    # on case-sensitive filesystems.
    map_reduce_rz = MapReduce.import_map_reduce('MapReduceData/RZ/')
    map_reduce_other = MapReduce.import_map_reduce('MapReduceData/Others/')
    print('Done importing mapReduce')

    print(separator)
    print('Start build posting file')
    posting = {}
    query_as_list.sort(key=lambda x: x.lower())
    for term in query_as_list:
        # Slice instead of index so an empty term falls through to the
        # 'Others' store rather than raising IndexError.
        lower_letter = term[:1].lower()
        if 'a' <= lower_letter <= 'g':
            current_map = map_reduce_ag
        elif 'h' <= lower_letter <= 'q':
            current_map = map_reduce_hq
        elif 'r' <= lower_letter <= 'z':
            current_map = map_reduce_rz
        else:
            current_map = map_reduce_other
        # Postings stay keyed by the original term; the store is queried
        # with the lower-cased form.
        posting[term] = current_map.read_from(term.lower())
    print('Done building posting file')

    print(separator)
    print('Get relevant Doc')
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list, posting)
    print('Done getting relevant Doc')

    print(separator)
    print('Start ranking docs')
    ranked_docs = searcher.ranker.rank_relevant_doc(
        relevant_docs, dictFromQuery, posting,
        map_reduce_ag, map_reduce_hq, map_reduce_rz, map_reduce_other,
        num_docs_to_retrieve)
    print('Done ranking docs')

    return searcher.ranker.retrieve_top_k(ranked_docs, num_docs_to_retrieve)
def __init__(self):
    """Load four map-reduce stores and keep them on the instance.

    Each store is read with ``MapReduce.import_map_reduce`` from its own
    sub-directory of ``MapReduceData/``.
    NOTE(review): the directory names (AG / HQ / RZ / Others) suggest the
    stores are split by a term's first letter -- confirm against the code
    that writes them.
    """
    load_store = MapReduce.import_map_reduce
    self.map_reduce_ag = load_store('MapReduceData/AG/')
    self.map_reduce_hq = load_store('MapReduceData/HQ/')
    self.map_reduce_rz = load_store('MapReduceData/RZ/')
    self.map_reduce_other = load_store('MapReduceData/Others/')