def __init__(self, config, all_terms_dict):
    self.inverted_idx = all_terms_dict
    # self.postingDict = {}
    self.fileName = 'InvertedIndex'
    self.config = config
    # {term: [ordered list of where it appears: (file_id, lineNumber)]}
    self.thread_pool_size = 1
    avg_ram = (psutil.virtual_memory().available // 5) // (self.thread_pool_size + 1)
    path = 'MapReduceData/'
    self.avg_length = (avg_ram // sys.getsizeof((int(), str()))) // (8 / 10)
    # self.map_reduce = MapReduce(self.avg_length, self.thread_pool_size)
    self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size, path + 'AG/')
    self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size, path + 'HQ/')
    self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size, path + 'Rz/')
    self.map_reduce_other = MapReduce(self.avg_length, self.thread_pool_size, path + 'Others/')
    self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size, path + 'Document/')
    self.tmp_pos = {}
    # self.num_in_pos_tmp = 0
    self.num_in_pos_ag_tmp = [0]
    self.num_in_pos_hq_tmp = [0]
    self.num_in_pos_rz_tmp = [0]
    self.num_in_pos_other_tmp = [0]
    self.num_in_pos_doc_other = [0]
    self.Entitys = {}
    self.tmp_pos_ag = {}
    self.tmp_pos_hq = {}
    self.tmp_pos_rz = {}
    self.tmp_pos_other = {}
    self.tmp_pos_doc = {}
    # self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
    self.NUMBER_OF_PROCESSES = 5
    self.set_is_writting = {}
def page_rank(self):
    """
    Compute PageRank values. Each iteration requires two MapReduce passes: one
    to compute the sum of the PR values of all dangling pages, and one to
    compute the PR value of every page.
    :return: self.graph, with the PR values fully computed
    """
    iteration = 1  # iteration counter
    change = 1  # PR delta per round; start at 1 to guarantee at least one iteration
    while change > self.min_delta:
        print("Iteration: " + str(iteration))
        # Dangling pages (pages with no out-links) need special handling,
        # hence dangling_list below. dangling_list holds [the sum of the PR
        # values of all dangling pages]; dp is that sum.
        dangling_list = MapReduce.map_reduce(self.graph, self.ip_mapper, self.ip_reducer)
        if dangling_list:
            dp = dangling_list[0]
        else:
            dp = 0
        # MapReduce.map_reduce requires a two-argument reducer, but we need to
        # pass three arguments (the extra one is dp, the dangling-page PR sum),
        # so the lambda below closes over dp to achieve this.
        # new_pr is a list of (page name, computed PR value) tuples.
        new_pr = MapReduce.map_reduce(self.graph, self.pr_mapper,
                                      lambda x, y: self.pr_reducer_inter(x, y, dp))
        # Measure how much the PR values changed this round
        change = sum([abs(new_pr[i][1] - self.graph[new_pr[i][0]][0])
                      for i in range(self.num_of_pages)])
        print("Change: " + str(change))
        # Update the PR values
        for i in range(self.num_of_pages):
            self.graph[new_pr[i][0]][0] = new_pr[i][1]
        iteration += 1
    return self.graph
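# A self-contained sketch of the same dangling-page PageRank update, without
# the MapReduce wrapper, to make the per-iteration math concrete. The graph
# layout ({page: [pr, [out_links]]}) mirrors what self.graph[page][0] implies
# above; the damping factor 0.85 and min_delta value are assumptions, not
# values taken from this code.
def page_rank_sketch(graph, damping=0.85, min_delta=0.001):
    n = len(graph)
    change = 1
    while change > min_delta:
        # dp: PR mass of dangling pages (no out-links), shared across all pages
        dp = sum(pr for pr, links in graph.values() if not links)
        new_pr = {}
        for page in graph:
            incoming = sum(pr / len(links)
                           for pr, links in graph.values() if page in links)
            new_pr[page] = (1 - damping) / n + damping * (incoming + dp / n)
        change = sum(abs(new_pr[p] - graph[p][0]) for p in graph)
        for p in graph:
            graph[p][0] = new_pr[p]
    return graph

graph = {'A': [0.25, ['B', 'C']], 'B': [0.25, ['C']],
         'C': [0.25, ['A']], 'D': [0.25, []]}  # 'D' is a dangling page
print(page_rank_sketch(graph))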
def search_and_rank_query(query, inverted_index, num_docs_to_retrieve):
    p = Parse()
    dictFromQuery = {}
    p.tokenSplit(query, dictFromQuery)
    query_as_list = [*dictFromQuery]
    searcher = Searcher(inverted_index)
    # posting = utils.load_obj("posting")
    print('-------------------------------------')
    print('Start import mapReduce')
    map_reduce = MapReduce.import_map_reduce('MapReduceData/')
    print('Done importing mapReduce')
    posting = {}
    print('-------------------------------------')
    print('Start build posting file')
    for term in query_as_list:
        posting[term] = map_reduce.read_from(term)
    print('Done building posting file')
    print('-------------------------------------')
    print('Get relevant Doc')
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list, posting)
    print('Done getting relevant Doc')
    print('-------------------------------------')
    print('Start ranking docs')
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, dictFromQuery,
                                                    posting, num_docs_to_retrieve)
    print('Done ranking docs')
    return searcher.ranker.retrieve_top_k(ranked_docs, num_docs_to_retrieve)
def create_c_of_doc(top_relevant_docs, dictFromQuery, posting):
    # Load MapReduce data from file.
    # relevant doc: {num: [score, doc_tuple, {index}]}
    # c[term1, term2] = sum over docs k of (freq of term1 in doc k * freq of term2 in doc k)
    queryAsList = [*dictFromQuery]
    map_reduce = MapReduce.import_map_reduce('MapReduceData/')
    c_matrix = {}  # {term: {'other term': value}}
    for doc_id in top_relevant_docs.keys():
        if doc_id != 'META-DATA':
            info_list = map_reduce.read_from(('Document', doc_id))
            doc_term_freq_dict = info_list
            max_freq = info_list[1]
            if len(doc_term_freq_dict) == 0:
                continue
            doc_term_freq_dict = doc_term_freq_dict[0]
            for term_doc1, term_doc_freq1 in doc_term_freq_dict.items():
                # for queryIndex in top_relevant_docs[doc_id][2]:
                if term_doc1 not in c_matrix.keys():
                    c_matrix[term_doc1] = {}
                for term_doc2, term_doc_freq2 in doc_term_freq_dict.items():
                    if term_doc1 in dictFromQuery.keys() or term_doc1 == term_doc2:
                        if term_doc2 not in c_matrix[term_doc1]:
                            c_matrix[term_doc1][term_doc2] = 0
                        c_matrix[term_doc1][term_doc2] += term_doc_freq1 * term_doc_freq2
    # Cii, Cjj, Cij
    return c_matrix
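# A toy trace of the co-occurrence matrix built above, assuming per-document
# term-frequency dicts like those read back from the 'Document' store;
# c[t1][t2] accumulates tf(t1, d) * tf(t2, d) over the retrieved docs (only
# diagonal entries and rows for query terms are kept, matching the condition
# in the loop). The doc ids and terms below are illustrative.
docs = {'d1': {'cat': 2, 'dog': 1}, 'd2': {'cat': 1, 'fish': 3}}
query_terms = {'cat'}
c = {}
for freqs in docs.values():
    for t1, f1 in freqs.items():
        c.setdefault(t1, {})
        for t2, f2 in freqs.items():
            if t1 in query_terms or t1 == t2:
                c[t1][t2] = c[t1].get(t2, 0) + f1 * f2
print(c['cat'])  # {'cat': 5, 'dog': 2, 'fish': 3}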
import operator
import os
from time import time
from MapReduce import MapReduce

def main():
    print("Reading files...")
    input_files = os.listdir('Data')
    START_TIME = time()
    mapper = MapReduce(file_to_words, count_words)
    word_counts, MAPPING_TIME, REFORMATING_TIME, REDUCING_TIME = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()
    print("\nTOP 20 WORDS BY FREQUENCY\n")
    top20 = word_counts[:20]
    longest = max(len(word) for word, count in top20)
    for word, count in top20:
        print('%-*s: %5s' % (longest + 1, word, count))
    END_TIME = time()
    print("\nMapping time = {} s".format(MAPPING_TIME))
    print("Reformatting time = {} s".format(REFORMATING_TIME))
    print("Reducing time = {} s".format(REDUCING_TIME))
    print("Total running time = {} s".format(END_TIME - START_TIME))
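# file_to_words and count_words are not shown in this snippet; below is a
# minimal sketch of what main() above appears to expect, assuming a callable
# MapReduce(mapper, reducer) that hands each input filename to the mapper and
# each (word, [counts]) group to the reducer. The exact signatures are
# assumptions, not the original implementations.
import string

def file_to_words(filename):
    # mapper: read one file under Data/ and emit (word, 1) pairs
    output = []
    with open(os.path.join('Data', filename), encoding='utf-8') as f:
        for line in f:
            cleaned = line.lower().translate(
                str.maketrans('', '', string.punctuation))
            for word in cleaned.split():
                output.append((word, 1))
    return output

def count_words(item):
    # reducer: collapse one (word, [1, 1, ...]) group into (word, total)
    word, occurrences = item
    return (word, sum(occurrences))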
def search_and_rank_query(query, inverted_index, num_docs_to_retrieve):
    p = Parse()
    dictFromQuery = {}
    map_reduce_ag = MapReduce.import_map_reduce('MapReduceData/AG/')
    map_reduce_hq = MapReduce.import_map_reduce('MapReduceData/HQ/')
    map_reduce_rz = MapReduce.import_map_reduce('MapReduceData/Rz/')
    map_reduce_other = MapReduce.import_map_reduce('MapReduceData/Others/')
    map_reduce_doc = MapReduce.import_map_reduce('MapReduceData/Document/')
    p.tokenSplit(query, dictFromQuery)
    query_as_list = [*dictFromQuery]
    searcher = Searcher(inverted_index)
    # posting = utils.load_obj("posting")
    print('-------------------------------------')
    print('Start import mapReduce')
    # map_reduce = MapReduce.import_map_reduce('MapReduceData/')
    print('Done importing mapReduce')
    posting = {}
    print('-------------------------------------')
    print('Start build posting file')
    query_as_list.sort(key=lambda x: x.lower())
    for term in query_as_list:
        lower_letter = term[0].lower()
        current_map = map_reduce_other
        if 'a' <= lower_letter <= 'g':
            current_map = map_reduce_ag
        elif 'h' <= lower_letter <= 'q':
            current_map = map_reduce_hq
        elif 'r' <= lower_letter <= 'z':
            current_map = map_reduce_rz
        posting[term] = current_map.read_from(term.lower())
    print('Done building posting file')
    print('-------------------------------------')
    print('Get relevant Doc')
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list, posting)
    print('Done getting relevant Doc')
    print('-------------------------------------')
    print('Start ranking docs')
    ranked_docs = searcher.ranker.rank_relevant_doc(
        relevant_docs, dictFromQuery, posting, map_reduce_ag, map_reduce_hq,
        map_reduce_rz, map_reduce_other, num_docs_to_retrieve)
    print('Done ranking docs')
    return searcher.ranker.retrieve_top_k(ranked_docs, num_docs_to_retrieve)
__author__ = 'Chiru'
import sys
from MapReduce import MapReduce

mr = MapReduce()

# The mapper is called for every record in the data file
def mapper(record):
    # Structure of a record: record_type in the first field, order_id in the second
    record_type = record[0]
    order_id = record[1]
    # The mapper is called on all the orders first (the data file lists order
    # records before line_item records). Because the mapper's output is fed to
    # the reducer, everything to be displayed must be present in the mapper's
    # output, so every record is emitted with order_id as the key.
    if record_type == "order":
        mr.emit_intermediate(order_id, record)
    elif record_type == "line_item":
        mr.emit_intermediate(order_id, record)

# The reducer is called for every key in the output of the map phase
# (here, the global dictionary mr.intermediate)
def reducer(key, list_of_values):
    # list_of_values[0] is the order record and list_of_values[1:] are the
    # line_item records sharing the same order id. Every order is emitted
    # paired with each of its line_items.
    current = 1
    while current < len(list_of_values):
        mr.emit((list_of_values[0], list_of_values[current]))
        current = current + 1
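# A usage sketch for the join above, assuming the MapReduce.execute(file,
# mapper, reducer) driver used by the sibling snippets in this collection and
# a JSON-lines input file (one record per line). The sample file contents are
# illustrative, not from the original:
#   ["order", "1", "cust_42", "1993-08-13"]
#   ["line_item", "1", "widget", 2]
#   ["line_item", "1", "gadget", 1]
if __name__ == '__main__':
    with open(sys.argv[1]) as f:
        mr.execute(f, mapper, reducer)
    # Expected output: one (order, line_item) pair per line_item of each order.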
import sys
from MapReduce import MapReduce

width = 5
height = 5
mr = MapReduce()

def mapper(record):
    # record: (matrix name, row, col, value)
    name, row, col, value = record
    if name == 'a':
        # each entry of A contributes to every column of its result row
        for n in range(width):
            mr.emit_intermediate((row, n), record)
    else:
        # each entry of B contributes to every row of its result column
        for n in range(height):
            mr.emit_intermediate((n, col), record)

def reducer(key, values):
    a = [v for v in values if v[0] == 'a']
    b = [v for v in values if v[0] == 'b']
    total = 0
    for m in a:
        for n in b:
            # multiply A[row][k] by B[k][col] when the inner indices match
            if n[1] == m[2]:
                total = total + n[-1] * m[-1]
    mr.emit((key[0], key[1], total))

def main():
    # The original snippet ends at an empty main(); a minimal body is assumed
    # here, following the execute(file, mapper, reducer) pattern of the other
    # snippets in this collection.
    with open(sys.argv[1]) as f:
        mr.execute(f, mapper, reducer)
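# A tiny trace of the same one-pass matrix multiply, assuming records of the
# form (name, row, col, value) for sparse 2x2 matrices; the MapReduce
# framework is bypassed to keep the example self-contained.
from collections import defaultdict

records = [('a', 0, 0, 1), ('a', 0, 1, 2),
           ('b', 0, 0, 3), ('b', 1, 0, 4)]
groups = defaultdict(list)
w = h = 2
for rec in records:
    name, row, col, value = rec
    if name == 'a':
        for n in range(w):
            groups[(row, n)].append(rec)
    else:
        for n in range(h):
            groups[(n, col)].append(rec)
for key, values in sorted(groups.items()):
    a = [v for v in values if v[0] == 'a']
    b = [v for v in values if v[0] == 'b']
    total = sum(n[-1] * m[-1] for m in a for n in b if n[1] == m[2])
    print((key[0], key[1], total))
# (0, 0, 11): row 0 of A dotted with column 0 of B = 1*3 + 2*4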
from MapReduce import MapReduce

mr = MapReduce()

def mapper(record):
    # key: word
    # value: filename
    value = record[0]
    text = record[1]
    for key in text.split():
        mr.emit_intermediate(key, value)

def reducer(key, values):
    # key: word
    # value: list of filenames
    mr.emit((key, list(set(values))))

if __name__ == '__main__':
    import sys, json
    inputdata = open(sys.argv[1])
    # inputdata = open('./data/books.json')
    mr.execute(inputdata, mapper, reducer)
    with open('inverted_index.json', 'w') as outfile:
        json.dump(mr.result, outfile)
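# A tiny self-contained demo of the same inverted-index idea, assuming the
# [filename, text] record shape implied by record[0]/record[1] above; the
# file names and text below are illustrative.
def build_inverted_index(records):
    index = {}
    for filename, text in records:
        for word in text.split():
            index.setdefault(word, set()).add(filename)
    return {word: sorted(files) for word, files in index.items()}

demo = [["milton-paradise.txt", "of man's first disobedience"],
        ["blake-poems.txt", "songs of innocence and of experience"]]
print(build_inverted_index(demo)["of"])  # lists both filenames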
import sys
import json
from math import ceil
from MapReduce import MapReduce

map_reduce_obj = MapReduce()

def mapper(record):
    number_of_baskets = len(record)
    candidate_list = open(sys.argv[2])
    for candidate in candidate_list:
        count = 0
        candidate = json.loads(candidate.strip())
        for candidate_chunk in record:
            # the candidate is supported by a basket if it is a subset of it
            if not set(candidate) - set(candidate_chunk):
                count += 1
        map_reduce_obj.emit_intermediate(tuple(candidate), (count, number_of_baskets))

def reduce(key, list_of_value):
    total_count = 0
    total_baskets = 0
    for item in list_of_value:
        total_count += item[0]
        total_baskets += item[1]
    # keep candidates supported by at least 30% of all baskets
    threshold = ceil(total_baskets * 0.3)
    if total_count >= threshold:
        map_reduce_obj.emit([list(key), total_count])
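# This snippet defines mapper/reduce but no driver; a minimal one is assumed
# below, following the execute pattern of the sibling snippets in this
# collection: argv[1] is a JSON-lines file of basket chunks, and argv[2] is
# the candidate-itemset file read inside the mapper.
if __name__ == '__main__':
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reduce)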
import sys
from MapReduce import MapReduce

mr = MapReduce()

def mapper(record):
    # key: document identifier
    # value: document contents
    key, seq = record
    mr.emit_intermediate(seq[:-10], 1)  # Part 3

def reducer(key, list_of_values):
    # key: trimmed sequence
    # value: list of occurrence counts
    mr.emit(key)  # Part 4

with open(sys.argv[1]) as f:
    mr.execute(f, mapper, reducer)
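# Effect of the trim above, on an illustrative sequence: the last 10
# characters of each sequence are dropped before deduplication, so reads
# that differ only in their trailing bases collapse to one key.
print("ACGTACGTACGTACGTAC"[:-10])  # -> 'ACGTACGT'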
from MapperMatrixVector import MapperMatrixVector
from ReducerMatrixVector import ReducerMatrixVector
from MapReduce import MapReduce
from FileHelper import FileHelper

# Create instances for the mapper and reducer
# Note that the vector is stored in the instance
theReducerMatrixVector = ReducerMatrixVector()
theMapperMatrixVector = MapperMatrixVector('dataFiles/b')

# The file where the matrix is stored
matrixFile = ['dataFiles/A']

# MapReduce
theMapReducerMatrixVector = MapReduce(theMapperMatrixVector, theReducerMatrixVector,
                                      matrixFile, 0, 1)
resultDict = theMapReducerMatrixVector.execute()

# Write output
outFileDirectory = 'outputs/'
outfileName = 'matrixVectorResults.txt'
FileHelper.writeDictionnary(outFileDirectory + outfileName, resultDict)
import sys
import psutil
from concurrent.futures import ProcessPoolExecutor
from MapReduce import MapReduce

class Indexer:
    def __init__(self, config, all_terms_dict):
        self.inverted_idx = all_terms_dict
        # self.postingDict = {}
        self.fileName = 'InvertedIndex'
        self.config = config
        # {term: [ordered list of where it appears: (file_id, lineNumber)]}
        self.thread_pool_size = 1
        avg_ram = (psutil.virtual_memory().available // 10) // (self.thread_pool_size + 1)
        path = 'MapReduceData/'
        self.avg_length = (avg_ram // sys.getsizeof((int(), str()))) // (8 / 10)
        # self.map_reduce = MapReduce(self.avg_length, self.thread_pool_size)
        self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size, path + 'AG/')
        self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size, path + 'HQ/')
        self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size, path + 'Rz/')
        self.map_reduce_other = MapReduce(self.avg_length, self.thread_pool_size, path + 'Others/')
        self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size, path + 'Document/')
        # self.tmp_pos = {}
        # self.num_in_pos_tmp = 0
        self.num_in_pos_ag_tmp = [0]
        self.num_in_pos_hq_tmp = [0]
        self.num_in_pos_rz_tmp = [0]
        self.num_in_pos_other_tmp = [0]
        self.num_in_pos_doc = [0]
        self.Entitys = {}
        self.tmp_pos_ag = {}
        self.tmp_pos_hq = {}
        self.tmp_pos_rz = {}
        self.tmp_pos_other = {}
        self.tmp_pos_doc = {}
        # self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
        self.NUMBER_OF_PROCESSES = 5
        self.set_is_writting = {}

    def get_right_tmp_pos_and_num(self, first_letter):
        lower_letter = str(first_letter).lower()
        if 'a' <= lower_letter <= 'g':
            return [self.tmp_pos_ag, self.num_in_pos_ag_tmp, self.map_reduce_ag, 'ag']
        elif 'h' <= lower_letter <= 'q':
            return [self.tmp_pos_hq, self.num_in_pos_hq_tmp, self.map_reduce_hq, 'hq']
        elif 'r' <= lower_letter <= 'z':
            return [self.tmp_pos_rz, self.num_in_pos_rz_tmp, self.map_reduce_rz, 'rz']
        return [self.tmp_pos_other, self.num_in_pos_other_tmp, self.map_reduce_other, 'others']

    def save_left_over(self, dictionary, map_reduce):
        map_reduce.write_dict(dictionary)
        map_reduce.wait_untill_finish()

    def save_all_map_reduce(self):
        # Pass the bound methods to submit() so the saves actually run in the
        # pool (calling them inline would run them serially in this process).
        with ProcessPoolExecutor() as process_exector:
            process_exector.submit(self.map_reduce_ag.save_map_reduce)
            process_exector.submit(self.map_reduce_hq.save_map_reduce)
            process_exector.submit(self.map_reduce_rz.save_map_reduce)
            process_exector.submit(self.map_reduce_other.save_map_reduce)
            process_exector.submit(self.map_reduce_doc.save_map_reduce)

    def check_save_left_over_ag(self):
        if self.num_in_pos_ag_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_ag, self.map_reduce_ag)
            self.num_in_pos_ag_tmp[0] = 0
            self.map_reduce_ag.wait_untill_finish()

    def check_save_left_over_hq(self):
        if self.num_in_pos_hq_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_hq, self.map_reduce_hq)
            self.num_in_pos_hq_tmp[0] = 0
            self.map_reduce_hq.wait_untill_finish()

    def check_save_left_over_rz(self):
        if self.num_in_pos_rz_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_rz, self.map_reduce_rz)
            self.num_in_pos_rz_tmp[0] = 0
            self.map_reduce_rz.wait_untill_finish()

    def check_save_left_over_others(self):
        if self.num_in_pos_other_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_other, self.map_reduce_other)
            self.num_in_pos_other_tmp[0] = 0
            self.map_reduce_other.wait_untill_finish()

    def check_save_left_over_doc(self):
        if self.num_in_pos_doc[0] > 0:
            self.save_left_over(self.tmp_pos_doc, self.map_reduce_doc)
            self.num_in_pos_doc[0] = 0
            self.map_reduce_doc.wait_untill_finish()

    def save_all_left_overs(self):
        with ProcessPoolExecutor() as process_exector:
            process_exector.submit(self.check_save_left_over_ag)
            process_exector.submit(self.check_save_left_over_hq)
            process_exector.submit(self.check_save_left_over_rz)
            process_exector.submit(self.check_save_left_over_others)
            process_exector.submit(self.check_save_left_over_doc)

    def print_meta_data_len(self):
        print('________________________________________________')
        print('Ag:' + str(len(self.map_reduce_ag.meta_data)))
        print('HQ:' + str(len(self.map_reduce_hq.meta_data)))
        print('RZ:' + str(len(self.map_reduce_rz.meta_data)))
        print('OTHER:' + str(len(self.map_reduce_other.meta_data)))
        print('Doc:' + str(len(self.map_reduce_doc.meta_data)))
        print('________________________________________________')
        # return len(self.map_reduce_ag.meta_data) + len(self.map_reduce_hq.meta_data) + len(self.map_reduce_rz.meta_data) + len(self.map_reduce_other.meta_data) + len(self.map_reduce_ag.meta_data)

    def addEntitysToPosting(self, term, tweet_id, quantity):
        str_term = str(term)
        first_letter = str_term[0]
        tmp_pos, number_arr, map_reduce, _ = self.get_right_tmp_pos_and_num(first_letter)
        # first time seeing the entity (it might already be persisted)
        if term.upper() not in self.Entitys.keys() and term.upper() not in tmp_pos.keys() \
                and term.lower() not in map_reduce.meta_data.keys():
            self.Entitys[term.upper()] = (tweet_id, quantity)
        else:
            if term.upper() not in tmp_pos.keys():
                tmp_pos[term.upper()] = []
            if term.upper() in self.Entitys.keys():
                # second time seeing it
                self.inverted_idx[term.upper()] = 2
                tmp_pos[term.upper()].append(self.Entitys[term.upper()])
                tmp_pos[term.upper()].append((tweet_id, quantity))
                number_arr[0] += 2
            else:
                self.inverted_idx[term.upper()] += 1
                tmp_pos[term.upper()].append((tweet_id, quantity))
                number_arr[0] += 1

    def add_new_doc(self, document):
        """
        This function performs the indexing process for a document object.
        Saved information is captured via two dictionaries ('inverted index'
        and 'posting').
        :param document: a document that needs to be indexed.
        :return: -
        """
        document_dictionary = document.term_doc_dictionary  # {term: freq, term: freq}
        term_lst = [*document_dictionary]
        term_lst.sort(key=lambda x: x.lower())
        for i in range(len(term_lst)):
            term = term_lst[i]
            tmp_pos, number_arr, map_reduce, key = self.get_right_tmp_pos_and_num(term[0])
            try:
                if term[0].isupper() and " " in term:
                    self.addEntitysToPosting(term, document.tweet_id,
                                             document_dictionary[term])
                    continue
                if number_arr[0] >= self.avg_length:
                    map_reduce.write_dict(tmp_pos)
                    self.set_is_writting[key] = 1
                    number_arr[0] = 0
                if key in self.set_is_writting.keys():
                    print('Waiting for write to end')
                    map_reduce.wait_untill_finish()
                    print('Done waiting')
                    del self.set_is_writting[key]
                if term.lower() not in tmp_pos.keys():
                    tmp_pos[term.lower()] = []
                tmp_pos[term.lower()].append((document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except Exception:
                print('INVERTED: problem with the following key {}'.format(term[0]))
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc[0] += 1
        if self.num_in_pos_doc[0] >= self.avg_length:
            if 'doc' not in self.set_is_writting.keys():
                self.map_reduce_doc.write_dict(self.tmp_pos_doc)
                self.set_is_writting['doc'] = 1
            else:
                self.map_reduce_doc.wait_untill_finish()
                del self.set_is_writting['doc']
            self.num_in_pos_doc[0] = 0
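# A standalone illustration of the first-letter bucketing performed by
# get_right_tmp_pos_and_num above (bucket names only; the real method returns
# the matching tmp dict, counter, and MapReduce instance). The sample terms
# are illustrative.
def bucket_for(term):
    c = term[0].lower()
    if 'a' <= c <= 'g':
        return 'AG'
    if 'h' <= c <= 'q':
        return 'HQ'
    if 'r' <= c <= 'z':
        return 'Rz'
    return 'Others'

print([bucket_for(t) for t in ['apple', 'moon', 'tweet', '2020']])
# ['AG', 'HQ', 'Rz', 'Others']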
from MapReduce import MapReduce
import itertools
import sys

map_reduce_obj = MapReduce()

def mapper(record):
    # emit the friendship edge in both directions
    map_reduce_obj.emit_intermediate(record[0], record[1])
    map_reduce_obj.emit_intermediate(record[1], record[0])

def reducer(key, list_of_values):
    # every pair of this node's neighbours shares `key` as a common friend
    value_group = list(itertools.combinations(list_of_values, 2))
    for value in value_group:
        value = list(value)
        value.sort()
        value.append(key)
        map_reduce_obj.emit(value)

if __name__ == '__main__':
    input_data = open(sys.argv[1])
    map_reduce_obj.execute(input_data, mapper, reducer)
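# A self-contained trace of the same logic, assuming edge records like
# ["A", "B"]: for each node it emits the neighbour pairs that share it as a
# common friend (here, the three rotations of a triangle). The edges are
# illustrative.
from collections import defaultdict
from itertools import combinations

edges = [["A", "B"], ["A", "C"], ["B", "C"]]
adj = defaultdict(list)
for u, v in edges:
    adj[u].append(v)
    adj[v].append(u)
for node, neighbours in adj.items():
    for pair in combinations(neighbours, 2):
        print(sorted(pair) + [node])  # e.g. ['B', 'C', 'A'] for node 'A'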
def __init__(self):
    self.map_reduce_ag = MapReduce.import_map_reduce('MapReduceData/AG/')
    self.map_reduce_hq = MapReduce.import_map_reduce('MapReduceData/HQ/')
    self.map_reduce_rz = MapReduce.import_map_reduce('MapReduceData/RZ/')
    self.map_reduce_other = MapReduce.import_map_reduce('MapReduceData/Others/')
import sys
import psutil
from concurrent.futures import ProcessPoolExecutor
from MapReduce import MapReduce

class Indexer:
    def __init__(self, config, all_terms_dict):
        self.inverted_idx = all_terms_dict
        # self.postingDict = {}
        self.fileName = 'InvertedIndex'
        self.config = config
        # {term: [ordered list of where it appears: (file_id, lineNumber)]}
        self.thread_pool_size = 2
        avg_ram = (psutil.virtual_memory().available // self.thread_pool_size) // 10
        path = 'MapReduceData/'
        self.avg_length = (avg_ram // sys.getsizeof((int(), str()))) // (8 / 10)
        # self.map_reduce = MapReduce(self.avg_length, self.thread_pool_size)
        self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size, path + 'AG/')
        self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size, path + 'HQ/')
        self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size, path + 'Rz/')
        self.map_reduce_other = MapReduce(self.avg_length, self.thread_pool_size, path + 'Others/')
        self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size, path + 'Document/')
        self.tmp_pos = {}
        # self.num_in_pos_tmp = 0
        self.num_in_pos_ag_tmp = [0]
        self.num_in_pos_hq_tmp = [0]
        self.num_in_pos_rz_tmp = [0]
        self.num_in_pos_other_tmp = [0]
        self.num_in_pos_doc_other = [0]
        self.Entitys = {}
        self.tmp_pos_ag = {}
        self.tmp_pos_hq = {}
        self.tmp_pos_rz = {}
        self.tmp_pos_other = {}
        self.tmp_pos_doc = {}
        # self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
        self.NUMBER_OF_PROCESSES = 5

    def get_right_tmp_pos_and_num(self, first_letter):
        lower_letter = str(first_letter).lower()
        if 'a' <= lower_letter <= 'g':
            return [self.tmp_pos_ag, self.num_in_pos_ag_tmp, self.map_reduce_ag]
        elif 'h' <= lower_letter <= 'q':
            return [self.tmp_pos_hq, self.num_in_pos_hq_tmp, self.map_reduce_hq]
        elif 'r' <= lower_letter <= 'z':
            return [self.tmp_pos_rz, self.num_in_pos_rz_tmp, self.map_reduce_rz]
        return [self.tmp_pos_other, self.num_in_pos_other_tmp, self.map_reduce_other]

    def wait_untill_all_finish(self):
        self.map_reduce_ag.wait_untill_finish()
        self.map_reduce_hq.wait_untill_finish()
        self.map_reduce_rz.wait_untill_finish()
        self.map_reduce_other.wait_untill_finish()
        self.map_reduce_doc.wait_untill_finish()

    def save_left_over(self, dictionary, map_reduce):
        map_reduce.write_dict_func(dictionary)

    def check_save_left_over_ag(self):
        if self.num_in_pos_ag_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_ag, self.map_reduce_ag)
            self.num_in_pos_ag_tmp[0] = 0

    def check_save_left_over_hq(self):
        if self.num_in_pos_hq_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_hq, self.map_reduce_hq)
            self.num_in_pos_hq_tmp[0] = 0

    def check_save_left_over_rz(self):
        if self.num_in_pos_rz_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_rz, self.map_reduce_rz)
            self.num_in_pos_rz_tmp[0] = 0

    def check_save_left_over_others(self):
        if self.num_in_pos_other_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_other, self.map_reduce_other)
            self.num_in_pos_other_tmp[0] = 0

    def check_save_left_over_doc(self):
        if self.num_in_pos_doc_other[0] > 0:
            self.save_left_over(self.tmp_pos_doc, self.map_reduce_doc)
            self.num_in_pos_doc_other[0] = 0

    def save_all_left_overs(self):
        # self.check_save_left_over_ag()
        # self.check_save_left_over_doc()
        # self.check_save_left_over_hq()
        # self.check_save_left_over_rz()
        # self.check_save_left_over_others()
        # Pass the bound methods to submit() so they run in the pool
        # (calling them inline would run them serially in this process).
        with ProcessPoolExecutor() as process_exector:
            process_exector.submit(self.check_save_left_over_ag)
            process_exector.submit(self.check_save_left_over_hq)
            process_exector.submit(self.check_save_left_over_rz)
            process_exector.submit(self.check_save_left_over_others)
            process_exector.submit(self.check_save_left_over_doc)

    def add_entitys_to_posting(self, term, tweet_id, quantity):
        first_letter = term[0]
        tmp_pos, number_arr, _ = self.get_right_tmp_pos_and_num(first_letter)
        if term.upper() not in self.Entitys.keys() and term.upper() not in tmp_pos.keys():
            self.Entitys[term.upper()] = (tweet_id, quantity)
        else:
            if term.upper() not in self.inverted_idx.keys():
                self.inverted_idx[term.upper()] = 2
            else:
                self.inverted_idx[term.upper()] += 1
            if term.upper() not in tmp_pos.keys():
                tmp_pos[term.upper()] = []
                tmp_pos[term.upper()].append(self.Entitys[term.upper()])
                del self.Entitys[term.upper()]
            tmp_pos[term.upper()].append((tweet_id, quantity))

    def add_new_doc(self, document):
        """
        This function performs the indexing process for a document object.
        Saved information is captured via two dictionaries ('inverted index'
        and 'posting').
        :param document: a document that needs to be indexed.
        :return: -
        """
        document_dictionary = document.term_doc_dictionary  # {term: freq, term: freq}
        term_lst = [*document_dictionary]
        term_lst.sort(key=lambda x: x.lower())
        for i in range(len(term_lst)):
            term = term_lst[i]
            tmp_pos, number_arr, map_reduce = self.get_right_tmp_pos_and_num(term[0])
            try:
                if term[0].isupper() and " " in term:
                    self.add_entitys_to_posting(term, document.tweet_id,
                                                document_dictionary[term])
                    continue
                if number_arr[0] >= self.avg_length:
                    map_reduce.write_dict_func(tmp_pos)
                    number_arr[0] = 0
                if term.lower() not in tmp_pos.keys():
                    tmp_pos[term.lower()] = []
                tmp_pos[term.lower()].append((document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except Exception:
                print('problem with the following key {}'.format(term[0]))
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc_other[0] += 1
        if self.num_in_pos_doc_other[0] >= self.avg_length:
            self.map_reduce_doc.write_dict_func(self.tmp_pos_doc)
            self.num_in_pos_doc_other[0] = 0
from MapperCountingWords import MapperCountingWords
from ReducerCountingWords import ReducerCountingWords
from MapReduce import MapReduce
from FileHelper import FileHelper

# Create instances for the mapper and reducer
theMapper = MapperCountingWords()
theReducer = ReducerCountingWords()

# Parse the file: one word per line
inFiles = ['dataFiles/text']  # we can have more than one text file
inFileParsed = 'dataFiles/textParsed'
FileHelper.transformTextIntoListOfWords(inFiles, inFileParsed)

# MapReduce
theMapReducer = MapReduce(theMapper, theReducer, [inFileParsed], silent=-1, nThreads=5)
resultDict = theMapReducer.execute()

# Write output
outFileDirectory = 'outputs/'
outfileName = 'coutingWordsResults.txt'
FileHelper.writeDictionnary(outFileDirectory + outfileName, resultDict)