def __init__(self, config, all_terms_dict):
    """Initialise the indexer's in-memory state and its MapReduce workers.

    Args:
        config: Configuration object; stored unchanged on ``self.config``.
        all_terms_dict: Term dictionary adopted as ``self.inverted_idx``
            (kept by reference, not copied).
    """
    self.inverted_idx = all_terms_dict
    self.fileName = 'InvertedIndex'
    self.config = config
    # Original author's note on the posting layout (which attribute it
    # describes is not visible from here -- TODO confirm):
    #   {term: [ordered list where appear : (file_id, lineNumber)]}

    self.thread_pool_size = 1

    # Take one fifth of the RAM that is available right now and split it
    # into thread_pool_size + 1 equal shares.
    available_bytes = psutil.virtual_memory().available
    ram_share = (available_bytes // 5) // (self.thread_pool_size + 1)

    # sys.getsizeof is shallow: this is the size of the 2-tuple object
    # itself, not of the int and str it refers to.
    entry_size = sys.getsizeof((int(), str()))

    # NOTE(review): floor-dividing by the float 8/10 makes avg_length a
    # float and scales the quotient *up* by roughly 1.25x. Left exactly as
    # written because the consumers of avg_length are not visible here.
    self.avg_length = (ram_share // entry_size) // (8 / 10)

    base_dir = 'MapReduceData/'

    def build_worker(subdir):
        # Every worker shares the same length budget and pool size and
        # differs only in the directory string it is given.
        return MapReduce(self.avg_length, self.thread_pool_size,
                         base_dir + subdir)

    self.map_reduce_ag = build_worker('AG/')
    self.map_reduce_hq = build_worker('HQ/')
    self.map_reduce_rz = build_worker('Rz/')
    self.map_reduce_other = build_worker('Others/')
    self.map_reduce_doc = build_worker('Document/')

    self.tmp_pos = {}

    # Counters are one-element lists rather than plain ints (presumably so
    # they can be updated in place through a shared reference -- confirm
    # against the code that increments them).
    self.num_in_pos_ag_tmp = [0]
    self.num_in_pos_hq_tmp = [0]
    self.num_in_pos_rz_tmp = [0]
    self.num_in_pos_other_tmp = [0]
    # NOTE(review): this name breaks the "<bucket>_tmp" pattern of its
    # siblings; kept as-is because other code may refer to it.
    self.num_in_pos_doc_other = [0]

    self.Entitys = {}

    # One buffer dict per bucket, mirroring the five workers above.
    self.tmp_pos_ag = {}
    self.tmp_pos_hq = {}
    self.tmp_pos_rz = {}
    self.tmp_pos_other = {}
    self.tmp_pos_doc = {}

    self.NUMBER_OF_PROCESSES = 5
    self.set_is_writting = {}
def main():
    """Count words across the files in ``Data`` and print the top 20.

    Builds a ``MapReduce`` callable from ``file_to_words`` and
    ``count_words``, runs it over the directory listing, prints the 20
    highest-count words in an aligned table, and finally prints the three
    phase timings returned by the callable plus the total wall-clock time.
    """
    print("Reading files...")
    # NOTE(review): os.listdir returns bare names without the 'Data/'
    # prefix; presumably file_to_words resolves them -- confirm.
    input_files = os.listdir('Data')

    start_time = time()
    mapper = MapReduce(file_to_words, count_words)
    word_counts, mapping_time, reformatting_time, reducing_time = mapper(
        input_files)

    # Stable ascending sort followed by a reverse. Deliberately not
    # sort(reverse=True): that would keep tied counts in their original
    # relative order, whereas this sequence flips them.
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print("\nTOP 20 WORDS BY FREQUENCY\n")
    top20 = word_counts[:20]
    # default=0 keeps an empty result (e.g. no input files) from raising
    # ValueError; the table is then simply empty.
    longest = max((len(word) for word, _count in top20), default=0)
    for word, count in top20:
        print('%-*s: %5s' % (longest + 1, word, count))

    end_time = time()
    print("\nMapping time = {} s".format(mapping_time))
    print("Reformatting time = {} s".format(reformatting_time))
    print("Reducing time = {} s".format(reducing_time))
    print("Total running time = {} s".format(end_time - start_time))
import sys from MapReduce import MapReduce width = 5 height = 5 mr = MapReduce() def mapper(record): name, row, col, value = record if name == 'a': for n in range(width): mr.emit_intermediate((row, n), record) else: for n in range(height): mr.emit_intermediate((n, col), record) def reducer(key, values): a = [v for v in values if v[0] == 'a'] b = [v for v in values if v[0] == 'b'] total = 0 for m in a: for n in b: if n[1] == m[2]: total = total + n[-1] * m[-1] mr.emit((key[0], key[1], total)) def main():
import itertools
import sys

from MapReduce import MapReduce

# Single engine instance shared by mapper(), reducer() and the entry point.
map_reduce_obj = MapReduce()


def mapper(record):
    """Emit the first two fields of ``record`` under each other as key.

    ``record[1]`` is emitted under key ``record[0]`` and ``record[0]``
    under key ``record[1]``, so each side of the pair sees the other.
    """
    map_reduce_obj.emit_intermediate(record[0], record[1])
    map_reduce_obj.emit_intermediate(record[1], record[0])


def reducer(key, list_of_values):
    """Emit one sorted pair plus ``key`` for every 2-combination of values.

    For each pair drawn from ``list_of_values`` (by position, so repeated
    values produce repeated pairs) the two members are sorted ascending,
    ``key`` is appended as the third element, and the resulting list is
    emitted.
    """
    # combinations() is consumed lazily; no intermediate list is needed.
    for pair in itertools.combinations(list_of_values, 2):
        triple = sorted(pair)
        triple.append(key)
        map_reduce_obj.emit(triple)


if __name__ == '__main__':
    # The context manager closes the input file even if execute() raises;
    # the bare open() used previously leaked the handle.
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reducer)