def test_map_reduce_with_less_keys_than_workers(self):
    map_reduce = MapReduce(num_workers=4)
    values = ['key_1', 'key_2', 'key_1']
    expected_result = [('key_1', 2), ('key_2', 1)]
    self.assertCountEqual(expected_result, map_reduce.map_reduce(values, mapper, reducer))
def test_no_interaction_between_map_reduce_runs(self):
    map_reduce = MapReduce(num_workers=4)
    values = ['val_1', 'val_2', 'val_1']
    expected_result = [('val_1', 2), ('val_2', 1)]
    self.assertCountEqual(expected_result, map_reduce.map_reduce(values, mapper, reducer))

    values = ['key_1', 'key_2', 'key_3', 'key_1']
    expected_result = [('key_1', 2), ('key_2', 1), ('key_3', 1)]
    self.assertCountEqual(expected_result, map_reduce.map_reduce(values, mapper, reducer))
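# The `mapper` and `reducer` fixtures used by these tests are not shown in this
# excerpt. A minimal sketch consistent with the expected results above (each
# input value becomes a key whose occurrences are summed) could look like the
# following -- an assumption, not the project's actual test helpers:
def mapper(value):
    return [(value, 1)]

def reducer(item):
    key, counts = item
    return (key, sum(counts))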
def __init__(self, input_dir, output_dir, num_mappers, num_reducers, clean_splited_data=True):
    self.input_dir = input_dir
    self.output_dir = output_dir
    self.num_mappers = num_mappers
    self.num_reducers = num_reducers
    self.clean_splited_data = clean_splited_data
    self.clean = clean_splited_data
    self.map_reduce = MapReduce()
    self.file_handler = FileHandler(input_dir, output_dir)
    self.file_handler.split_file(self.num_mappers)
def process_multi_cores(self, filenames):
    """
    Process the files in parallel with multiple processes.

    This differs slightly from the approach described in the LogMine paper:
    each "map job" is a chunk of multiple lines (instead of a single line),
    which makes better use of multiprocessing.

    Note that this method may return a different result on each run, and may
    differ from the single-core version "process_single_core". This is
    expected, as the result depends on the processing order, which is not
    guaranteed when tasks are performed in parallel.
    """
    segments = self.segmentator.create_segments(filenames)

    # Cluster all chunks in parallel
    mapper = MapReduce(map_segments_to_clusters, reduce_clusters, params=self.cluster_config)
    result = mapper(segments)
    if len(result) == 0:
        return []
    (key, clusters) = result[0]
    return clusters
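# Toy stand-in only: the project's real map_segments_to_clusters and
# reduce_clusters are defined elsewhere and implement LogMine's clustering.
# This sketch just illustrates the assumed chunk-level shape -- every map job
# emits under a single key so that process_multi_cores() can read one
# (key, clusters) pair out of the reduced result. The grouping rule below
# (bucketing lines by token count) is a deliberately trivial placeholder.
def map_segments_to_clusters(segment, params=None):
    clusters = {}
    for line in segment:
        signature = len(line.split())
        clusters.setdefault(signature, []).append(line)
    return [('clusters', clusters)]

def reduce_clusters(item, params=None):
    key, groups = item
    merged = {}
    for clusters in groups:
        for signature, lines in clusters.items():
            merged.setdefault(signature, []).extend(lines)
    return (key, merged)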
def top_similars_mapreduce(items, x, n=5, similarity=pearson_sim):
    """
    Returns the best matches for x from the items.
    Number of results and similarity function are optional params.
    """
    mapper = MapReduce(top_similars_map, top_similars_reduce)
    scores = mapper([(items, x, i, similarity) for i in range(4)])

    # Sort the list so the highest scores appear at the top
    scores.sort()
    scores.reverse()
    return scores[:n]
def get_recommendations_user_filtred_mapreduce(items, x, n=5, similarity=pearson_sim):
    """
    Returns recommendations for x from the items, based on items from similar users.
    """
    mapper = MapReduce(get_recommendations_user_filtred_map, get_recommendations_user_filtred_reduce)
    scores = mapper([(items, x, i, similarity) for i in range(4)])

    # Divide each total score by the total weighting to get an average
    rankings = [(sim_x_score / sim, item) for (item, sim, sim_x_score) in scores]
    rankings.sort()
    rankings.reverse()
    return rankings[:n]
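# Hypothetical usage of the two functions above. The data layout (a dict
# mapping each user to a dict of item ratings, as in classic collaborative-
# filtering examples) is an assumption about what `items` looks like, and
# pearson_sim, top_similars_map, etc. come from the surrounding project.
ratings = {
    'alice': {'Film A': 4.0, 'Film B': 3.5, 'Film C': 5.0},
    'bob':   {'Film A': 4.5, 'Film B': 3.0},
    'carol': {'Film B': 2.5, 'Film C': 4.0, 'Film D': 3.0},
}
print(top_similars_mapreduce(ratings, 'alice', n=2))
print(get_recommendations_user_filtred_mapreduce(ratings, 'alice', n=2))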
def page_rank(matrix, taxation=False, b=1, Es=[], S=set(), nbr_iterations=10000000, verbose=False):
    """
    Calculate the PageRank of each element based on the input matrix.

    We should validate that the matrix is stochastic; if it is not, we use the
    taxation method to avoid dead ends (introducing random surfers):

        v' = bMv + (1-b)e/n      (v: eigenvector)

    The term (1-b)e/n is a vector each of whose components has value (1-b)/n
    and represents the introduction, with probability 1 - b, of a new random
    surfer at a random page.

    The mathematical formulation for the iteration that yields topic-sensitive
    PageRank is similar to the equation we used for general PageRank. The only
    difference is how we add the new surfers. Suppose S is a set of integers
    consisting of the row/column numbers for the pages we have identified as
    belonging to a certain topic (called the teleport set). Let eS be a vector
    that has 1 in the components in S and 0 in the other components. Then the
    topic-sensitive PageRank for S is the limit of the iteration

        v' = bMv + (1-b)eS/|S|

    Here, as usual, M is the transition matrix of the Web, and |S| is the size
    of set S.
    """
    elements_length = len(matrix[0])
    eigenvectors = [1 / elements_length] * elements_length
    if Es and taxation:
        taxation_v = [((1 - b) / len(S) * e) for e in Es]
    else:
        taxation_v = [(1 - b) / elements_length] * elements_length if taxation else [0] * elements_length
    eigenvectors_p = [0] * elements_length
    itr = 0

    # Initialize map-reduce
    mapper = MapReduce(page_rank_calculation, page_rank_vector)

    while eigenvectors_p != eigenvectors and itr < nbr_iterations:
        if eigenvectors_p != [0] * elements_length:
            eigenvectors = list(eigenvectors_p)
        for k, v in mapper([(i, eigenvectors, matrix, taxation_v, b) for i in range(elements_length)]):
            eigenvectors_p[k] = v
        itr += 1
    if verbose:
        print(eigenvectors)
    return eigenvectors
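# The callbacks page_rank_calculation and page_rank_vector are not shown in
# this snippet. The definitions below are assumptions inferred from how the
# mapper is invoked (one job per vector component, each reduced key appearing
# exactly once), not the project's actual code.
def page_rank_calculation(item):
    """Map job: compute one component of v' = b*M*v + taxation_v."""
    i, eigenvectors, matrix, taxation_v, b = item
    component = b * sum(matrix[i][j] * eigenvectors[j] for j in range(len(eigenvectors)))
    return [(i, component + taxation_v[i])]

def page_rank_vector(item):
    """Reduce job: each component index occurs once, so just sum its parts."""
    key, values = item
    return (key, sum(values))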
def main():
    comm = MPI.COMM_WORLD
    nr_nodes = comm.Get_size()
    rank = comm.Get_rank()

    map_phase = 10
    reduce_phase = 20
    confirmation = 30
    stop_phase = 100
    master = 0

    # master node
    if rank == master:
        # mapping phase
        file_name = 'application/output/adjacency_list.json'
        with open(file_name, 'r', encoding='utf-8') as infile:
            json_data = json.load(infile)

        keys = []
        data_queue = []
        for data in json_data:
            keys.append(data)
        for key in keys:
            for value in json_data[key]:
                data = {"k": key, "v": value}
                data_queue.append(data)

        # transmit data
        temp_data = transmit_data(data_queue, nr_nodes, comm, map_phase)
        while temp_data:
            temp_data = transmit_data(temp_data, nr_nodes, comm, map_phase)

        # reduction phase
        path = 'application/output/map/*.json'
        files = glob.glob(path)
        for file in files:
            with open(file, 'r', encoding='utf-8') as infile:
                json_data = json.load(infile)

            keys = []
            data_queue = []
            for data in json_data:
                keys.append(data)
            for key in keys:
                for value in json_data[key]:
                    data = {"k": key, "v": value}
                    data_queue.append(data)

            # transmit data
            temp_data = transmit_data(data_queue, nr_nodes, comm, reduce_phase)
            # while temp_data:
            #     temp_data = transmit_data(temp_data, nr_nodes, comm, reduce_phase)

        # stopping phase
        for dest_rank in range(1, nr_nodes):
            data = {"k": "", "v": ""}
            comm.isend(data, dest=dest_rank, tag=stop_phase)
        print("[" + str(rank) + "] - TERMINATED")

    # worker nodes
    else:
        is_terminated = False
        mr = MapReduce(rank)
        while not is_terminated:
            status = MPI.Status()
            data = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
            tag = status.Get_tag()
            if tag == map_phase:
                key = data["k"]
                value = data["v"]
                mr.map(key, value)
            elif tag == reduce_phase:
                key = data["k"]
                value = data["v"]
                mr.reduce(key, value)
            elif tag == stop_phase:
                print("[" + str(rank) + "] - TERMINATED")
                is_terminated = True
            else:
                print("[" + str(rank) + "] - INCORRECT TAG RECEIVED")
            comm.isend(data, dest=master, tag=confirmation)
        mr.store_values()
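# The per-rank MapReduce class used by the workers is not part of this
# snippet. A minimal sketch consistent with the protocol above (collect
# key/value pairs during the map phase, aggregate them during the reduce
# phase, then persist everything in store_values) might look like this; the
# file layout and the count-based aggregation are assumptions, not the
# project's actual implementation.
import json
import os
from collections import defaultdict

class MapReduce:
    def __init__(self, rank):
        self.rank = rank
        self.mapped = defaultdict(list)
        self.reduced = defaultdict(int)

    def map(self, key, value):
        # Collect every value received for this key during the map phase.
        self.mapped[key].append(value)

    def reduce(self, key, value):
        # Placeholder aggregation: count occurrences per key.
        self.reduced[key] += 1

    def store_values(self):
        # Persist both phases as JSON, one file per worker rank.
        os.makedirs('application/output/map', exist_ok=True)
        os.makedirs('application/output/reduce', exist_ok=True)
        with open('application/output/map/%d.json' % self.rank, 'w', encoding='utf-8') as f:
            json.dump(self.mapped, f)
        with open('application/output/reduce/%d.json' % self.rank, 'w', encoding='utf-8') as f:
            json.dump(dict(self.reduced), f)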
    return out

def custom_reduce(data):
    d = dict()
    for key, value in data:
        if key not in d:
            d[key] = value
        else:
            d[key] += value
    out = []
    for key in d:
        out.append((key, d[key]))
    return out

## Same calls
mr_1 = MapReduce(verbose=False)
mr_1.map_one(custom_map, values_1)
mr_1.map_one(custom_map, values_2)
mr_1.map_one(custom_map, values_3)
print(mr_1.reduce(custom_reduce))

mr_2 = MapReduce(verbose=False)
mr_2.map(custom_map, all_values, 3)
print(mr_2.reduce(custom_reduce))

mr_3 = MapReduce(verbose=True)
print(mr_3.map_reduce(custom_map, custom_reduce, all_values, 3))
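# The inputs used above (values_1, values_2, values_3 and all_values) are not
# shown in this fragment, and the body of custom_map is truncated. Hypothetical
# sample data, which would need to be defined before the calls above, could
# look like:
values_1 = ['a', 'b', 'a']
values_2 = ['b', 'c']
values_3 = ['a', 'c', 'c']
all_values = values_1 + values_2 + values_3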
def __init__(self, corpus, minFreq=1):
    MapReduce.__init__(self)
    self.corpus = corpus
    self.data = corpus.docs
    self.minFreq = minFreq
def __init__(self, corpus):
    MapReduce.__init__(self)
    self.corpus = corpus
    self.data = corpus.docs
    self.totalDocs = len(self.data)
def test_raises_when_non_callable_map_reduce(self):
    with self.assertRaises(ValueError):
        MapReduce().map_reduce([], 0, lambda x: x)
    with self.assertRaises(ValueError):
        MapReduce().map_reduce([], lambda x: x, 0)
def test_raises_when_values_not_list(self):
    with self.assertRaises(ValueError):
        MapReduce().map_reduce(0, lambda x: x, lambda x: x)
def __init__(self, corpus):
    MapReduce.__init__(self)
    self.corpus = corpus
    self.data = corpus.tweets
    self.totalTweets = len(self.data)
def test_raises_when_non_positive_num_workers(self):
    with self.assertRaises(ValueError):
        MapReduce(num_workers=0)
    with self.assertRaises(ValueError):
        MapReduce(num_workers=-1)
def probability_calculation(item):
    """Map job: sample NBR_PER_WORKER random points and return the number
    that fall inside the quarter circle, as a ('pi', count) pair.
    """
    print(multiprocessing.current_process().name, 'calculating', item)
    output = []
    in_circle = 0
    for i in range(int(NBR_PER_WORKER)):
        x = numpy.random.randint(0, RADIUQ)
        y = numpy.random.randint(0, RADIUQ)
        if numpy.sqrt(x**2 + y**2) < RADIUQ:
            in_circle += 1
    output.append(('pi', in_circle))
    return output

def estimate_pi(item):
    """Reduce job: combine the per-worker counts and estimate pi as
    4 * (points inside the quarter circle) / (total points).
    """
    key, counts = item
    return (sum(counts) / NBR_POINTS) * 4

if __name__ == '__main__':
    mapper = MapReduce(probability_calculation, estimate_pi)
    pi = mapper([i for i in range(NBR_WORKERS)])
    print(pi)
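# The module-level constants referenced above are not defined in this snippet.
# Example values (assumptions, and they would need to appear before the
# functions are used) might be:
NBR_WORKERS = 8
NBR_PER_WORKER = 1e6
NBR_POINTS = NBR_WORKERS * NBR_PER_WORKER
RADIUQ = 10000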
def print_results(counts):
    top = counts[:10]
    longest = max(len(word) for word, count in top)
    for word, count in top:
        print('{}: {}'.format(word, count))

if __name__ == '__main__':
    import operator
    import glob

    input_files = glob.glob('text/*.txt')

    mapper = MapReduce(extract_ngrams, count_ngrams)
    bigram_counts = mapper(input_files)
    bigram_counts.sort(key=operator.itemgetter(1))
    bigram_counts.reverse()

    if bigram_counts:
        print('Top 10 bigrams by frequency:')
        print_results(bigram_counts)

"""Results:
Top 10 bigrams by frequency:
the_world: 9
beauty_s: 7
thou_art: 6
that_thou: 5
if_thou: 5
to_be: 5
"""
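# extract_ngrams and count_ngrams are not shown in this snippet. A minimal
# sketch consistent with the sample output above (bigrams joined with '_',
# counted across files) could look like the following; the tokenization
# details are assumptions:
import string

def extract_ngrams(filename):
    """Map job: read one file and emit a (bigram, 1) pair per adjacent word pair."""
    output = []
    with open(filename, 'r', encoding='utf-8') as f:
        words = [w.strip(string.punctuation).lower() for w in f.read().split()]
    words = [w for w in words if w]
    for first, second in zip(words, words[1:]):
        output.append(('{}_{}'.format(first, second), 1))
    return output

def count_ngrams(item):
    """Reduce job: sum the occurrences of one bigram."""
    bigram, occurrences = item
    return (bigram, sum(occurrences))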
class MapReduceManager(object):
    def __init__(self, input_dir, output_dir, num_mappers, num_reducers, clean_splited_data=True):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.num_mappers = num_mappers
        self.num_reducers = num_reducers
        self.clean_splited_data = clean_splited_data
        self.clean = clean_splited_data
        self.map_reduce = MapReduce()
        self.file_handler = FileHandler(input_dir, output_dir)
        self.file_handler.split_file(self.num_mappers)

    def run_mapper(self, thread_id):
        key = None
        value = None
        with open(config.get_name_of_piece(thread_id), "r") as f:
            key = f.readline()
            value = f.read()
        if self.clean_splited_data:
            os.unlink(config.get_name_of_piece(thread_id))

        mapper_result = self.map_reduce.mapper(key, value)
        slice_index = 0
        stride = len(mapper_result) // self.num_reducers
        for reducer_index in range(self.num_reducers):
            with open(config.get_map_file_name(thread_id, reducer_index), "w+") as f:
                json.dump([
                    (key, value)
                    for (key, value) in mapper_result[slice_index:slice_index + stride]
                ], f)
            slice_index += stride

    def run_reducer(self, thread_id):
        key_value_dict = {}
        for mapper_index in range(self.num_mappers):
            curr_map_json = None
            with open(config.get_map_file_name(mapper_index, thread_id)) as f:
                curr_map_json = json.load(f)
            for (key, value) in curr_map_json:
                if key in key_value_dict:
                    key_value_dict[key].append(value)
                else:
                    key_value_dict[key] = [value]
            if self.clean_splited_data:
                os.unlink(config.get_map_file_name(mapper_index, thread_id))

        result = [
            self.map_reduce.reducer(key, key_value_dict[key])
            for key in key_value_dict
        ]
        with open(config.get_reduce_result_file_name(thread_id), 'w+') as f:
            json.dump(result, f)

    def run(self):
        mappers = list()
        reducers = list()

        for thread_id in range(self.num_mappers):
            new_mapper = multiprocessing.Process(target=self.run_mapper, args=(thread_id, ))
            new_mapper.start()
            mappers.append(new_mapper)
        list(map(lambda x: x.join(), mappers))

        for thread_id in range(self.num_reducers):
            new_reducer = multiprocessing.Process(target=self.run_reducer, args=(thread_id, ))
            new_reducer.start()
            reducers.append(new_reducer)
        list(map(lambda x: x.join(), reducers))
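# Hypothetical usage of MapReduceManager. The paths and worker counts below
# are example values, not taken from the original project; MapReduce,
# FileHandler and config must come from the surrounding codebase.
if __name__ == "__main__":
    manager = MapReduceManager(
        input_dir="data/in",
        output_dir="data/out",
        num_mappers=4,
        num_reducers=2,
    )
    manager.run()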
        word = word.lower()
        if word.isalpha() and word not in STOP_WORDS:
            output.append((word, 1))
    return output

def words_frequency(item):
    """Convert the partitioned data for a word to a tuple containing the word
    and the number of occurrences.
    """
    word, occurrences = item
    return (word, sum(occurrences))

if __name__ == '__main__':
    import operator
    import glob

    input_files = glob.glob('./*.txt')  # Linux notation for directories

    mapper = MapReduce(map_words, words_frequency)
    word_counts = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print('\nTOP 20 Items by frequency\n')
    top20 = word_counts[:20]
    longest = max(len(word) for word, count in top20)
    for word, count in top20:
        print('%-*s: %5s' % (longest + 1, word, count))
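# The beginning of map_words (and the STOP_WORDS set) is cut off above. A
# sketch of how the whole mapper might read, assuming it takes a filename and
# splits the file on whitespace (the tokenization and the stop-word list are
# assumptions, not the original code):
STOP_WORDS = {'a', 'an', 'and', 'are', 'as', 'be', 'by', 'for', 'if', 'in',
              'is', 'it', 'of', 'or', 'the', 'to', 'with'}

def map_words(filename):
    """Map job: read one file and emit a (word, 1) pair per kept word."""
    output = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.split():
                word = word.lower()
                if word.isalpha() and word not in STOP_WORDS:
                    output.append((word, 1))
    return output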
def __init__(self, corpus):
    MapReduce.__init__(self)
    self.corpus = corpus
    self.data = corpus.docs
# coding=utf-8
"""
Main module
"""
import os

from map_reduce import MapReduce
import utils as util_functions

if __name__ == '__main__':
    folder_path = input("Path to the directory: ")
    numar_procese = int(input("Number of processes: "))

    if not os.path.exists('{}\\out_files'.format(folder_path)):
        os.makedirs('{}\\out_files'.format(folder_path))
    if not os.path.exists('{}\\final_result'.format(folder_path)):
        os.makedirs('{}\\final_result'.format(folder_path))

    list_of_files = util_functions.read_directory_content(folder_path)
    map_reduce_object = MapReduce(util_functions.file_to_words, util_functions.count_words,
                                  folder_path, numar_procese)

    print('--------------------------------------------------------------- Stage One - The Mapping Stage')
    map_reduce_object.map_reduce(list_of_files)
    map_reduce_object.write_final_result()
def test_map_reduce_with_empty_values(self):
    map_reduce = MapReduce(num_workers=2)
    values = []
    self.assertEqual([], map_reduce.map_reduce(values, mapper, reducer))