Example no. 1
    def test_map_reduce_with_less_keys_than_workers(self):
        map_reduce = MapReduce(num_workers=4)
        values = ['key_1', 'key_2', 'key_1']

        expected_result = [('key_1', 2), ('key_2', 1)]

        self.assertCountEqual(expected_result,
                              map_reduce.map_reduce(values, mapper, reducer))
Example no. 2
    def test_no_interaction_between_map_reduce_runs(self):
        map_reduce = MapReduce(num_workers=4)

        values = ['val_1', 'val_2', 'val_1']
        expected_result = [('val_1', 2), ('val_2', 1)]

        self.assertCountEqual(expected_result,
                              map_reduce.map_reduce(values, mapper, reducer))

        values = ['key_1', 'key_2', 'key_3', 'key_1']
        expected_result = [('key_1', 2), ('key_2', 1), ('key_3', 1)]

        self.assertCountEqual(expected_result,
                              map_reduce.map_reduce(values, mapper, reducer))
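
The two tests above reference module-level mapper and reducer helpers that the snippets do not include. A plausible word-count pair, consistent with the expected results (names and signatures are assumptions, not the original helpers):

# Assumed helpers (not shown in the original test module): the mapper emits a
# (value, 1) pair per input value; the reducer sums the counts for one key.
def mapper(value):
    return [(value, 1)]


def reducer(key, values):
    return (key, sum(values))

With values = ['key_1', 'key_2', 'key_1'], this pair yields [('key_1', 2), ('key_2', 1)], matching the expected result.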
Example no. 3
    def __init__(self,
                 input_dir,
                 output_dir,
                 num_mappers,
                 num_reducers,
                 clean_splited_data=True):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.num_mappers = num_mappers
        self.num_reducers = num_reducers
        self.clean_splited_data = clean_splited_data
        self.clean = clean_splited_data
        self.map_reduce = MapReduce()
        self.file_handler = FileHandler(input_dir, output_dir)
        self.file_handler.split_file(self.num_mappers)
Example no. 4
    def process_multi_cores(self, filenames):
        """
        Process the file in parallel with multiple processes.

        This is slightly different from the approach described in the
        LogMine paper. Each "map job" is a chunk of multiple lines (instead of
        a single line), which makes better use of multiprocessing.

        Note that this method may return a different result on each run, and
        may differ from the single-core version "process_single_core". This is
        expected, as the result depends on the processing order, which is
        not guaranteed when tasks are performed in parallel.
        """
        segments = self.segmentator.create_segments(filenames)

        # Cluster all chunks in parallel
        mapper = MapReduce(map_segments_to_clusters,
                           reduce_clusters,
                           params=self.cluster_config)
        result = mapper(segments)

        if len(result) == 0:
            return []

        (key, clusters) = result[0]
        return clusters
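
The docstring's point about chunked map jobs can be illustrated with a minimal sketch; chunk_lines below is hypothetical and not part of the snippet's Segmentator API:

# A minimal sketch of the chunking idea from the docstring: group lines into
# fixed-size chunks so each map task carries enough work to amortize process
# overhead (chunk_lines is a hypothetical helper).
def chunk_lines(lines, chunk_size=1000):
    """Yield successive chunks of chunk_size lines."""
    for start in range(0, len(lines), chunk_size):
        yield lines[start:start + chunk_size]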
Example no. 5
def top_similars_mapreduce(items, x, n=5, similarity=pearson_sim):
    """
    Returns the best matches for x from the items.
    Number of results and similarity function are optional params.
    """
    mapper = MapReduce(top_similars_map, top_similars_reduce)
    scores = mapper([(items, x, i, similarity) for i in range(4)])
    # Sort the list so the highest scores appear at the top
    scores.sort()
    scores.reverse()
    return scores[:n]
Example no. 6
def get_recommendations_user_filtred_mapreduce(items, x, n=5, similarity=pearson_sim):
    """
    Returns recommendationx for x from the items, based on items from similar users     
    """
    mapper = MapReduce(get_recommendations_user_filtred_map, get_recommendations_user_filtred_reduce)
    scores = mapper([(items, x, i, similarity) for i in range(4)])
    # Divide each total score by total weighting to get an average
    rankings = [(sim_x_score / sim, item) for (item, sim, sim_x_score) in scores]
    rankings.sort()
    rankings.reverse()
    return rankings[:n]
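
Neither snippet shows its map side. Given that both build four tasks of the form (items, x, i, similarity), one plausible shape is a map function that scores the i-th shard of candidates; everything below, including the similarity signature, is an assumption:

# Hypothetical map side for the two recommenders above: each of the four
# tasks scores one shard of the candidate set (the function name, the
# sharding scheme and the similarity signature are all assumptions).
def top_similars_map(task):
    items, x, shard, similarity = task
    candidates = [other for j, other in enumerate(items)
                  if other != x and j % 4 == shard]
    return [(similarity(items, x, other), other) for other in candidates]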
Example no. 7
def page_rank(matrix,
              taxation=False,
              b=1,
              Es=[],
              S=set(),
              nbr_iterations=10000000,
              verbose=False):
    """
        calculate the page rank for each element based on the matrix in input
        we should validate if the matrix is stochastic
        if not we use the taxation method to ovoid dead ends (introducing the random surfers)
            v' = Mv + (1-b)e/n
            v : eigenvector
            The term (1-b)e/n is a vector each of whose components has value (1-b)/n and
            represents the introduction, with probability 1 - b, of a new random surfer at
            a random page.
        The mathematical formulation for the iteration that yields topic-sensitive
        PageRank is similar to the equation we used for general PageRank. The only
        difference is how we add the new surfers. Suppose S is a set of integers consisting
        of the row/column numbers for the pages we have identified as belonging to a
        certain topic (called the teleport set). Let eS be a vector that has 1 in the
        components in S and 0 in other components. Then the topic-sensitive Page-
        Rank for S is the limit of the iteration
            v' = bMv + (1 - b)eS/|S|
        Here, as usual, M is the transition matrix of the Web, and |S| is the size of set
        S.
    """
    elements_length = len(matrix[0])
    eigenvectors = [1 / elements_length] * elements_length
    if Es and taxation:
        taxation_v = [((1 - b) / len(S) * e) for e in Es]
    else:
        taxation_v = [
            (1 - b) / elements_length
        ] * elements_length if taxation else [0] * elements_length

    eigenvectors_p = [0] * elements_length
    itr = 0
    # initializing map reduce
    mapper = MapReduce(page_rank_calculation, page_rank_vector)
    while eigenvectors_p != eigenvectors and itr < nbr_iterations:
        if eigenvectors_p != [0] * elements_length:
            eigenvectors = list(eigenvectors_p)
        for k, v in mapper([(i, eigenvectors, matrix, taxation_v, b)
                            for i in range(elements_length)]):
            eigenvectors_p[k] = v
        itr += 1
    if verbose:
        print(eigenvectors)
    return eigenvectors
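
For comparison, the taxation iteration v' = bMv + (1 - b)e/n from the docstring can be written as a small single-process loop. This is an illustrative sketch with a fixed iteration count, not the MapReduce version above:

# Minimal single-process sketch of the taxation iteration v' = bMv + (1-b)e/n,
# using a fixed iteration count instead of the convergence test above.
def page_rank_dense(M, b=0.85, iterations=100):
    n = len(M[0])
    v = [1.0 / n] * n
    tax = (1.0 - b) / n
    for _ in range(iterations):
        # v'[i] = b * (row i of M dotted with v) + (1 - b) / n
        v = [b * sum(M[i][j] * v[j] for j in range(n)) + tax
             for i in range(n)]
    return v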
Example no. 8
import glob
import json

from mpi4py import MPI

def main():
    comm = MPI.COMM_WORLD
    nr_nodes = comm.Get_size()
    rank = comm.Get_rank()
    map_phase = 10
    reduce_phase = 20
    confirmation = 30
    stop_phase = 100
    master = 0

    # master node
    if rank == master:

        # mapping phase
        file_name = 'application/output/adjacency_list.json'
        with open(file_name, 'r', encoding='utf-8') as infile:
            json_data = json.load(infile)

        keys = []
        data_queue = []

        for data in json_data:
            keys.append(data)

        for key in keys:
            for value in json_data[key]:
                data = {"k": key, "v": value}
                data_queue.append(data)

        # transmit data
        temp_data = transmit_data(data_queue, nr_nodes, comm, map_phase)
        while temp_data:
            temp_data = transmit_data(temp_data, nr_nodes, comm, map_phase)

        # reduction phase
        path = 'application/output/map/*.json'
        files = glob.glob(path)
        for file in files:
            with open(file, 'r', encoding='utf-8') as infile:
                json_data = json.load(infile)

            keys = []
            data_queue = []

            for data in json_data:
                keys.append(data)

            for key in keys:
                for value in json_data[key]:
                    data = {"k": key, "v": value}
                    data_queue.append(data)

            # transmit data
            temp_data = transmit_data(data_queue, nr_nodes, comm, reduce_phase)
            # while temp_data:
            #    temp_data = transmit_data(temp_data, nr_nodes, comm, reduce_phase)

        # stopping phase
        for dest_rank in range(1, nr_nodes):
            data = {"k": "", "v": ""}
            comm.isend(data, dest=dest_rank, tag=stop_phase)

        print("[" + str(rank) + "] - TERMINATED ")

    # worker nodes
    else:
        is_terminated = False
        mr = MapReduce(rank)

        while not is_terminated:
            status = MPI.Status()
            data = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
            tag = status.Get_tag()
            if tag == map_phase:
                key = data["k"]
                value = data["v"]
                mr.map(key, value)
            elif tag == reduce_phase:
                key = data["k"]
                value = data["v"]
                mr.reduce(key, value)
            elif tag == stop_phase:
                print("[" + str(rank) + "] - TERMINATED")
                is_terminated = True
            else:
                print("[" + str(rank) + "] - INCORRECT TAG RECEIVED")

            comm.isend(data, dest=master, tag=confirmation)

        mr.store_values()
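
transmit_data is not shown in this snippet. A hypothetical sketch consistent with how it is called: hand at most one item to each worker rank and return the remainder for the next round. The real implementation presumably also consumes the confirmation messages the workers send back.

# Hypothetical sketch of transmit_data as used above (the name with the
# _sketch suffix marks it as an assumption, not the original function).
def transmit_data_sketch(data_queue, nr_nodes, comm, tag):
    for dest_rank in range(1, nr_nodes):
        if not data_queue:
            break
        comm.isend(data_queue.pop(0), dest=dest_rank, tag=tag)
    return data_queue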
Example no. 9


def custom_reduce(data):
    d = dict()
    for key, value in data:
        if key not in d:
            d[key] = value
        else:
            d[key] += value

    out = []
    for key in d:
        out.append((key, d[key]))
    return out


## Three call styles that produce the same result
mr_1 = MapReduce(verbose=False)
mr_1.map_one(custom_map, values_1)
mr_1.map_one(custom_map, values_2)
mr_1.map_one(custom_map, values_3)
print(mr_1.reduce(custom_reduce))

mr_2 = MapReduce(verbose=False)
mr_2.map(custom_map, all_values, 3)
print(mr_2.reduce(custom_reduce))

mr_3 = MapReduce(verbose=True)
print(mr_3.map_reduce(custom_map, custom_reduce, all_values, 3))
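
custom_map is not shown above; a hypothetical counterpart to custom_reduce, chosen so that the reducer's summing produces per-value counts:

# Hypothetical definition of custom_map (not in the original snippet): emit a
# (value, 1) pair for each input value so custom_reduce can sum the counts.
def custom_map(values):
    return [(value, 1) for value in values]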
Example no. 10
    def __init__(self, corpus, minFreq=1):
        MapReduce.__init__(self)
        self.corpus = corpus
        self.data = corpus.docs
        self.minFreq = minFreq
Example no. 11
    def __init__(self, corpus):
        MapReduce.__init__(self)
        self.corpus = corpus
        self.data = corpus.docs
        self.totalDocs = len(self.data)
Example no. 12
 def test_raises_when_non_callable_map_reduce(self):
     with self.assertRaises(ValueError):
         MapReduce().map_reduce([], 0, lambda x: x)
     with self.assertRaises(ValueError):
         MapReduce().map_reduce([], lambda x: x, 0)
Example no. 13
 def test_raises_when_values_not_list(self):
     with self.assertRaises(ValueError):
         MapReduce().map_reduce(0, lambda x: x, lambda x: x)
Example no. 14
    def __init__(self, corpus):
        MapReduce.__init__(self)
        self.corpus = corpus
        self.data = corpus.tweets
        self.totalTweets = len(self.data)
Example no. 15
    def test_raises_when_non_positive_num_workers(self):
        with self.assertRaises(ValueError):
            MapReduce(num_workers=0)

        with self.assertRaises(ValueError):
            MapReduce(num_workers=-1)
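
Taken together, Examples 12, 13 and 15 pin down the argument validation. A sketch of what it could look like, an assumed shape rather than the library's actual implementation:

# Sketch of the validation exercised by Examples 12, 13 and 15 (assumed
# shape, not the library's actual code).
class MapReduce(object):
    def __init__(self, num_workers=1):
        if num_workers <= 0:
            raise ValueError('num_workers must be positive')
        self.num_workers = num_workers

    def map_reduce(self, values, mapper, reducer):
        if not isinstance(values, list):
            raise ValueError('values must be a list')
        if not callable(mapper) or not callable(reducer):
            raise ValueError('mapper and reducer must be callable')
        # the actual map/reduce work would follow here
        ...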
Example no. 16
import multiprocessing

import numpy

# NBR_PER_WORKER, RADIUQ, NBR_POINTS and NBR_WORKERS are module-level
# constants defined elsewhere in the original script.

def probability_calculation(item):
    """Read a file and return a sequence of (word, occurances) values.
    """

    print(multiprocessing.current_process().name, 'calculating', item)
    output = []
    IN_CIRCLE = 0
    for i in range(int(NBR_PER_WORKER)):
        x = numpy.random.randint(0, RADIUQ)
        y = numpy.random.randint(0, RADIUQ)
        if (numpy.sqrt(x**2 + y**2) < RADIUQ):
            IN_CIRCLE += 1
    output.append(('pi', IN_CIRCLE))
    return output


def estimate_pi(item):
    """Convert the partitioned data for a word to a
    tuple containing the word and the number of occurances.
    """
    key, occurances = item
    return (sum(occurances) / NBR_POINTS) * 4


if __name__ == '__main__':
    mapper = MapReduce(probability_calculation, estimate_pi)
    pi = mapper(list(range(NBR_WORKERS)))
    print(pi)
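
The estimator works because a point drawn uniformly from the square falls inside the quarter circle with probability pi/4, so pi is approximately 4 * IN_CIRCLE / NBR_POINTS. A self-contained single-process sketch of the same idea:

# Single-process check of the same Monte Carlo estimator: the fraction of
# uniform points inside the quarter circle approximates pi/4.
import random

def estimate_pi_simple(n=1000000):
    inside = sum(1 for _ in range(n)
                 if random.random() ** 2 + random.random() ** 2 < 1.0)
    return 4.0 * inside / n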
Example no. 17

def print_results(counts):
    top = counts[:10]
    longest = max(len(word) for word, count in top)
    for word, count in top:
        # pad each word to the longest width so the counts line up
        print('{:<{}}: {}'.format(word, longest + 1, count))


if __name__ == '__main__':
    import operator
    import glob

    input_files = glob.glob('text/*.txt')

    mapper = MapReduce(extract_ngrams, count_ngrams)
    bigram_counts = mapper(input_files)
    bigram_counts.sort(key=operator.itemgetter(1))
    bigram_counts.reverse()

    if bigram_counts:
        print('Top 10 bigrams by frequency:')
        print_results(bigram_counts)
    """Results:
    Top 10 bigrams by frequency:
    the_world: 9
    beauty_s: 7
    thou_art: 6
    that_thou: 5
    if_thou: 5
    to_be: 5
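
extract_ngrams itself is not shown. Given the word1_word2 output format above, a minimal sketch of such a mapper (hypothetical, lowercased alphabetic tokens only):

# Hypothetical sketch of the extract_ngrams mapper used above: tokenize a
# file and emit (bigram, 1) pairs joined by an underscore, matching the
# "the_world: 9" output format.
import re

def extract_ngrams_sketch(filename):
    with open(filename) as f:
        words = re.findall(r"[a-z']+", f.read().lower())
    return [('{}_{}'.format(a, b), 1) for a, b in zip(words, words[1:])]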
Example no. 18
class MapReduceManager(object):
    def __init__(self,
                 input_dir,
                 output_dir,
                 num_mappers,
                 num_reducers,
                 clean_splited_data=True):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.num_mappers = num_mappers
        self.num_reducers = num_reducers
        self.clean_splited_data = clean_splited_data
        self.clean = clean_splited_data
        self.map_reduce = MapReduce()
        self.file_handler = FileHandler(input_dir, output_dir)
        self.file_handler.split_file(self.num_mappers)

    def run_mapper(self, thread_id):
        key = None
        value = None
        with open(config.get_name_of_piece(thread_id), "r") as f:
            key = f.readline()
            value = f.read()
        if (self.clean_splited_data):
            os.unlink(config.get_name_of_piece(thread_id))
        mapper_result = self.map_reduce.mapper(key, value)
        slice_index = 0
        stride = len(mapper_result) // self.num_reducers
        for reducer_index in range(self.num_reducers):
            with open(config.get_map_file_name(thread_id, reducer_index),
                      "w+") as f:
                json.dump([
                    (key, value)
                    for (key, value) in mapper_result[slice_index:slice_index +
                                                      stride]
                ], f)
                slice_index += stride

    def run_reducer(self, thread_id):
        key_value_dict = {}
        for mapper_index in range(self.num_mappers):
            curr_map_json = None
            with open(config.get_map_file_name(mapper_index, thread_id)) as f:
                curr_map_json = json.load(f)
            for (key, value) in curr_map_json:
                if key in key_value_dict:
                    key_value_dict[key].append(value)
                else:
                    key_value_dict[key] = [value]
            if self.clean_splited_data:
                os.unlink(config.get_map_file_name(mapper_index, thread_id))
        result = [
            self.map_reduce.reducer(key, key_value_dict[key])
            for key in key_value_dict
        ]
        with open(config.get_reduce_result_file_name(thread_id), 'w+') as f:
            json.dump(result, f)

    def run(self):
        mappers = list()
        reducers = list()
        for thread_id in range(self.num_mappers):
            new_mapper = multiprocessing.Process(target=self.run_mapper,
                                                 args=(thread_id, ))
            new_mapper.start()
            mappers.append(new_mapper)
        list(map(lambda x: x.join(), mappers))
        for thread_id in range(self.num_reducers):
            new_reducer = multiprocessing.Process(target=self.run_reducer,
                                                  args=(thread_id, ))
            new_reducer.start()
            reducers.append(new_reducer)
        list(map(lambda x: x.join(), reducers))
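
A hypothetical usage of MapReduceManager, assuming a MapReduce class whose mapper and reducer implement something like word count; the directory names are placeholders:

# Hypothetical usage of MapReduceManager (directory names are placeholders;
# the MapReduce mapper/reducer are assumed to be defined elsewhere).
manager = MapReduceManager(input_dir='data/in',
                           output_dir='data/out',
                           num_mappers=4,
                           num_reducers=2)
manager.run()  # split input, run mapper processes, then reducer processes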
Example no. 19
                word = word.lower()
                if word.isalpha() and word not in STOP_WORDS:
                    output.append((word, 1))
    return output


def words_frequency(item):
    """Convert the partitioned data for a word to a
    tuple containing the word and the number of occurances.
    """
    word, occurances = item
    return (word, sum(occurances))


if __name__ == '__main__':
    import operator
    import glob

    input_files = glob.glob('./*.txt')  # all .txt files in the current directory

    mapper = MapReduce(map_words, words_frequency)
    word_counts = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print('\nTOP 20 Items by frequency\n')
    top20 = word_counts[:20]
    longest = max(len(word) for word, count in top20)
    for word, count in top20:
        print('%-*s: %5s' % (longest + 1, word, count))
Example no. 20
    def __init__(self, corpus):
        MapReduce.__init__(self)
        self.corpus = corpus
        self.data = corpus.docs
Example no. 21
# coding=utf-8
"""
    Main module
"""

import os
from map_reduce import MapReduce
import utils as util_functions

if __name__ == '__main__':
    folder_path = input("Path to the directory: ")
    numar_procese = int(input("Number of processes: "))

    if not os.path.exists('{}\\out_files'.format(folder_path)):
        os.makedirs('{}\\out_files'.format(folder_path))

    if not os.path.exists('{}\\final_result'.format(folder_path)):
        os.makedirs('{}\\final_result'.format(folder_path))

    list_of_files = util_functions.read_directory_content(folder_path)

    map_reduce_object = MapReduce(util_functions.file_to_words, util_functions.count_words, folder_path, numar_procese)
    print('--------------------------------------------------------------- First Stage - Mapping Stage')
    map_reduce_object.map_reduce(list_of_files)
    map_reduce_object.write_final_result()
Example no. 22
    def test_map_reduce_with_empty_values(self):
        map_reduce = MapReduce(num_workers=2)
        values = []

        self.assertEqual([], map_reduce.map_reduce(values, mapper, reducer))