Esempio n. 1
0
 def __init__(self, config, all_terms_dict):
     """Initialise the indexer: term dictionary, MapReduce jobs and buffers.

     Args:
         config: Configuration object; stored unchanged on ``self.config``.
         all_terms_dict: Term dictionary; stored by reference (not copied)
             as ``self.inverted_idx``.
     """
     self.inverted_idx = all_terms_dict
     self.config = config
     self.fileName = 'InvertedIndex'
     # Posting layout noted by the original author:
     # {term: [ordered list of appearances as (file_id, lineNumber)]}
     self.thread_pool_size = 1

     # One fifth of the RAM available right now, divided between the pool
     # threads plus one.
     ram_share = psutil.virtual_memory().available // 5
     ram_share = ram_share // (self.thread_pool_size + 1)
     # Shallow size of a two-element (int, str) tuple; getsizeof does not
     # follow references into the contained objects.
     entry_size = sys.getsizeof((int(), str()))
     # NOTE(review): floor-dividing by the float 8/10 yields a float that is
     # roughly 25% larger than the entry count. If the intent was to keep
     # 80% of it, this needs changing -- confirm with the author.
     self.avg_length = (ram_share // entry_size) // (8 / 10)

     base_dir = 'MapReduceData/'

     def new_job(subdir):
         # Every job shares the same length budget and pool size; only the
         # working sub-directory differs.
         return MapReduce(self.avg_length, self.thread_pool_size,
                          base_dir + subdir)

     self.map_reduce_ag = new_job('AG/')
     self.map_reduce_hq = new_job('HQ/')
     self.map_reduce_rz = new_job('Rz/')
     self.map_reduce_other = new_job('Others/')
     self.map_reduce_doc = new_job('Document/')

     # Temporary posting dictionaries, one general plus one per bucket.
     self.tmp_pos = {}
     self.tmp_pos_ag = {}
     self.tmp_pos_hq = {}
     self.tmp_pos_rz = {}
     self.tmp_pos_other = {}
     self.tmp_pos_doc = {}

     # One-element lists initialised to 0 (presumably mutable counters that
     # are shared by reference -- confirm against the code updating them).
     self.num_in_pos_ag_tmp = [0]
     self.num_in_pos_hq_tmp = [0]
     self.num_in_pos_rz_tmp = [0]
     self.num_in_pos_other_tmp = [0]
     # NOTE(review): this name breaks the ``num_in_pos_<bucket>_tmp``
     # pattern; kept because other code may already read it.
     self.num_in_pos_doc_other = [0]

     self.Entitys = {}
     self.NUMBER_OF_PROCESSES = 5
     self.set_is_writting = {}
def main():
    """Count word frequencies over the files in ``Data`` and print a report.

    Lists the ``Data`` directory, runs the resulting names through a
    ``MapReduce(file_to_words, count_words)`` job, prints the 20 most
    frequent words, and finally prints the three phase timings returned by
    the job together with the total wall-clock time.
    """
    print("Reading files...")
    # NOTE(review): os.listdir returns bare names without the 'Data' prefix;
    # presumably file_to_words resolves them -- confirm.
    input_files = os.listdir('Data')

    start_time = time()
    mapper = MapReduce(file_to_words, count_words)
    word_counts, mapping_time, reformatting_time, reducing_time = mapper(
        input_files)

    # Sort ascending and then reverse, instead of sort(reverse=True): the
    # two differ in how ties are ordered, and this keeps the established
    # ordering of words that share the same count.
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print("\nTOP 20 WORDS BY FREQUENCY\n")
    top20 = word_counts[:20]
    # max() raises ValueError on an empty sequence, so fall back to a zero
    # width when no words were counted (e.g. 'Data' contains no files).
    longest = max(len(word) for word, count in top20) if top20 else 0

    for word, count in top20:
        print('%-*s: %5s' % (longest + 1, word, count))

    # Taken after the report is printed, so the total includes printing.
    end_time = time()

    print("\nMapping time = {} s".format(mapping_time))
    print("Reformatting time = {} s".format(reformatting_time))
    print("Reducing time = {} s".format(reducing_time))
    print("Total running time = {} s".format(end_time - start_time))
Esempio n. 3
0
import sys

from MapReduce import MapReduce

# Fan-out sizes used by mapper(): records tagged 'a' are replicated across
# `width` column indices, every other record across `height` row indices.
width = 5
height = 5
# Shared MapReduce instance that mapper() and reducer() emit through.
mr = MapReduce()


def mapper(record):
    """Replicate one matrix record to every output cell it contributes to.

    ``record`` is unpacked as ``(name, row, col, value)``.  A record named
    'a' is emitted under the keys ``(row, n)`` for ``n`` in
    ``range(width)``; any other record is emitted under ``(n, col)`` for
    ``n`` in ``range(height)``.  The whole record is the emitted value.
    """
    matrix, i, j, _ = record
    if matrix == 'a':
        cells = ((i, k) for k in range(width))
    else:
        cells = ((k, j) for k in range(height))

    for cell in cells:
        mr.emit_intermediate(cell, record)


def reducer(key, values):
    """Combine the records gathered for one key into a single total.

    Splits ``values`` into the records tagged 'a' and those tagged 'b'.
    For every (a, b) pair in which b's second field equals a's third field,
    the product of their last fields is added to the total.  Emits
    ``(key[0], key[1], total)``.

    ``values`` is scanned twice, so it has to be re-iterable (e.g. a list);
    a one-shot iterator would leave the 'b' list empty.
    """
    def tagged(tag):
        # Records whose first field matches the given matrix tag.
        return [rec for rec in values if rec[0] == tag]

    lhs_records = tagged('a')
    rhs_records = tagged('b')

    acc = 0
    # Explicit loops keep the additions in their original order.
    for lhs in lhs_records:
        for rhs in rhs_records:
            if rhs[1] == lhs[2]:
                acc += rhs[-1] * lhs[-1]

    mr.emit((key[0], key[1], acc))


def main():
Esempio n. 4
0
from MapReduce import MapReduce
import itertools
import sys

# Shared MapReduce instance used by mapper(), reducer() and the script
# entry point at the bottom of this file.
map_reduce_obj = MapReduce()


def mapper(record):
    """Emit the pair held in ``record`` in both directions.

    ``record[0]`` is emitted as key with ``record[1]`` as value, followed
    by the reverse (``record[1]`` as key, ``record[0]`` as value).
    """
    left, right = record[0], record[1]
    map_reduce_obj.emit_intermediate(left, right)
    map_reduce_obj.emit_intermediate(right, left)


def reducer(key, list_of_values):
    """Emit ``[smaller, larger, key]`` for every pair of values under ``key``.

    Each 2-combination of ``list_of_values`` is sorted, ``key`` is appended
    as the third element, and the resulting list is emitted.
    """
    for pair in itertools.combinations(list_of_values, 2):
        triple = sorted(pair)
        triple.append(key)
        map_reduce_obj.emit(triple)


if __name__ == '__main__':
    # Script entry point: the first command-line argument is the path of
    # the input file.  The `with` block guarantees the file handle is
    # closed once the job finishes, even if execute() raises.
    with open(sys.argv[1]) as input_data:
        map_reduce_obj.execute(input_data, mapper, reducer)