Example 1
 def __init__(self, config, all_terms_dict):
     self.inverted_idx = all_terms_dict
     #self.postingDict = {}
     self.fileName = 'InvertedIndex'
     self.config = config
     # {term: [ordered list where appear : (file_id , lineNumber)]}
     self.thread_pool_size = 1
     avg_ram = (psutil.virtual_memory().available // 5) // (self.thread_pool_size + 1)
     path = 'MapReduceData/'
     self.avg_length = (avg_ram // sys.getsizeof((int(), str()))) // (8 / 10)
     # self.map_reduce = MapReduce(self.avg_length,self.thread_pool_size)
     self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size, path + 'AG/')
     self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size, path + 'HQ/')
     self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size, path + 'Rz/')
     self.map_reduce_other = MapReduce(self.avg_length, self.thread_pool_size, path + 'Others/')
     self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size, path + 'Document/')
     self.tmp_pos = {}
     # self.num_in_pos_tmp = 0
     self.num_in_pos_ag_tmp = [0]
     self.num_in_pos_hq_tmp = [0]
     self.num_in_pos_rz_tmp = [0]
     self.num_in_pos_other_tmp = [0]
     self.num_in_pos_doc_other = [0]
     self.Entitys = {}
     self.tmp_pos_ag = {}
     self.tmp_pos_hq = {}
     self.tmp_pos_rz = {}
     self.tmp_pos_other = {}
     self.tmp_pos_doc = {}
     # self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
     self.NUMBER_OF_PROCESSES = 5
     self.set_is_writting = {}
Example 2
    def page_rank(self):
        """
        Compute the PageRank values. Each iteration needs two MapReduce calls: one to
        compute the sum of the PR values of all dangling pages, and one to compute the
        new PR value of every page.
        :return: self.graph, with the PR values already computed
        """
        iteration = 1  # iteration counter
        change = 1  # total PR change after each round; start at 1 so at least one iteration runs
        while change > self.min_delta:
            print("Iteration: " + str(iteration))

            # dangling_list is needed because dangling pages (pages with no outgoing links) may exist
            # dangling_list holds [the sum of the PR values of all dangling pages]
            # dp is that sum
            dangling_list = MapReduce.map_reduce(self.graph, self.ip_mapper, self.ip_reducer)
            if dangling_list:
                dp = dangling_list[0]
            else:
                dp = 0

            # The reducer expected by MapReduce.map_reduce takes only two arguments, but we
            # need to pass three (the extra one, dp, is the sum of the dangling pages' PR
            # values), so the lambda below binds dp.
            # new_pr is a list whose elements are (page name, computed PR value)
            new_pr = MapReduce.map_reduce(self.graph, self.pr_mapper, lambda x, y: self.pr_reducer_inter(x, y, dp))

            # Compute how much the PR values changed in this round
            change = sum([abs(new_pr[i][1] - self.graph[new_pr[i][0]][0]) for i in range(self.num_of_pages)])
            print("Change: " + str(change))

            # Update the PR values
            for i in range(self.num_of_pages):
                self.graph[new_pr[i][0]][0] = new_pr[i][1]
            iteration += 1
        return self.graph
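
To make the two MapReduce passes above concrete, here is a minimal plain-Python sketch of what a single iteration computes, assuming graph maps each page name to [pr_value, [outgoing_links]] and a damping factor d; the graph layout, the damping factor, and the helper name are illustrative assumptions, not part of the original class.

def pagerank_iteration(graph, d=0.85):
    # Hedged sketch: assumes graph[page] == [pr_value, [outgoing_links]]
    n = len(graph)
    # Pass 1 (ip_mapper / ip_reducer): sum the PR mass sitting on dangling pages
    dp = sum(pr for pr, links in graph.values() if not links)
    # Pass 2 (pr_mapper / pr_reducer_inter): recompute every page's PR,
    # spreading the dangling mass dp evenly over all pages
    new_pr = {}
    for page in graph:
        incoming = sum(pr / len(links)
                       for pr, links in graph.values() if page in links)
        new_pr[page] = d * (incoming + dp / n) + (1 - d) / n
    return new_pr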
Example 3
def search_and_rank_query(query, inverted_index,num_docs_to_retrieve):
    p = Parse()
    dictFromQuery = {}
    p.tokenSplit(query, dictFromQuery)
    query_as_list = [*dictFromQuery]
    searcher = Searcher(inverted_index)
    #posting = utils.load_obj("posting")
    print('-------------------------------------')
    print('Start import mapReduce')
    map_reduce = MapReduce.import_map_reduce('MapReduceData/')
    print('Done importing mapReduce')
    posting = {}
    print('-------------------------------------')
    print('Start build posting file')
    for term in query_as_list:
        posting[term] = map_reduce.read_from(term)
    print('Done building posting file')
    print('-------------------------------------')
    print('Get relevant Doc')
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list,posting)
    print('Done getting relevant Doc')
    print('-------------------------------------')
    print('Start ranking docs')
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs,dictFromQuery,posting,num_docs_to_retrieve)
    print('Done ranking docs')
    return searcher.ranker.retrieve_top_k(ranked_docs,num_docs_to_retrieve)
Example 4
 def create_c_of_doc(top_relevant_docs, dictFromQuery, posting):
     # load map reduce from file
     # relevant doc : # {num : [score, doc_tuple, {index}]}
     # c[term,term2] = sum[k](term1 in doc k * term2 in doc k)
     #  = > {}
     queryAsList = [*dictFromQuery]
     map_reduce = MapReduce.import_map_reduce('MapReduceData/')
     c_matrix = {}  # {term: {'other term' : value}}
     for doc_id in top_relevant_docs.keys():
         if doc_id != 'META-DATA':
             info_list = map_reduce.read_from(('Document', doc_id))
             doc_term_freq_dict = info_list
             max_freq = info_list[1]
             if len(doc_term_freq_dict) == 0:
                 continue
             doc_term_freq_dict = doc_term_freq_dict[0]
             for term_doc1, term_doc_freq1 in doc_term_freq_dict.items():
                 #for queryIndex in top_relevant_docs[doc_id][2]:
                 if term_doc1 not in c_matrix.keys():
                     c_matrix[term_doc1] = {}
                 for term_doc2, term_doc_freq2 in doc_term_freq_dict.items():
                     if term_doc1 in dictFromQuery.keys() or term_doc1 == term_doc2:
                         if term_doc2 not in c_matrix[term_doc1]:
                             c_matrix[term_doc1][term_doc2] = 0
                         # Cii, Cjj, Cij
                         c_matrix[term_doc1][term_doc2] += term_doc_freq1 * term_doc_freq2
     return c_matrix
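Example 5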
def main():

    print("Reading files...")
    input_files = os.listdir('Data')

    START_TIME = time()
    mapper = MapReduce(file_to_words, count_words)
    word_counts, MAPPING_TIME, REFORMATING_TIME, REDUCING_TIME = mapper(
        input_files)

    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print("\nTOP 20 WORDS BY FREQUENCY\n")
    top20 = word_counts[:20]
    longest = max(len(word) for word, count in top20)

    for word, count in top20:
        print('%-*s: %5s' % (longest + 1, word, count))

    END_TIME = time()

    print("\nMapping time = {} s".format(MAPPING_TIME))
    print("Reformatting time = {} s".format(REFORMATING_TIME))
    print("Reducing time = {} s".format(REDUCING_TIME))
    print("Total running time = {} s".format(END_TIME - START_TIME))
Example 6
def search_and_rank_query(query, inverted_index, num_docs_to_retrieve):
    p = Parse()
    dictFromQuery = {}
    map_reduce_ag = MapReduce.import_map_reduce('MapReduceData/AG/')
    map_reduce_hq = MapReduce.import_map_reduce('MapReduceData/HQ/')
    map_reduce_rz = MapReduce.import_map_reduce('MapReduceData/Rz/')
    map_reduce_other = MapReduce.import_map_reduce('MapReduceData/Others/')
    map_reduce_doc = MapReduce.import_map_reduce('MapReduceData/Document/')
    p.tokenSplit(query, dictFromQuery)
    query_as_list = [*dictFromQuery]
    searcher = Searcher(inverted_index)
    #posting = utils.load_obj("posting")
    print('-------------------------------------')
    print('Start import mapReduce')
    # map_reduce = MapReduce.import_map_reduce('MapReduceData/')

    print('Done importing mapReduce')
    posting = {}
    print('-------------------------------------')
    print('Start build posting file')
    query_as_list.sort(key=lambda x: x.lower())
    for term in query_as_list:
        lower_letter = term[0].lower()
        current_map = map_reduce_other
        if 'a' <= lower_letter <= 'g':
            current_map = map_reduce_ag
        elif 'h' <= lower_letter <= 'q':
            current_map = map_reduce_hq
        elif 'r' <= lower_letter <= 'z':
            current_map = map_reduce_rz
        posting[term] = current_map.read_from(term.lower())
    print('Done building posting file')
    print('-------------------------------------')
    print('Get relevant Doc')
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list, posting)
    print('Done getting relevant Doc')
    print('-------------------------------------')
    print('Start ranking docs')
    ranked_docs = searcher.ranker.rank_relevant_doc(
        relevant_docs, dictFromQuery, posting, map_reduce_ag, map_reduce_hq,
        map_reduce_rz, map_reduce_other, num_docs_to_retrieve)
    print('Done ranking docs')
    return searcher.ranker.retrieve_top_k(ranked_docs, num_docs_to_retrieve)
Example 7
__author__ = 'Chiru'
import sys
from MapReduce import MapReduce
mr = MapReduce()

#Mapper is called for every record in the data file
def mapper(record):
    #Structure of record: every record has record_type in the first field and order_id in the second field
    record_type = record[0]
    order_id = record[1]

    #The mapper is called on all the orders first (the data file lists order records before line_item records)
    #Because the mapper's output is fed to the reducer, everything we want to display must appear in the mapper's output
    #So emit_intermediate is called for every record, with order_id as the key
    #So for every record the pair (order_id, record) is emitted
    if record_type == "order":
        mr.emit_intermediate(order_id,record)
    elif record_type == "line_item":
        mr.emit_intermediate(order_id,record)

#The reducer function is called for every key in the output of the map phase (here, the global dictionary mr.intermediate)
def reducer(key,list_of_values):
    #In list_of_values the first element is the order record and all the others are line_item records,
    #i.e. list_of_values[0] is the order and list_of_values[1:] are the line_item records

    #The order is emitted paired with every line_item that shares the same order_id
    current = 1
    while current < len(list_of_values):
        mr.emit((list_of_values[0], list_of_values[current]))
        current = current + 1
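
A hedged driver sketch for the join above: it mirrors the execute(open_file, mapper, reducer) entry point used by the later examples in this collection; whether this particular MapReduce class exposes the same method, and the exact input format, are assumptions.

if __name__ == '__main__':
    #Assumption: one record per line, e.g. ["order", "1", ...] and ["line_item", "1", ...]
    inputdata = open(sys.argv[1])
    mr.execute(inputdata, mapper, reducer)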
Example 8
import sys

from MapReduce import MapReduce

width = 5
height = 5
mr = MapReduce()


def mapper(record):
    name, row, col, value = record
    if name == 'a':
        for n in range(width):
            mr.emit_intermediate((row, n), record)
    else:
        for n in range(height):
            mr.emit_intermediate((n, col), record)


def reducer(key, values):
    a = [v for v in values if v[0] == 'a']
    b = [v for v in values if v[0] == 'b']
    total = 0
    for m in a:
        for n in b:
            if n[1] == m[2]:
                total = total + n[-1] * m[-1]
    mr.emit((key[0], key[1], total))


def main():
Example 9
from MapReduce import MapReduce

mr = MapReduce()


def mapper(record):
    # key: word
    # value: filename
    value = record[0]
    text = record[1]
    for key in text.split():
        mr.emit_intermediate(key, value)


def reducer(key, values):
    # key: word
    # value: list of filenames
    mr.emit((key, list(set(values))))


if __name__ == '__main__':
    import sys, json
    inputdata = open(sys.argv[1])
    # inputdata = open('./data/books.json')
    mr.execute(inputdata, mapper, reducer)

    with open('inverted_index.json', 'w') as outfile:
        json.dump(mr.result, outfile)
Example 10
import sys
import json
from math import ceil

from MapReduce import MapReduce
map_reduce_obj = MapReduce()


def mapper(record):
    number_of_baskets = len(record)
    candidate_list = open(sys.argv[2])
    for candidate in candidate_list:
        count = 0
        candidate = json.loads(candidate.strip())
        for candidate_chunk in record:
            if not set(candidate) - set(candidate_chunk):
                count += 1
        map_reduce_obj.emit_intermediate(tuple(candidate), (count, number_of_baskets))


def reduce(key, list_of_value):
    total_count = 0
    total_baskets = 0
    for item in list_of_value:
        total_count += item[0]
        total_baskets += item[1]
    threshold = ceil(total_baskets * 0.3)
    if total_count >= threshold:
        map_reduce_obj.emit([list(key), total_count])
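
The counting pass above has no driver in the source; a minimal hedged sketch follows, mirroring the execute(...) pattern used in the neighbouring examples (whether this MapReduce class exposes execute, and the exact file formats, are assumptions).

if __name__ == '__main__':
    # sys.argv[1]: file of baskets; sys.argv[2] (read inside mapper): candidate itemsets
    baskets = open(sys.argv[1])
    map_reduce_obj.execute(baskets, mapper, reduce)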

Example 11
import sys
from MapReduce import MapReduce

mr = MapReduce()

def mapper(record):
    # key: document identifier
    # value: document contents
    key, seq = record
    mr.emit_intermediate(seq[:-10], 1)

# Part 3
def reducer(key, list_of_values):
    # key: trimmed sequence
    # list_of_values: occurrence counts (ignored; only the unique keys are emitted)
    mr.emit(key)

# Part 4
with open(sys.argv[1]) as f:
    mr.execute(f, mapper, reducer)
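Example 12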
from MapperMatrixVector import MapperMatrixVector
from ReducerMatrixVector import ReducerMatrixVector
from MapReduce import MapReduce
from FileHelper import FileHelper

# Create instances for mapper and reducer
# Note that the vector is stored in the instance
theReducerMatrixVector = ReducerMatrixVector()
theMapperMatrixVector = MapperMatrixVector('dataFiles/b')

# the file where the matrix is stored
matrixFile = ['dataFiles/A']

# MapReduce
theMapReducerMatrixVector = MapReduce(theMapperMatrixVector, theReducerMatrixVector, matrixFile, 0, 1)
resultDict = theMapReducerMatrixVector.execute()

# Write output
outFileDirectory = 'outputs/'
outfileName = 'matrixVectorResults.txt'
FileHelper.writeDictionnary(outFileDirectory + outfileName, resultDict)
Example 13
class Indexer:
    def __init__(self, config, all_terms_dict):
        self.inverted_idx = all_terms_dict
        #self.postingDict = {}
        self.fileName = 'InvertedIndex'
        self.config = config
        # {term: [ordered list where appear : (file_id , lineNumber)]}
        self.thread_pool_size = 1
        avg_ram = (psutil.virtual_memory().available //
                   10) // (self.thread_pool_size + 1)
        path = 'MapReduceData/'
        self.avg_length = (avg_ram // sys.getsizeof(
            (int(), str()))) // (8 / 10)
        # self.map_reduce = MapReduce(self.avg_length,self.thread_pool_size)
        self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'AG/')
        self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'HQ/')
        self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'Rz/')
        self.map_reduce_other = MapReduce(self.avg_length,
                                          self.thread_pool_size,
                                          path + 'Others/')
        self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size,
                                        path + 'Document/')
        # self.tmp_pos = {}
        # self.num_in_pos_tmp = 0
        self.num_in_pos_ag_tmp = [0]
        self.num_in_pos_hq_tmp = [0]
        self.num_in_pos_rz_tmp = [0]
        self.num_in_pos_other_tmp = [0]
        self.num_in_pos_doc = [0]
        self.Entitys = {}
        self.tmp_pos_ag = {}
        self.tmp_pos_hq = {}
        self.tmp_pos_rz = {}
        self.tmp_pos_other = {}
        self.tmp_pos_doc = {}
        # self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
        self.NUMBER_OF_PROCESSES = 5
        self.set_is_writting = {}

    def get_right_tmp_pos_and_num(self, first_letter):
        lower_letter = str(first_letter).lower()
        if 'a' <= lower_letter <= 'g':
            return [
                self.tmp_pos_ag, self.num_in_pos_ag_tmp, self.map_reduce_ag,
                'ag'
            ]
        elif 'h' <= lower_letter <= 'q':
            return [
                self.tmp_pos_hq, self.num_in_pos_hq_tmp, self.map_reduce_hq,
                'hq'
            ]
        elif 'r' <= lower_letter <= 'z':
            return [
                self.tmp_pos_rz, self.num_in_pos_rz_tmp, self.map_reduce_rz,
                'rz'
            ]
        return [
            self.tmp_pos_other, self.num_in_pos_other_tmp,
            self.map_reduce_other, 'others'
        ]

    def save_left_over(self, dict, map_reduce):
        map_reduce.write_dict(dict)
        map_reduce.wait_untill_finish()

    def save_all_map_reduce(self):
        with ProcessPoolExecutor() as process_exector:
            # pass the bound methods themselves; calling them here would run them
            # eagerly in the parent process instead of in the pool
            process_exector.submit(self.map_reduce_ag.save_map_reduce)
            process_exector.submit(self.map_reduce_hq.save_map_reduce)
            process_exector.submit(self.map_reduce_rz.save_map_reduce)
            process_exector.submit(self.map_reduce_other.save_map_reduce)
            process_exector.submit(self.map_reduce_doc.save_map_reduce)

    def check_save_left_over_ag(self):
        if self.num_in_pos_ag_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_ag, self.map_reduce_ag)
            self.num_in_pos_ag_tmp[0] = 0
            self.map_reduce_ag.wait_untill_finish()

    def check_save_left_over_hq(self):
        if self.num_in_pos_hq_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_hq, self.map_reduce_hq)
            self.num_in_pos_hq_tmp[0] = 0
            self.map_reduce_hq.wait_untill_finish()

    def check_save_left_over_rz(self):
        if self.num_in_pos_rz_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_rz, self.map_reduce_rz)
            self.num_in_pos_rz_tmp[0] = 0
            self.map_reduce_rz.wait_untill_finish()

    def check_save_left_over_others(self):
        if self.num_in_pos_other_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_other, self.map_reduce_other)
            self.num_in_pos_other_tmp[0] = 0
            self.map_reduce_other.wait_untill_finish()

    def check_save_left_over_doc(self):
        if self.num_in_pos_doc[0] > 0:
            self.save_left_over(self.tmp_pos_doc, self.map_reduce_doc)
            self.num_in_pos_doc[0] = 0
            self.map_reduce_doc.wait_untill_finish()

    def save_all_left_overs(self):
        with ProcessPoolExecutor() as process_exector:
            process_exector.submit(self.check_save_left_over_ag)
            process_exector.submit(self.check_save_left_over_hq)
            process_exector.submit(self.check_save_left_over_rz)
            process_exector.submit(self.check_save_left_over_others)
            process_exector.submit(self.check_save_left_over_doc)

    def print_meta_data_len(self):
        print('________________________________________________')
        print('Ag:' + str(len(self.map_reduce_ag.meta_data)))
        print('HQ:' + str(len(self.map_reduce_hq.meta_data)))
        print('RZ:' + str(len(self.map_reduce_rz.meta_data)))
        print('OTHER:' + str(len(self.map_reduce_other.meta_data)))
        print('Doc:' + str(len(self.map_reduce_doc.meta_data)))
        print('________________________________________________')
        # return len(self.map_reduce_ag.meta_data) + len(self.map_reduce_hq.meta_data) + len(self.map_reduce_rz.meta_data) + len(self.map_reduce_other.meta_data) + len(self.map_reduce_ag.meta_data)

    def addEntitysToPosting(self, term, tweet_id, quantity):
        str_term = str(term)
        first_letter = str_term[0]
        tmp_pos, number_arr, map_reduce, _ = self.get_right_tmp_pos_and_num(first_letter)
        # first time seeing this entity
        if (term.upper() not in self.Entitys.keys()
                and term.upper() not in tmp_pos.keys()
                and term.lower() not in map_reduce.meta_data.keys()):
            self.Entitys[term.upper()] = (tweet_id, quantity)
        else:
            if term.upper() not in tmp_pos.keys():
                tmp_pos[term.upper()] = []
            if term.upper() in self.Entitys.keys():  # second time seeing it
                self.inverted_idx[term.upper()] = 2
                tmp_pos[term.upper()].append(self.Entitys[term.upper()])
                tmp_pos[term.upper()].append((tweet_id, quantity))
                number_arr[0] += 2
                self.inverted_idx[term.upper()] = 2
            else:
                self.inverted_idx[term.upper()] += 1
                tmp_pos[term.upper()].append((tweet_id, quantity))
                number_arr[0] += 1

    def add_new_doc(self, document):
        """
        This function performs the indexing process for a document object.
        The saved information is captured via two dictionaries ('inverted index' and 'posting').
        :param document: a document that needs to be indexed.
        :return: -
        """
        document_dictionary = document.term_doc_dictionary  #{term:freq,term:freq}
        term_lst = [*document_dictionary]
        term_lst.sort(key=lambda x: x.lower())
        for i in range(len(term_lst)):
            term = term_lst[i]
            tmp_pos, number_arr, map_reduce, key = self.get_right_tmp_pos_and_num(
                term[0])
            try:
                if term[0].isupper() and " " in term:
                    self.addEntitysToPosting(term, document.tweet_id,
                                             document_dictionary[term])
                    continue
                if number_arr[0] >= self.avg_length:
                    map_reduce.write_dict(tmp_pos)
                    self.set_is_writting[key] = 1
                    number_arr[0] = 0
                if key in self.set_is_writting.keys():
                    print('Waiting for the write to finish')
                    map_reduce.wait_untill_finish()
                    print('Done waiting')
                    del self.set_is_writting[key]
                if term.lower() not in tmp_pos.keys():
                    tmp_pos[term.lower()] = []
                tmp_pos[term.lower()].append(
                    (document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except Exception:
                print('INVERTED: problem with the following key {}'.format(term[0]))
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc[0] += 1
        if self.num_in_pos_doc[0] >= self.avg_length:
            if 'doc' not in self.set_is_writting.keys():
                self.map_reduce_doc.write_dict(self.tmp_pos_doc)
                self.set_is_writting['doc'] = 1
            else:
                self.map_reduce_doc.wait_untill_finish()
                del self.set_is_writting['doc']
                self.num_in_pos_doc[0] = 0
Example 14
from MapReduce import MapReduce
import itertools
import sys

map_reduce_obj = MapReduce()


def mapper(record):
    map_reduce_obj.emit_intermediate(record[0], record[1])
    map_reduce_obj.emit_intermediate(record[1], record[0])


def reducer(key, list_of_values):
    value_group = list(itertools.combinations(list_of_values, 2))
    for value in value_group:
        value = list(value)
        value.sort()
        value.append(key)
        map_reduce_obj.emit(value)


if __name__ == '__main__':
    input_data = open(sys.argv[1])
    map_reduce_obj.execute(input_data, mapper, reducer)
Example 15
 def __init__(self):
     self.map_reduce_ag = MapReduce.import_map_reduce('MapReduceData/AG/')
     self.map_reduce_hq = MapReduce.import_map_reduce('MapReduceData/HQ/')
     self.map_reduce_rz = MapReduce.import_map_reduce('MapReduceData/RZ/')
     self.map_reduce_other = MapReduce.import_map_reduce('MapReduceData/Others/')
Example 16
class Indexer:
    def __init__(self, config, all_terms_dict):
        self.inverted_idx = all_terms_dict
        #self.postingDict = {}
        self.fileName = 'InvertedIndex'
        self.config = config
        # {term: [ordered list where appear : (file_id , lineNumber)]}
        self.thread_pool_size = 2
        avg_ram = (psutil.virtual_memory().available //
                   self.thread_pool_size) // 10
        path = 'MapReduceData/'
        self.avg_length = (avg_ram // sys.getsizeof(
            (int(), str()))) // (8 / 10)
        # self.map_reduce = MapReduce(self.avg_length,self.thread_pool_size)
        self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'AG/')
        self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'HQ/')
        self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'Rz/')
        self.map_reduce_other = MapReduce(self.avg_length,
                                          self.thread_pool_size,
                                          path + 'Others/')
        self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size,
                                        path + 'Document/')
        self.tmp_pos = {}
        # self.num_in_pos_tmp = 0
        self.num_in_pos_ag_tmp = [0]
        self.num_in_pos_hq_tmp = [0]
        self.num_in_pos_rz_tmp = [0]
        self.num_in_pos_other_tmp = [0]
        self.num_in_pos_doc_other = [0]
        self.Entitys = {}
        self.tmp_pos_ag = {}
        self.tmp_pos_hq = {}
        self.tmp_pos_rz = {}
        self.tmp_pos_other = {}
        self.tmp_pos_doc = {}
        # self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
        self.NUMBER_OF_PROCESSES = 5

    def get_right_tmp_pos_and_num(self, first_letter):
        lower_letter = str(first_letter).lower()
        if 'a' <= lower_letter <= 'g':
            return [
                self.tmp_pos_ag, self.num_in_pos_ag_tmp, self.map_reduce_ag
            ]
        elif 'h' <= lower_letter <= 'q':
            return [
                self.tmp_pos_hq, self.num_in_pos_hq_tmp, self.map_reduce_hq
            ]
        elif 'r' <= lower_letter <= 'z':
            return [
                self.tmp_pos_rz, self.num_in_pos_rz_tmp, self.map_reduce_rz
            ]
        return [
            self.tmp_pos_other, self.num_in_pos_other_tmp,
            self.map_reduce_other
        ]

    def wait_untill_all_finish(self):
        self.map_reduce_ag.wait_untill_finish()
        self.map_reduce_hq.wait_untill_finish()
        self.map_reduce_rz.wait_untill_finish()
        self.map_reduce_other.wait_untill_finish()
        self.map_reduce_doc.wait_untill_finish()

    def save_left_over(self, dict, map_reduce):
        map_reduce.write_dict_func(dict)

    def check_save_left_over_ag(self):
        if self.num_in_pos_ag_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_ag, self.map_reduce_ag)
            self.num_in_pos_ag_tmp[0] = 0

    def check_save_left_over_hq(self):
        if self.num_in_pos_hq_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_hq, self.map_reduce_hq)
            self.num_in_pos_hq_tmp[0] = 0

    def check_save_left_over_rz(self):
        if self.num_in_pos_rz_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_rz, self.map_reduce_rz)
            self.num_in_pos_rz_tmp[0] = 0

    def check_save_left_over_others(self):
        if self.num_in_pos_other_tmp[0] > 0:
            self.save_left_over(self.tmp_pos_other, self.map_reduce_other)
            self.num_in_pos_other_tmp[0] = 0

    def check_save_left_over_doc(self):
        if self.num_in_pos_doc_other[0] > 0:
            self.save_left_over(self.tmp_pos_doc, self.map_reduce_doc)
            self.num_in_pos_doc_other[0] = 0

    def save_all_left_overs(self):
        # self.check_save_left_over_ag()
        # self.check_save_left_over_doc()
        # self.check_save_left_over_hq()
        # self.check_save_left_over_rz()
        # self.check_save_left_over_others()
        with ProcessPoolExecutor() as process_exector:
            process_exector.submit(self.check_save_left_over_ag)
            process_exector.submit(self.check_save_left_over_hq)
            process_exector.submit(self.check_save_left_over_rz)
            process_exector.submit(self.check_save_left_over_others)
            process_exector.submit(self.check_save_left_over_doc)

    def add_entitys_to_posting(self, term, tweet_id, quantity):
        first_letter = term[0]
        tmp_pos, number_arr, _ = self.get_right_tmp_pos_and_num(first_letter)
        if (term.upper() not in self.Entitys.keys()
                and term.upper() not in tmp_pos.keys()):
            self.Entitys[term.upper()] = (tweet_id, quantity)
        else:
            if term.upper() not in self.inverted_idx.keys():
                self.inverted_idx[term.upper()] = 2
            else:
                self.inverted_idx[term.upper()] += 1
            if term.upper() not in tmp_pos.keys():
                tmp_pos[term.upper()] = []
                tmp_pos[term.upper()].append(self.Entitys[term.upper()])
                del self.Entitys[term.upper()]
            tmp_pos[term.upper()].append((tweet_id, quantity))

    def add_new_doc(self, document):
        """
        This function performs the indexing process for a document object.
        The saved information is captured via two dictionaries ('inverted index' and 'posting').
        :param document: a document that needs to be indexed.
        :return: -
        """
        document_dictionary = document.term_doc_dictionary  #{term:freq,term:freq}
        term_lst = [*document_dictionary]
        term_lst.sort(key=lambda x: x.lower())
        for i in range(len(term_lst)):
            term = term_lst[i]
            tmp_pos, number_arr, map_reduce = self.get_right_tmp_pos_and_num(
                term[0])
            try:
                if term[0].isupper() and " " in term:
                    self.add_entitys_to_posting(term, document.tweet_id,
                                                document_dictionary[term])
                    continue
                if number_arr[0] >= self.avg_length:
                    map_reduce.write_dict_func(tmp_pos)
                    number_arr[0] = 0
                if term.lower() not in tmp_pos.keys():
                    tmp_pos[term.lower()] = []
                tmp_pos[term.lower()].append(
                    (document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except Exception:
                print('problem with the following key {}'.format(term[0]))
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc_other[0] += 1
        if self.num_in_pos_doc_other[0] >= self.avg_length:
            self.map_reduce_doc.write_dict_func(self.tmp_pos_doc)
            self.num_in_pos_doc_other[0] = 0
Example 17
from MapperCountingWords import MapperCountingWords
from ReducerCountingWords import ReducerCountingWords
from MapReduce import MapReduce
from FileHelper import FileHelper

# Create instances for mapper and reducer
theMapper = MapperCountingWords()
theReducer = ReducerCountingWords()

# parse the file: one word per line
inFiles = ['dataFiles/text']

# we can have more than one text file
inFileParsed = 'dataFiles/textParsed'
FileHelper.transformTextIntoListOfWords(inFiles, inFileParsed)

# MapReduce
theMapReducer = MapReduce(theMapper, theReducer, [inFileParsed], silent=-1, nThreads=5)
resultDict = theMapReducer.execute()

# Write output
outFileDirectory = 'outputs/'
outfileName = 'coutingWordsResults.txt'
FileHelper.writeDictionnary(outFileDirectory + outfileName, resultDict)