    def child_node_processor(self, comm):
        rank = comm.Get_rank()
        size = comm.Get_size()
        Utilities.write_log(rank, 'start rank:' + str(rank), True, self.IS_LOGGING_ENABLED)
        Utilities.write_log(rank, 'size:' + str(size), False, self.IS_LOGGING_ENABLED)

        final_child_data = self.process_tweet_data(comm)

        Utilities.write_log(rank, str(final_child_data), False, self.IS_LOGGING_ENABLED)
        # wait for commands from the master node: 'send_data_back' returns this
        # worker's results, 'kill' terminates the process
        while True:
            Utilities.write_log(rank, 'WAITING FOR COMMAND', False, self.IS_LOGGING_ENABLED)
            command = comm.recv(source=0, tag=rank)
            Utilities.write_log(rank, 'COMMAND RECEIVED ->' + str(command), False, self.IS_LOGGING_ENABLED)
            if command == 'send_data_back':
                comm.send(final_child_data, dest=0, tag=0)
            elif command == 'kill':
                exit(0)

    def __init__(self, argv, melb_grid_file_name, IS_LOGGING_ENABLED=False):
        # process the melb_grid file: build a lookup of grid id -> region properties
        self.melb_grid_obj = {}
        with open(melb_grid_file_name, encoding="utf8") as mg_json_file:
            mg_data = json.load(mg_json_file)
            for mgobj in mg_data["features"]:
                grid_id = mgobj["properties"]["id"]
                self.melb_grid_obj[grid_id] = mgobj["properties"]

        # process argv arguments
        self.data_file_name = Utilities.parse_cmd_arguments(argv)

        self.IS_LOGGING_ENABLED = IS_LOGGING_ENABLED

    def process_tweet_data(self, comm):
        # variables
        rank = comm.Get_rank()
        size = comm.Get_size()
        melb_grid_data = self.melb_grid_obj
        json_filename = self.data_file_name
        region_post_count = []
        region_hashtags_count = {}

        # read the input file line by line; each process handles every
        # `size`-th line (round-robin partitioning by rank)
        with open(json_filename) as input_file:
            for line_num, line in enumerate(input_file):
                if line_num % size == rank:
                    try:
                        # strip the trailing comma so the line parses as standalone JSON
                        if line[-2:] == ",\n":
                            line = line[:-2]

                        # data loading
                        line_obj = json.loads(line)

                        ret = {
                            'id': line_obj['doc']['_id'],
                            'text': line_obj['doc']['text'],
                            'coordinates': line_obj['doc']['coordinates']['coordinates'],
                            'hashtags': Utilities.extract_hashtags_from_text(line_obj['doc']['text'])
                        }

                        # map the tweet to a grid region
                        region_key = self.get_region_from_tweet(ret, rank, None)
                        if region_key is not None:
                            # increment the post count for this region
                            _f = Utilities.check_tuple_exists(region_post_count, region_key)
                            if _f is not None:
                                region_post_count[_f[0]] = (_f[1][0], _f[1][1] + 1)
                            else:
                                region_post_count.append((region_key, 1))

                            # accumulate hashtag counts for this region
                            region_hashtags_count = self.set_hashtags_for_region(
                                ret, region_key, region_hashtags_count, rank, None)
                    except Exception:
                        # skip lines that fail to parse or cannot be mapped to a region
                        pass

        region_post_count = Utilities.sort_tuple(region_post_count)
        Utilities.write_log(rank, str(region_post_count), False, self.IS_LOGGING_ENABLED)
        Utilities.write_log(rank, str(region_hashtags_count), False, self.IS_LOGGING_ENABLED)
        return (region_post_count, region_hashtags_count)
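
    # NOTE: Utilities.check_tuple_exists is not shown in this listing. From its
    # usage above it appears to return (index, (key, count)) when `key` occurs
    # in the list of tuples, and None otherwise - roughly (an assumption, for
    # reference only):
    #
    #     def check_tuple_exists(tuples, key):
    #         for idx, tup in enumerate(tuples):
    #             if tup[0] == key:
    #                 return (idx, tup)
    #         return None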

    def get_data_from_child_nodes(self, comm):
        data = []
        size = comm.Get_size()
        for i in range(size - 1):
            Utilities.write_log(0, 'SENDING COMMAND TO GET DATA BACK -> ' + str(i + 1), False, self.IS_LOGGING_ENABLED)
            comm.send('send_data_back', dest=(i + 1), tag=(i + 1))
        for i in range(size - 1):
            Utilities.write_log(0, 'RECEIVING DATA FROM ->' + str(i + 1), False, self.IS_LOGGING_ENABLED)
            data.append(comm.recv(source=(i + 1), tag=0))
            Utilities.write_log(0, 'Data Status->' + str(data), False, self.IS_LOGGING_ENABLED)
        return data

    def master_node_processor(self, comm):
        rank = comm.Get_rank()
        size = comm.Get_size()
        Utilities.write_log(rank, 'start rank:' + str(rank), True, self.IS_LOGGING_ENABLED)
        Utilities.write_log(rank, 'size:' + str(size), False, self.IS_LOGGING_ENABLED)

        final_data = self.process_tweet_data(comm)

        ret_data = []
        if size > 1:
            child_data_arr = self.get_data_from_child_nodes(comm)

            Utilities.write_log(0, str(child_data_arr), False, self.IS_LOGGING_ENABLED)

            for i in range(size - 1):
                comm.send('kill', dest=(i + 1), tag=(i + 1))

            # merge per-region post counts from the child nodes into the master's counts
            for i, tup in enumerate(final_data[0]):
                inc_val = tup[1]
                for child_tup in child_data_arr:
                    _f = Utilities.check_tuple_exists(child_tup[0], tup[0])
                    if _f is not None:
                        inc_val = inc_val + _f[1][1]
                        final_data[0][i] = (tup[0], inc_val)
                    else:
                        # region counted by the master but absent from this child's
                        # results; nothing to add for that child
                        pass

            # merge hashtag counts from the child nodes into the master's dictionary
            master_node_dict = final_data[1]
            for child_tuple in child_data_arr:
                for data_key in child_tuple[1]:
                    if data_key in master_node_dict:
                        master_node_dict[data_key] = master_node_dict[data_key] + child_tuple[1][data_key]
                    else:
                        master_node_dict[data_key] = child_tuple[1][data_key]

            # reduce the merged counts to the top five hashtags per region
            reduced = {}       # region -> list of (hashtag, count) tuples
            reduced_hash = {}  # region -> counts parallel to `reduced`
            for data_key in master_node_dict:
                # composite keys have the form '<region>||<hashtag>'
                data_arr = data_key.split('||')
                master_key = data_arr[0]
                hash_key = data_arr[1]
                data_val = master_node_dict[data_key]
                if master_key not in reduced:
                    reduced[master_key] = []
                    reduced_hash[master_key] = []
                    _min = 0
                else:
                    _min = min(reduced_hash[master_key])

                # when the region already holds five hashtags, replace the current
                # minimum if this count is at least as large
                if data_val >= _min:
                    if len(reduced_hash[master_key]) == 5:
                        _idx = reduced_hash[master_key].index(_min)
                        reduced_hash[master_key][_idx] = data_val
                        reduced[master_key][_idx] = (hash_key, data_val)

                # while the region holds fewer than five hashtags, simply append
                if len(reduced_hash[master_key]) < 5:
                    reduced_hash[master_key].append(data_val)
                    reduced[master_key].append((hash_key, data_val))

            # print(reduced_hash)
            # print(reduced)
            for key in reduced:
                reduced[key] = Utilities.sort_tuple(reduced[key])

        elif size == 1:
            # only one process is running - reduce the master's own hashtag
            # counts to the top five per region
            reduced = {}       # region -> list of (hashtag, count) tuples
            reduced_hash = {}  # region -> counts parallel to `reduced`
            for data_key in final_data[1]:
                data_arr = data_key.split('||')
                master_key = data_arr[0]
                hash_key = data_arr[1]
                data_val = final_data[1][data_key]
                if master_key not in reduced:
                    reduced[master_key] = []
                    reduced_hash[master_key] = []
                    _min = 0
                else:
                    _min = min(reduced_hash[master_key])

                if data_val >= _min:
                    if len(reduced_hash[master_key]) == 5:
                        _idx = reduced_hash[master_key].index(_min)
                        reduced_hash[master_key][_idx] = data_val
                        reduced[master_key][_idx] = (hash_key, data_val)

                if len(reduced_hash[master_key]) < 5:
                    reduced_hash[master_key].append(data_val)
                    reduced[master_key].append((hash_key, data_val))

            for key in reduced:
                reduced[key] = Utilities.sort_tuple(reduced[key])

        final_data = (final_data[0], reduced)
        return final_data
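
    # For illustration only (not part of the original code): the top-five
    # reduction above could be expressed more compactly with heapq, assuming
    # the merged dictionary maps '<region>||<hashtag>' keys to counts, as the
    # split on '||' implies:
    #
    #     import heapq
    #
    #     def reduce_top_hashtags(merged_counts, top_n=5):
    #         per_region = {}
    #         for data_key, count in merged_counts.items():
    #             region, hashtag = data_key.split('||', 1)
    #             per_region.setdefault(region, []).append((hashtag, count))
    #         return {region: heapq.nlargest(top_n, tags, key=lambda t: t[1])
    #                 for region, tags in per_region.items()}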

    def start_processing(self):
        _manager = MpiGeoManager(self.melb_grid_obj, self.data_file_name,
                                 self.IS_LOGGING_ENABLED)
        output = _manager.start_processing()
        Utilities.generate_output(output)
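
# A minimal driver sketch (not part of the original listing) showing how the
# master/child roles above are typically selected by MPI rank with mpi4py.
# The class name TweetProcessor and the grid file name are assumptions for
# illustration; only the method names and Utilities.generate_output come from
# the code above.
from mpi4py import MPI
import sys

if __name__ == '__main__':
    comm = MPI.COMM_WORLD
    processor = TweetProcessor(sys.argv, 'melbGrid.json', IS_LOGGING_ENABLED=False)
    if comm.Get_rank() == 0:
        # rank 0 coordinates the run and merges results from the workers
        output = processor.master_node_processor(comm)
        Utilities.generate_output(output)
    else:
        # every other rank processes its share of lines and waits for commands
        processor.child_node_processor(comm)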