class Reducer:
    """Merge per-mapper word-count hash maps into one reduced map and
    report the ten most frequent words.

    Buckets in a HashMap's ``table`` are either a bare ``(word, count)``
    tuple or a ``LinkedList`` of collided ``(word, count)`` nodes.
    NOTE(review): assumes mappers store one ``(word, 1)`` node per
    occurrence, so a lone tuple bucket represents a single occurrence —
    confirm against the mapper implementation.
    """

    def __init__(self, hash_maps: list):
        # Intermediate maps produced by the map phase.
        self.hash_maps = hash_maps
        # Single combined map built by reduce().
        self.reduced_hash_map = HashMap()
        self.top_ten_words = []

    def reduce(self):
        """Fold every bucket of every mapper hash map into the reduced map."""
        for hash_map in self.hash_maps:
            for bucket in hash_map.table:
                self.add_to_new_hash_table(bucket)

    def add_to_new_hash_table(self, bucket):
        """Dispatch one mapper bucket into the reduced map.

        A LinkedList bucket (hash collision chain) is expanded node by
        node; a bare tuple bucket is a single occurrence of one word.
        Empty/None slots are ignored.
        """
        if isinstance(bucket, LinkedList):
            self.add_linked_list_to_new_hash_table(bucket)
        elif isinstance(bucket, tuple):
            hashed_key = self.reduced_hash_map.hash_string(bucket[0])
            self.reduced_hash_map.add_to_hash_table(hashed_key, (bucket[0], 1))

    def add_linked_list_to_new_hash_table(self, bucket: LinkedList):
        """Count each distinct word in a collision chain exactly once.

        ``bucket_keys`` tracks words already counted so duplicate nodes
        for the same word are skipped.
        """
        bucket_keys = []
        for node in bucket.traverse():
            # FIX: check the tuple guard BEFORE indexing node.data[0] —
            # the original indexed first, so non-tuple data defeated the
            # guard (and could raise before it ever ran).
            if not isinstance(node.data, tuple):
                continue
            if node.data[0] in bucket_keys:
                continue
            bucket_keys = self.add_nodes_to_new_hash_table(
                bucket_keys, bucket, node)

    def add_nodes_to_new_hash_table(self, bucket_keys: list,
                                    current_bucket: LinkedList,
                                    current_node: Node) -> list:
        """Tally all nodes in ``current_bucket`` sharing ``current_node``'s
        word, store ``(word, total)`` in the reduced map, and return the
        updated list of already-counted words.
        """
        current_count = 0
        bucket_keys.append(current_node.data[0])
        # O(n) rescan of the chain per distinct word; acceptable for the
        # short collision chains a hash table should produce.
        for next_node in current_bucket.traverse():
            if next_node.data[0] == current_node.data[0]:
                current_count += 1
        hashed_key = self.reduced_hash_map.hash_string(current_node.data[0])
        self.reduced_hash_map.add_to_hash_table(
            hashed_key, (current_node.data[0], current_count))
        return bucket_keys

    def get_top_ten_words(self):
        """Return the ten highest-count ``(word, count)`` pairs, ascending.

        Collects at most three candidates per bucket (a chain longer than
        three can contribute at most its three largest counts to the
        global top ten only if... NOTE(review): a bucket with >3 entries
        could in principle hold more than three of the global top ten;
        the 3-per-bucket cap is inherited from the original design —
        confirm it is an accepted approximation).
        """
        top_words_per_bucket = []
        for bucket in self.reduced_hash_map.table:
            if isinstance(bucket, tuple):
                top_words_per_bucket.append(bucket)
            elif isinstance(bucket, LinkedList):
                # Seed with zero-count sentinels; any real word displaces
                # the current smallest entry.
                top_three = [(" ", 0), (" ", 0), (" ", 0)]
                for node in bucket.traverse():
                    top_three.sort(key=itemgetter(1))
                    for index, value in enumerate(top_three):
                        if node.data[1] > value[1]:
                            top_three[index] = node.data
                            break
                # FIX: drop unfilled sentinels — the original extended all
                # three slots, so (" ", 0) placeholders could leak into
                # the returned top-ten when fewer than ten real words exist.
                top_words_per_bucket.extend(
                    entry for entry in top_three if entry[1] > 0)
        top_words_per_bucket.sort(key=itemgetter(1))
        return top_words_per_bucket[-10:]
def add_words_to_hash_map(self, current_hash_map: HashMap, current_line: str):
    """Tokenize one line of text and add each word to the current hash map.

    Every 50 lines (tracked via ``self.line_count``, incremented by the
    caller — NOTE(review): not incremented here, confirm the caller does)
    the full map is archived to ``self.hash_maps`` and a fresh HashMap is
    started. Returns the map to use for subsequent lines.
    """
    if self.line_count >= 50:
        self.line_count = 0
        self.hash_maps.append(current_hash_map)
        current_hash_map = HashMap()
    # FIX: the original used list.remove('\n') / list.remove(''), which
    # deletes only the FIRST occurrence — consecutive spaces or multiple
    # blank tokens left ''/'\n' in the list and they were counted as
    # words. Filter out every such token instead.
    split_line = [token for token in current_line.split(' ')
                  if token not in ('', '\n')]
    for word in self.strip_punctuation(split_line):
        # Skip tokens that are purely punctuation characters.
        if word in self.punctuation:
            continue
        map_index = current_hash_map.hash_string(word)
        # Each occurrence contributes a (word, 1) record; the reduce
        # phase sums them.
        current_hash_map.add_to_hash_table(map_index, (word, 1))
    return current_hash_map