def test_single_file_merge(self):
    """Check a merge over a single key/value file against the input data.

    The generated data is grouped with ``compute_hashmap``, written to one
    file, and read back through ``MergeFileIterator``; the result is
    compared with an untouched deep copy of the generated data.
    """
    source = IncrementalKeyValueIterator(9, 9, 2, 3, 1)
    # Snapshot taken before `source` is handed to compute_hashmap
    # (presumably that call advances the iterator -- confirm).
    expected = copy.deepcopy(source)

    path = "data/single_merge"
    grouped = compute_hashmap(source)
    GroupByStatement.write_key_values_to_file(grouped, path)

    merged = MergeFileIterator([path])
    self.compare_outputs(expected, merged)
def test_multi_file_merge(self):
    """Check a merge over 30 key/value files against the input data.

    File ``i`` receives the next ``2 * i + 1`` entries drawn from the
    generated data, grouped by key and written in sorted key order.  All
    files are then read back through ``MergeFileIterator`` and the result
    is compared with an untouched deep copy of the generated data.
    """
    num_files = 30
    counts = [2 * i + 1 for i in range(num_files)]
    source = IncrementalKeyValueIterator(sum(counts), 23, 11, 11, 2)
    # Snapshot taken before any entry is pulled from `source` below.
    expected = copy.deepcopy(source)

    paths = []
    for file_no, count in enumerate(counts):
        path = "data/multi_merge_{}".format(file_no)
        paths.append(path)

        # Group this file's share of the entries by key.
        grouped = defaultdict(list)
        for _ in range(count):
            key, value = next(source)
            grouped[key].append(value)

        ordered = [(key, grouped[key]) for key in sorted(grouped)]
        GroupByStatement.write_key_values_to_file(ordered, path)

    merged = MergeFileIterator(paths)
    self.compare_outputs(expected, merged)