def test_single_file_merge(self):
        """Write one grouped key/value file and check that merging it alone matches the source.

        The generated data is deep-copied up front so that the copy can be
        compared against the merge output independently of whatever
        ``compute_hashmap`` does with the original iterator.
        """
        merge_path = "data/single_merge"

        source = IncrementalKeyValueIterator(9, 9, 2, 3, 1)
        expected = copy.deepcopy(source)

        # Group the source data and persist it as the only input file.
        grouped = compute_hashmap(source)
        GroupByStatement.write_key_values_to_file(grouped, merge_path)

        merged = MergeFileIterator([merge_path])
        self.compare_outputs(expected, merged)
    def test_multi_file_merge(self):
        """Spread generated key/value pairs over 30 files and check the merged result.

        File ``i`` receives ``2 * i + 1`` consecutive entries drawn from the
        source iterator. Within each file the values are grouped per key and
        the groups are written in sorted key order. A deep copy of the source
        taken before any entry is drawn is compared against the merge output.
        """
        num_files = 30
        entries_per_file = [2 * i + 1 for i in range(num_files)]
        total_entries = sum(entries_per_file)

        source = IncrementalKeyValueIterator(total_entries, 23, 11, 11, 2)
        expected = copy.deepcopy(source)

        filenames = ["data/multi_merge_{}".format(i) for i in range(num_files)]

        for path, entry_count in zip(filenames, entries_per_file):
            # Collect this file's share of the stream, grouping values by key.
            grouped = defaultdict(list)
            for _ in range(entry_count):
                key, value = next(source)
                grouped[key].append(value)

            ordered_pairs = [(key, grouped[key]) for key in sorted(grouped)]
            GroupByStatement.write_key_values_to_file(ordered_pairs, path)

        merged = MergeFileIterator(filenames)
        self.compare_outputs(expected, merged)