def test_single_file_merge(self):
    data = IncrementalKeyValueIterator(9, 9, 2, 3, 1)
    data_copy = copy.deepcopy(data)
    tmp_filename = "data/single_merge"
    GroupByStatement.write_key_values_to_file(compute_hashmap(data), tmp_filename)
    m = MergeFileIterator([tmp_filename])
    self.compare_outputs(data_copy, m)
def test_empty_stream(self):
    g = GroupByStatement(max_num_files=10, max_hashmap_entries=1000,
                         request_id="test_empty_stream")
    data = IncrementalKeyValueIterator(0, 1, 0)
    result = g.groupBy(data)
    self.assertEqual(g.spills, 0)
    self.assertEqual(result.hasNext(), False)
def test_chunk_input_into_dump_files(self):
    g = GroupByStatement(max_num_files=100, max_hashmap_entries=100000,
                         request_id="test_chunk_input_into_dump_files")
    data = IncrementalKeyValueIterator(100000, 10, 7)
    data_copy = copy.deepcopy(data)
    result_hashmap = g._chunk_input_into_dump_files(data)
    self.assertEqual(sorted(result_hashmap.items()), compute_hashmap(data_copy))
def test_stream_spills_on_disk(self):
    g = GroupByStatement(max_num_files=4, max_hashmap_entries=300,
                         request_id="test_stream_spills_on_disk")
    data = IncrementalKeyValueIterator(1000, 10, 7)
    data_copy = copy.deepcopy(data)
    result_iterator = g.groupBy(data)
    self.assertEqual(g.spills, 4)
    self.compare_outputs(data_copy, result_iterator)
def test_low_memory(self):
    # With a tight memory budget, the statement must spill and merge but
    # still produce correct output.
    g = GroupByStatement(max_memory=1024, request_id="test_low_memory")
    data = IncrementalKeyValueIterator(1000, 10, 7)
    data_copy = copy.deepcopy(data)
    result_iterator = g.groupBy(data)
    self.assertTrue(g.spills > 0)
    self.assertTrue(g.num_merge_stages > 0)
    self.assertTrue(g._num_files <= 1000)
    self.compare_outputs(data_copy, result_iterator)
def test_large_stream(self):
    # 200000 entries with a 10000-entry hashmap limit: expect 20 spill
    # files, all within the 100-file limit.
    g = GroupByStatement(max_num_files=100, max_hashmap_entries=10000,
                         request_id="test_large_stream")
    data = IncrementalKeyValueIterator(200000, 10, 7, 3, 2)
    data_copy = copy.deepcopy(data)
    result_iterator = g.groupBy(data)
    self.assertEqual(g.spills, 20)
    self.assertEqual(g._num_files, 20)
    self.compare_outputs(data_copy, result_iterator)
def test_stream_spills_on_disk_and_file_merges_required(self):
    # 1000 entries with a 100-entry hashmap limit produce 10 spill files,
    # which must then be merged down to the max_num_files limit of 2.
    g = GroupByStatement(
        max_num_files=2, max_hashmap_entries=100,
        request_id="test_stream_spills_on_disk_and_file_merges_required")
    data = IncrementalKeyValueIterator(1000, 10, 7)
    data_copy = copy.deepcopy(data)
    result_iterator = g.groupBy(data)
    self.assertEqual(g.spills, 10)
    self.assertEqual(g.num_merge_stages, 3)
    self.assertEqual(g._num_files, 2)
    self.compare_outputs(data_copy, result_iterator)
def test_consecutive_calls(self):
    # Reuse one GroupByStatement across several groupBy calls: each call's
    # temporary directory must be removed once its result is exhausted.
    g = GroupByStatement(max_num_files=2, max_hashmap_entries=1)
    result_iterator_list = []
    request_id_list = []
    for request_id in range(10):
        data = IncrementalKeyValueIterator(10, 3, 3)
        result_iterator_list.append(g.groupBy(data))
        request_id_list.append(g._request_id)
    for index in range(10):
        # Exhaust the iterator so its temporary files can be cleaned up.
        for key, value in result_iterator_list[index]:
            pass
        self.assertFalse(os.path.isdir(request_id_list[index]))
def test_multi_file_merge(self):
    # Build 30 sorted spill files of increasing size and check that
    # MergeFileIterator merges them back into the original stream.
    num_files = 30
    entries_per_file = [2 * index + 1 for index in range(num_files)]
    N = sum(entries_per_file)
    data = IncrementalKeyValueIterator(N, 23, 11, 11, 2)
    data_copy = copy.deepcopy(data)
    filenames = []
    for index in range(num_files):
        tmp_filename = "data/multi_merge_{}".format(index)
        filenames.append(tmp_filename)
        file_content = defaultdict(list)
        for _ in range(entries_per_file[index]):
            key, value = next(data)
            file_content[key].append(value)
        GroupByStatement.write_key_values_to_file(
            [(key, file_content[key]) for key in sorted(file_content.keys())],
            tmp_filename)
    m = MergeFileIterator(filenames)
    self.compare_outputs(data_copy, m)
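
# The helpers used throughout these tests (compute_hashmap, compare_outputs)
# are defined elsewhere in the suite. As a rough, illustrative sketch only --
# not the suite's actual implementation -- they are assumed to behave along
# these lines: group a key/value stream into sorted (key, values) pairs, and
# check a grouped result iterator against that reference grouping.
def reference_compute_hashmap(key_value_iterator):
    # Group values by key; return (key, values) pairs in sorted key order.
    grouped = {}
    for key, value in key_value_iterator:
        grouped.setdefault(key, []).append(value)
    return [(key, grouped[key]) for key in sorted(grouped)]


def reference_compare_outputs(test_case, original_data, grouped_iterator):
    # Compare a grouped iterator against the reference grouping of the
    # original stream, ignoring the order of values within each group.
    expected = dict(reference_compute_hashmap(original_data))
    seen_keys = []
    for key, values in grouped_iterator:
        seen_keys.append(key)
        test_case.assertEqual(sorted(values), sorted(expected[key]))
    test_case.assertEqual(sorted(seen_keys), sorted(expected.keys()))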