def test_min_bundle_size(self):
    """min_bundle_size larger than the file must suppress splitting.

    Writes a small parquet file, then checks that a huge
    ``min_bundle_size`` yields exactly one split even when a tiny
    ``desired_bundle_size`` is requested, while ``min_bundle_size=0``
    allows multiple splits.
    """
    file_name = self._write_data(count=120, row_group_size=20)

    # 100 MiB minimum dwarfs the file, so splitting cannot happen even
    # though the desired bundle size is a single byte.
    source = _create_parquet_source(
        file_name, min_bundle_size=100 * 1024 * 1024)
    splits = list(source.split(desired_bundle_size=1))
    self.assertEqual(len(splits), 1)

    # With no minimum, the 1-byte desired bundle size forces the source
    # to produce more than one split.
    source = _create_parquet_source(file_name, min_bundle_size=0)
    splits = list(source.split(desired_bundle_size=1))
    self.assertNotEqual(len(splits), 1)
def test_split_points(self):
    """Verify the split-point accounting reported while reading.

    Writes a file with four row groups (3000 records each), reads the
    single unsplit bundle, and records ``range_tracker.split_points()``
    after each item read.
    """
    file_name = self._write_data(count=12000, row_group_size=3000)
    source = _create_parquet_source(file_name)

    splits = list(source.split(desired_bundle_size=float('inf')))
    assert len(splits) == 1
    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)

    split_points_report = []
    for _ in splits[0].source.read(range_tracker):
        split_points_report.append(range_tracker.split_points())

    # There are a total of four row groups. Each row group has 3000 records.
    # When reading records of the first group, range_tracker.split_points()
    # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN); only at
    # the final group does the remainder become known (exactly 1 left).
    self.assertEqual(
        split_points_report,
        [
            (0, RangeTracker.SPLIT_POINTS_UNKNOWN),
            (1, RangeTracker.SPLIT_POINTS_UNKNOWN),
            (2, RangeTracker.SPLIT_POINTS_UNKNOWN),
            (3, 1),
        ])
def test_dynamic_work_rebalancing(self):
    """Exhaustively check dynamic splitting of an unsplit parquet bundle.

    Delegates to ``assert_split_at_fraction_exhaustive``, which tries
    splitting at every fraction/position and verifies consistency.
    """
    file_name = self._write_data(count=120, row_group_size=20)
    source = _create_parquet_source(file_name)

    # An infinite desired bundle size guarantees a single initial split.
    splits = list(source.split(desired_bundle_size=float('inf')))
    assert len(splits) == 1
    source_test_utils.assert_split_at_fraction_exhaustive(
        splits[0].source, splits[0].start_position, splits[0].stop_position)
def test_source_display_data(self):
    """Display data must expose the compression mode and file pattern."""
    file_name = 'some_parquet_source'
    # validate=False: the pattern does not need to match real files here.
    source = _create_parquet_source(file_name, validate=False)
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name),
    ]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
def _run_parquet_test(self, pattern, columns, desired_bundle_size,
                      perform_splitting, expected_result):
    """Read ``pattern`` through a parquet source and check the records.

    When ``perform_splitting`` is set, splits the source at
    ``desired_bundle_size`` and compares the splits against the unsplit
    source as a reference; otherwise reads directly and compares the
    records against ``expected_result``.
    """
    source = _create_parquet_source(pattern, columns=columns)

    # Simple case first: read the whole source without splitting.
    if not perform_splitting:
        read_records = source_test_utils.read_from_source(source, None, None)
        self.assertCountEqual(expected_result, read_records)
        return

    assert desired_bundle_size
    sources_info = [
        (split.source, split.start_position, split.stop_position)
        for split in source.split(desired_bundle_size=desired_bundle_size)
    ]
    # A single split would make the splitting comparison meaningless.
    if len(sources_info) < 2:
        raise ValueError('Test is trivial. Please adjust it so that at least '
                         'two splits get generated')
    source_test_utils.assert_sources_equal_reference_source(
        (source, None, None), sources_info)
def test_read_reentrant(self):
    """Reading the same parquet source twice must succeed."""
    path = self._write_data(count=6, row_group_size=3)
    src = _create_parquet_source(path)
    source_test_utils.assert_reentrant_reads_succeed((src, None, None))