def test_split_points(self):
    """Check the split points reported by the range tracker during a read.

    Writes 12000 records, splits the source with an unbounded desired bundle
    size (exactly one split is expected), then reads that split to the end
    and records ``range_tracker.split_points()`` after every record.
    """
    file_name = self._write_data(count=12000)
    source = AvroSource(file_name)

    splits = list(source.split(desired_bundle_size=float('inf')))
    # unittest assertion instead of a bare assert: it is not stripped under
    # ``python -O`` and reports both values on failure.
    self.assertEqual(len(splits), 1)

    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)

    # One report per record, taken right after the record is produced.
    split_points_report = [
        range_tracker.split_points()
        for _ in splits[0].source.read(range_tracker)
    ]

    # There are a total of three blocks. Each block has more than 10 records.

    # When reading records of the first block, range_tracker.split_points()
    # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN).
    # ``assertEqual`` replaces the deprecated ``assertEquals`` alias.
    self.assertEqual(
        split_points_report[:10],
        [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10)

    # When reading records of the last block, range_tracker.split_points()
    # should return (2, 1).
    self.assertEqual(split_points_report[-10:], [(2, 1)] * 10)
def test_split_points(self):
    """Verify ``split_points()`` values observed while reading one split.

    A 12000-record file is written and split with ``desired_bundle_size`` set
    to infinity, which is expected to yield a single split. That split is read
    completely and the tracker's ``split_points()`` result is captured after
    each record.
    """
    file_name = self._write_data(count=12000)
    source = AvroSource(file_name)

    splits = list(source.split(desired_bundle_size=float('inf')))
    # A unittest assertion survives ``python -O`` and gives a useful message.
    self.assertEqual(len(splits), 1)
    only_split = splits[0]

    range_tracker = only_split.source.get_range_tracker(
        only_split.start_position, only_split.stop_position)

    # Captured once per record, immediately after the record is yielded.
    split_points_report = [
        range_tracker.split_points()
        for _ in only_split.source.read(range_tracker)
    ]

    # There are a total of three blocks. Each block has more than 10 records.

    # When reading records of the first block, range_tracker.split_points()
    # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN).
    # ``assertEqual`` is used because ``assertEquals`` is a deprecated alias.
    expected_first = [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10
    self.assertEqual(split_points_report[:10], expected_first)

    # When reading records of the last block, range_tracker.split_points()
    # should return (2, 1).
    expected_last = [(2, 1)] * 10
    self.assertEqual(split_points_report[-10:], expected_last)
def test_read_reantrant_with_splitting(self):
    """Run the reentrant-read check against the single split of a test file.

    The file written by ``self._write_data()`` is split with a desired bundle
    size of 100000, which is expected to produce exactly one split; that
    split's source and position range are handed to the
    ``source_test_utils`` reentrant-read assertion.
    """
    file_name = self._write_data()
    source = AvroSource(file_name)

    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1

    # Uses the snake_case helper name, matching the other source_test_utils
    # calls in this file (e.g. assert_split_at_fraction_exhaustive).
    source_test_utils.assert_reentrant_reads_succeed(
        (splits[0].source, splits[0].start_position, splits[0].stop_position))
def test_read_reantrant_with_splitting(self):
    """Check reentrant reads on the one split produced for a test file.

    Splits the file from ``self._write_data()`` with a desired bundle size of
    100000, requires exactly one resulting split, and passes its
    ``(source, start_position, stop_position)`` triple to
    ``source_test_utils.assert_reentrant_reads_succeed``.
    """
    path = self._write_data()
    avro_source = AvroSource(path)

    bundles = list(avro_source.split(desired_bundle_size=100000))
    assert len(bundles) == 1

    bundle = bundles[0]
    source_info = (bundle.source, bundle.start_position, bundle.stop_position)
    source_test_utils.assert_reentrant_reads_succeed(source_info)
def test_dynamic_work_rebalancing_exhaustive(self):
    """Run the exhaustive split-at-fraction check on a five-record file.

    ``avro.datafile.SYNC_INTERVAL`` is set to 2 while the file is written and
    checked, and the previous value is restored afterwards even if the test
    fails.
    """
    # The block size is adjusted so that the exhaustive dynamic work
    # rebalancing test completes within an acceptable amount of time.
    saved_interval = avro.datafile.SYNC_INTERVAL
    try:
        avro.datafile.SYNC_INTERVAL = 2

        path = self._write_data(count=5)
        avro_source = AvroSource(path)
        bundles = list(avro_source.split(desired_bundle_size=float('inf')))
        assert len(bundles) == 1

        source_test_utils.assert_split_at_fraction_exhaustive(
            bundles[0].source)
    finally:
        # Always undo the module-level override so other tests are unaffected.
        avro.datafile.SYNC_INTERVAL = saved_interval
def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                   expected_result):
    """Read ``pattern`` through ``AvroSource`` and verify what comes back.

    Args:
      pattern: file name or pattern passed to ``AvroSource``.
      desired_bundle_size: bundle size passed to ``source.split``; must be
        truthy when ``perform_splitting`` is set.
      perform_splitting: if true, split the source and compare the splits
        against the unsplit source; otherwise read the source directly and
        compare the records with ``expected_result``.
      expected_result: records expected from a direct (unsplit) read.

    Raises:
      ValueError: if splitting was requested but fewer than two splits were
        generated, which would make the comparison trivial.
    """
    source = AvroSource(pattern)

    if perform_splitting:
        # unittest assertion so the precondition is not stripped under -O.
        self.assertTrue(desired_bundle_size)
        splits = list(source.split(desired_bundle_size=desired_bundle_size))
        if len(splits) < 2:
            raise ValueError(
                'Test is trivial. Please adjust it so that at least '
                'two splits get generated')

        sources_info = [
            (split.source, split.start_position, split.stop_position)
            for split in splits
        ]
        # snake_case helper names, matching the other source_test_utils
        # calls in this file.
        source_test_utils.assert_sources_equal_reference_source(
            (source, None, None), sources_info)
    else:
        read_records = source_test_utils.read_from_source(source, None, None)
        self.assertItemsEqual(expected_result, read_records)
def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                   expected_result):
    """Exercise ``AvroSource(pattern)`` either directly or via its splits.

    Without splitting, the source is read in full and the records are compared
    (order-insensitively, via ``assertItemsEqual``) with ``expected_result``.
    With splitting, the source is split using ``desired_bundle_size`` and the
    splits are checked against the unsplit source by
    ``source_test_utils.assert_sources_equal_reference_source``.

    Raises:
      ValueError: if splitting yields fewer than two splits.
    """
    source = AvroSource(pattern)

    # Direct-read path: handled first so the splitting logic stays unnested.
    if not perform_splitting:
        records = source_test_utils.read_from_source(source, None, None)
        self.assertItemsEqual(expected_result, records)
        return

    assert desired_bundle_size
    bundles = list(source.split(desired_bundle_size=desired_bundle_size))
    if len(bundles) < 2:
        raise ValueError('Test is trivial. Please adjust it so that at least '
                         'two splits get generated')

    reference_info = (source, None, None)
    sources_info = [
        (bundle.source, bundle.start_position, bundle.stop_position)
        for bundle in bundles
    ]
    source_test_utils.assert_sources_equal_reference_source(
        reference_info, sources_info)