def test_try_split(self):
    """Split the remainder in half after claiming the first offset.

    Verifies the primary/residual ranges returned by ``try_split(0.5)`` and
    that the tracker's current restriction equals the returned primary.
    """
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    tracker.try_claim(100)

    primary, remainder = tracker.try_split(0.5)

    self.assertEqual(OffsetRange(100, 150), primary)
    self.assertEqual(OffsetRange(150, 200), remainder)
    self.assertEqual(primary, tracker.current_restriction())
def test_create(self):
    """Construct valid ranges and check that start > stop raises ValueError."""
    # These constructions must simply succeed (including the empty range).
    for start, stop in ((0, 10), (10, 10), (10, 100)):
        OffsetRange(start, stop)

    with self.assertRaises(ValueError):
        OffsetRange(10, 9)
def test_split_no_small_split_at_end(self):
    """Splitting [10, 90) by 25 yields three ranges, the last being [60, 90)."""
    offset_range = OffsetRange(10, 90)
    pieces = list(offset_range.split(desired_num_offsets_per_split=25))

    self.assertEqual(3, len(pieces))
    expected_pieces = (
        OffsetRange(10, 35),
        OffsetRange(35, 60),
        OffsetRange(60, 90),
    )
    for expected in expected_pieces:
        self.assertIn(expected, pieces)
def test_checkpoint_regular(self):
    """Checkpoint (``try_split(0)``) after two successful claims.

    Expects the tracker to keep [100, 111) and the residual to be [111, 200).
    """
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    for position in (105, 110):
        self.assertTrue(tracker.try_claim(position))

    _unused_primary, residual = tracker.try_split(0)

    self.assertEqual(OffsetRange(100, 111), tracker.current_restriction())
    self.assertEqual(OffsetRange(111, 200), residual)
def test_try_claim(self):
    """Claims inside [100, 200) succeed; claiming 200 itself fails."""
    full_range = OffsetRange(100, 200)
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    self.assertEqual(full_range, tracker.current_restriction())

    for position in (100, 150, 199):
        self.assertTrue(tracker.try_claim(position))
    self.assertFalse(tracker.try_claim(200))
def testSyntheticStepSplitProviderUnevenChunks(self):
    """Run ``verify_random_splits`` over several restrictions.

    The provider is built with 4 bundles; each case pairs a restriction with
    the value passed as the last argument of ``verify_random_splits``.
    """
    bundles = 4
    provider = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        5, bundles, True, False, None)

    cases = (
        (OffsetRange(4, 10), bundles),
        (OffsetRange(4, 4), 0),
        (OffsetRange(0, 1), 1),
        (OffsetRange(0, bundles - 2), bundles),
    )
    for restriction, bundle_arg in cases:
        self.verify_random_splits(provider, restriction, bundle_arg)
def test_split_respects_min_num_splits(self):
    """A minimum of 25 offsets per split overrides a desired size of 5."""
    offset_range = OffsetRange(10, 100)
    pieces = list(
        offset_range.split(
            desired_num_offsets_per_split=5, min_num_offsets_per_split=25))

    self.assertEqual(3, len(pieces))
    expected_pieces = (
        OffsetRange(10, 35),
        OffsetRange(35, 60),
        OffsetRange(60, 100),
    )
    for expected in expected_pieces:
        self.assertIn(expected, pieces)
def test_checkpoint_claimed_last(self):
    """Checkpointing after claiming the last offset (199) yields no split.

    The restriction stays [100, 200) and ``try_split(0)`` returns None.
    """
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    for position in (105, 110, 199):
        self.assertTrue(tracker.try_claim(position))

    split_result = tracker.try_split(0)

    self.assertEqual(OffsetRange(100, 200), tracker.current_restriction())
    self.assertEqual(None, split_result)
def test_checkpoint_after_failed_claim(self):
    """Checkpointing after a failed claim must not split the restriction.

    After three successful claims and one failed claim (240 is outside
    [100, 200)), ``try_split(0)`` is expected to return None and the
    tracker's restriction is expected to remain [100, 200).
    """
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    self.assertTrue(tracker.try_claim(105))
    self.assertTrue(tracker.try_claim(110))
    self.assertTrue(tracker.try_claim(160))
    self.assertFalse(tracker.try_claim(240))
    self.assertIsNone(tracker.try_split(0))
    # assertEqual, not assertTrue: with assertTrue the second argument is
    # only the failure message, so the restriction would never be compared.
    self.assertEqual(OffsetRange(100, 200), tracker.current_restriction())
def test_self_checkpoint_immediately(self):
    """Deferring before any claim defers the whole range with zero delay."""
    inner_tracker = OffsetRestrictionTracker(OffsetRange(0, 10))
    threadsafe_tracker = ThreadsafeRestrictionTracker(inner_tracker)

    threadsafe_tracker.defer_remainder()
    residual, delay = threadsafe_tracker.deferred_status()

    self.assertEqual(residual, OffsetRange(0, 10))
    self.assertIsInstance(delay, timestamp.Duration)
    self.assertEqual(delay, 0)
def create_split_across_windows(self, primary_windows, residual_windows):
    """Build the (primary, residual) pair for a split between windows.

    Both sides carry the full ``OffsetRange(0, 100)`` restriction; only the
    windows differ.

    Args:
        primary_windows: windows for the primary; falsy means no primary.
        residual_windows: windows for the residual; falsy means no residual.

    Returns:
        A ``(primary, residual)`` tuple where either entry may be None.
    """
    primary = None
    if primary_windows:
        primary_element = (
            ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
            100,
        )
        primary = SplitResultPrimary(
            primary_value=WindowedValue(primary_element, 57, primary_windows))

    residual = None
    if residual_windows:
        residual_element = (
            ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
            100,
        )
        residual = SplitResultResidual(
            residual_value=WindowedValue(
                residual_element, 57, residual_windows),
            current_watermark=None,
            deferred_timestamp=None)

    return primary, residual
def test_api_expose(self):
    """The tracker view exposes current_restriction, try_claim and
    defer_remainder, and deferral is visible on the wrapped tracker."""
    wrapped = OffsetRestrictionTracker(OffsetRange(0, 10))
    threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(wrapped)
    tracker_view = iobase.RestrictionTrackerView(threadsafe_tracker)

    self.assertEqual(tracker_view.current_restriction(), OffsetRange(0, 10))
    self.assertTrue(tracker_view.try_claim(0))

    tracker_view.defer_remainder()
    status = threadsafe_tracker.deferred_status()
    deferred_remainder, deferred_watermark = status

    self.assertEqual(deferred_remainder, OffsetRange(1, 10))
    self.assertEqual(deferred_watermark, timestamp.Duration())
def create_split_in_window(self, offset_index, windows):
    """Build the (primary, residual) pair for a split inside one window set.

    The primary covers ``OffsetRange(0, offset_index)`` and the residual
    covers ``OffsetRange(offset_index, 100)``; both use the same windows.

    Args:
        offset_index: offset at which the restriction is divided.
        windows: windows attached to both windowed values.

    Returns:
        A ``(SplitResultPrimary, SplitResultResidual)`` tuple.
    """
    primary_element = (
        ('a', (OffsetRange(0, offset_index), self.watermark_estimator_state)),
        offset_index,
    )
    primary = SplitResultPrimary(
        primary_value=WindowedValue(primary_element, 57, windows))

    # The residual takes its state and watermark from the live estimator.
    residual_element = (
        ('a',
         (OffsetRange(offset_index, 100),
          self.watermark_estimator.get_estimator_state())),
        100 - offset_index,
    )
    residual_value = WindowedValue(residual_element, 57, windows)
    residual = SplitResultResidual(
        residual_value=residual_value,
        current_watermark=self.watermark_estimator.current_watermark(),
        deferred_timestamp=None)

    return (primary, residual)
def test_synthetic_step_split_provider_no_liquid_sharding(self):
    """Compare ``try_split`` for two providers differing in the fourth
    constructor argument: one returns a split, the other returns None."""
    # Verify Liquid Sharding Works
    splitting_provider = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        5, 5, True, False, None)
    splitting_tracker = splitting_provider.create_tracker(OffsetRange(1, 6))
    splitting_tracker.try_claim(2)
    expected_split = (OffsetRange(1, 4), OffsetRange(4, 6))
    self.assertEqual(splitting_tracker.try_split(.5), expected_split)

    # Verify No Liquid Sharding
    fixed_provider = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        5, 5, True, True, None)
    fixed_tracker = fixed_provider.create_tracker(OffsetRange(1, 6))
    fixed_tracker.try_claim(2)
    self.assertEqual(fixed_tracker.try_split(3), None)
def test_check_done_when_not_done(self):
    """``check_done`` raises ValueError when only 150 and 175 were claimed."""
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    for position in (150, 175):
        self.assertTrue(tracker.try_claim(position))

    with self.assertRaises(ValueError):
        tracker.check_done()
def process(self, element, *args, **kwargs):
    """Yield ``(metadata, OffsetRange)`` pairs for files matching ``element``.

    Splittable files are divided using the configured desired and minimum
    bundle sizes; other files get one range ending at ``OFFSET_INFINITY``.

    Args:
        element: pattern passed to ``FileSystems.match``.

    Yields:
        Tuples of file metadata and the offset range to read.
    """
    first_result = FileSystems.match([element])[0]
    for metadata in first_result.metadata_list:
        can_split = (
            self._splittable and
            _determine_splittability_from_compression_type(
                metadata.path, self._compression_type))
        if not can_split:
            yield (
                metadata,
                OffsetRange(
                    0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
            continue

        whole_file = OffsetRange(0, metadata.size_in_bytes)
        for piece in whole_file.split(self._desired_bundle_size,
                                      self._min_bundle_size):
            yield (metadata, piece)
def split(self, element, restriction):
    """Divide ``restriction`` into a list of ``OffsetRange`` bundles.

    With ``element['initial_splitting'] == 'zipf'`` the bundle sizes are
    proportional to samples drawn from ``np.random.zipf``; otherwise the
    bundles have a uniform size (the last one may be shorter).

    Args:
        element: dict describing the synthetic source (sizes, record count
            and initial-splitting settings).
        restriction: range with ``start`` and ``stop`` attributes.

    Returns:
        A list of ``OffsetRange`` objects.
    """
    ranges = []
    first_offset = restriction.start
    last_offset = restriction.stop
    record_size = element['key_size'] + element['value_size']
    total_size = record_size * element['num_records']

    if element['initial_splitting'] == 'zipf':
        # An explicit bundle count wins; otherwise derive it from the sizes.
        num_bundles = (
            element['initial_splitting_num_bundles'] or div_round_up(
                total_size, element['initial_splitting_desired_bundle_size']))
        samples = np.random.zipf(
            element['initial_splitting_distribution_parameter'], num_bundles)
        total = sum(samples)
        fractions = [(float(sample) / total) for sample in samples]

        cursor = first_offset
        bundle_index = 0
        while cursor < last_offset:
            if bundle_index == num_bundles - 1:
                # The final bundle takes whatever remains.
                ranges.append(OffsetRange(cursor, last_offset))
                break
            next_cursor = cursor + int(
                element['num_records'] * fractions[bundle_index])
            ranges.append(OffsetRange(cursor, next_cursor))
            cursor = next_cursor
            bundle_index += 1
    else:
        if element['initial_splitting_num_bundles']:
            per_bundle = int(
                element['num_records'] /
                element['initial_splitting_num_bundles'])
            step = max(1, per_bundle)
        else:
            by_size = div_round_up(
                element['initial_splitting_desired_bundle_size'], record_size)
            by_sqrt = int(math.floor(math.sqrt(element['num_records'])))
            step = max(by_size, by_sqrt)
        ranges.extend(
            OffsetRange(low, min(low + step, last_offset))
            for low in range(first_offset, last_offset, step))

    return ranges
def test_self_checkpoint_with_relative_time(self):
    """A relative deferral of 100 has shrunk to at most 98 after 2 seconds."""
    wrapped = OffsetRestrictionTracker(OffsetRange(0, 10))
    threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(wrapped)

    threadsafe_tracker.defer_remainder(timestamp.Duration(100))
    time.sleep(2)
    _, remaining = threadsafe_tracker.deferred_status()

    self.assertIsInstance(remaining, timestamp.Duration)
    # The expectation = 100 - 2 - some_delta
    self.assertLessEqual(remaining, 98)
def test_split_respects_desired_num_splits(self):
    """Splitting [10, 100) by 25 yields four ranges ending with [85, 100)."""
    offset_range = OffsetRange(10, 100)
    pieces = list(offset_range.split(desired_num_offsets_per_split=25))

    self.assertEqual(4, len(pieces))
    expected_pieces = (
        OffsetRange(10, 35),
        OffsetRange(35, 60),
        OffsetRange(60, 85),
        OffsetRange(85, 100),
    )
    for expected in expected_pieces:
        self.assertIn(expected, pieces)
def setUp(self):
    """Create the shared fixtures: three consecutive windows, a windowed
    value spanning them, a restriction, and watermark estimator objects."""
    first = IntervalWindow(0, 10)
    second = IntervalWindow(10, 20)
    third = IntervalWindow(20, 30)
    self.window1, self.window2, self.window3 = first, second, third

    self.windowed_value = WindowedValue('a', 57, (first, second, third))
    self.restriction = OffsetRange(0, 100)
    self.watermark_estimator_state = Timestamp(21)
    self.restriction_provider = TestOffsetRestrictionProvider()
    self.watermark_estimator = ManualWatermarkEstimator(Timestamp(42))
    # Show full diffs when an assertion on large values fails.
    self.maxDiff = None
def initial_restriction(self, element):
    """Return ``OffsetRange(0, n)`` where n is the number of outputs.

    Args:
        element: a ``(start, end, interval)`` triple. ``Timestamp`` bounds
            are replaced by ``micros / 1000000`` before use.

    Returns:
        ``OffsetRange(0, ceil((end - start) / interval))``.
    """

    def _plain_number(bound):
        # Timestamps become a number via their microsecond count.
        if isinstance(bound, Timestamp):
            return bound.micros / 1000000
        return bound

    begin, finish, period = element
    begin = _plain_number(begin)
    finish = _plain_number(finish)

    assert begin <= finish
    assert period > 0
    return OffsetRange(0, math.ceil((finish - begin) / period))
def test_non_expose_apis(self):
    """Calling tracker methods the view does not expose raises
    AttributeError."""
    wrapped = OffsetRestrictionTracker(OffsetRange(0, 10))
    threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(wrapped)
    tracker_view = iobase.RestrictionTrackerView(threadsafe_tracker)

    hidden_methods = (
        'check_done',
        'current_progress',
        'try_split',
        'deferred_status',
    )
    for method_name in hidden_methods:
        with self.assertRaises(AttributeError):
            getattr(tracker_view, method_name)()
def test_self_checkpoint_with_absolute_time(self):
    """Deferring to an absolute Timestamp reports a Duration that has
    shrunk to at most 98 after sleeping 2 seconds."""
    wrapped = OffsetRestrictionTracker(OffsetRange(0, 10))
    threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(wrapped)

    schedule_time = timestamp.Timestamp.now() + timestamp.Duration(100)
    self.assertIsInstance(schedule_time, timestamp.Timestamp)

    threadsafe_tracker.defer_remainder(schedule_time)
    time.sleep(2)
    _, remaining = threadsafe_tracker.deferred_status()

    self.assertIsInstance(remaining, timestamp.Duration)
    # The expectation =
    # schedule_time - the time when deferred_status is called - some_delta
    self.assertLessEqual(remaining, 98)
def split(self, element, restriction):
    """Yield the initial sub-ranges of ``restriction``.

    Uneven chunks from ``initial_splitting_zipf`` are used when enabled and
    both the bundle count and the restriction size exceed 1. Otherwise the
    restriction is split evenly with ``restriction.split``.

    Args:
        element: unused here.
        restriction: an ``OffsetRange``-like object (``size``, ``start``,
            ``stop`` and ``split`` are used).

    Yields:
        The sub-ranges.
    """
    size = restriction.size()
    uneven = (
        self._initial_splitting_uneven_chunks and
        self._initial_splitting_num_bundles > 1 and size > 1)

    if uneven:
        boundaries = initial_splitting_zipf(
            restriction.start,
            restriction.stop,
            self._initial_splitting_num_bundles,
            3.0)
        for low, high in boundaries:
            yield OffsetRange(low, high)
    else:
        per_split = max(1, (size // self._initial_splitting_num_bundles))
        for piece in restriction.split(per_split, per_split // 2):
            yield piece
def split(self, element, restriction):
    """Return the initial splits of a ``(start, stop)`` restriction.

    Uneven chunks from ``initial_splitting_zipf`` are returned when enabled
    and both the bundle count and the restriction size exceed 1. Otherwise
    the range is split evenly and returned as ``(start, stop)`` tuples.

    Args:
        element: unused here.
        restriction: indexable pair ``(start, stop)``.

    Returns:
        The result of ``initial_splitting_zipf``, or a list of
        ``(start, stop)`` tuples.
    """
    low, high = restriction[0], restriction[1]
    size = high - low
    uneven = (
        self._initial_splitting_uneven_chunks and
        self._initial_splitting_num_bundles > 1 and size > 1)
    if uneven:
        return initial_splitting_zipf(
            low, high, self._initial_splitting_num_bundles, 3.0)

    per_split = max(1, (size // self._initial_splitting_num_bundles))
    pieces = list(OffsetRange(low, high).split(per_split, per_split // 2))
    return [(piece.start, piece.stop) for piece in pieces]
def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
    """Yield ``iobase.SourceBundle`` objects covering this file.

    Args:
        desired_bundle_size: target size passed to ``OffsetRange.split``.
        start_offset: first offset; defaults to this source's start offset.
        stop_offset: end offset; defaults to this source's stop offset.

    Yields:
        One bundle per split when splittable, otherwise a single bundle
        whose stop position is ``OFFSET_INFINITY``.
    """
    if start_offset is None:
        start_offset = self._start_offset
    if stop_offset is None:
        stop_offset = self._stop_offset

    if not self._splittable:
        # Returning a single sub-source with end offset set to OFFSET_INFINITY
        # (so that all data of the source gets read) since this source is
        # unsplittable. Choosing size of the file as end offset will be wrong
        # for certain unsplittable source, e.g., compressed sources.
        weight = stop_offset - start_offset
        whole_source = _SingleFileSource(
            self._file_based_source,
            self._file_name,
            start_offset,
            range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
            min_bundle_size=self._min_bundle_size,
            splittable=self._splittable)
        yield iobase.SourceBundle(
            weight,
            whole_source,
            start_offset,
            range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
        return

    pieces = OffsetRange(start_offset, stop_offset).split(
        desired_bundle_size, self._min_bundle_size)
    for piece in pieces:
        weight = piece.stop - piece.start
        # Copying this so that each sub-source gets a fresh instance.
        source_copy = pickler.loads(pickler.dumps(self._file_based_source))
        sub_source = _SingleFileSource(
            source_copy,
            self._file_name,
            piece.start,
            piece.stop,
            min_bundle_size=self._min_bundle_size,
            splittable=self._splittable)
        yield iobase.SourceBundle(weight, sub_source, piece.start, piece.stop)
def test_synthetic_step_split_provider(self):
    """Check split, restriction_size, initial_restriction and tracker
    splitting for several provider configurations."""

    def splits_of(provider, restriction):
        return list(provider.split('ab', restriction))

    two_bundles = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        5, 2, False, False, None)
    self.assertEqual(
        splits_of(two_bundles, OffsetRange(2, 15)),
        [OffsetRange(2, 8), OffsetRange(8, 15)])
    self.assertEqual(
        splits_of(two_bundles, OffsetRange(0, 8)),
        [OffsetRange(0, 4), OffsetRange(4, 8)])
    self.assertEqual(splits_of(two_bundles, OffsetRange(0, 0)), [])
    self.assertEqual(
        splits_of(two_bundles, OffsetRange(2, 3)), [OffsetRange(2, 3)])

    one_bundle = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        10, 1, False, False, None)
    self.assertEqual(
        splits_of(one_bundle, OffsetRange(1, 10)), [OffsetRange(1, 10)])
    self.assertEqual(
        one_bundle.restriction_size('ab', OffsetRange(1, 10)), 9 * 2)

    three_bundles = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        10, 3, False, False, None)
    self.assertEqual(
        splits_of(three_bundles, OffsetRange(1, 10)),
        [OffsetRange(1, 4), OffsetRange(4, 7), OffsetRange(7, 10)])
    self.assertEqual(
        three_bundles.initial_restriction('a'), OffsetRange(0, 10))

    sized = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
        10, 3, False, False, 45)
    self.assertEqual(sized.restriction_size('ab', OffsetRange(1, 3)), 45)

    tracker = sized.create_tracker(OffsetRange(1, 6))
    tracker.try_claim(1)  # Claim to allow splitting.
    self.assertEqual(
        tracker.try_split(.5), (OffsetRange(1, 3), OffsetRange(3, 6)))
def test_initialization(self):
    """Building a view directly on an OffsetRestrictionTracker raises
    ValueError."""
    with self.assertRaises(ValueError):
        # The tracker is passed unwrapped (no ThreadsafeRestrictionTracker).
        unwrapped = OffsetRestrictionTracker(OffsetRange(0, 10))
        iobase.RestrictionTrackerView(unwrapped)
def initial_restriction(self, element):
    """Return the range ``[0, element['num_records'])`` for ``element``."""
    record_count = element['num_records']
    return OffsetRange(0, record_count)
def initial_restriction(self, element):
    """Return an ``OffsetRange`` from 0 to the size of the file at the path
    ``element``, as reported by ``os.path.getsize``."""
    return OffsetRange(0, os.path.getsize(element))
def initial_restriction(self, element):
    """Return an ``OffsetRange`` from 0 to the length of ``element[0]``."""
    item_count = len(element[0])
    return OffsetRange(0, item_count)