def test_try_split(self):
   """Split [100, 200) at fraction 0.5 after claiming offset 100.

   Expects the primary part to be OffsetRange(100, 150), the residual to be
   OffsetRange(150, 200), and the tracker's current restriction to equal the
   returned primary.
   """
   tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
   tracker.try_claim(100)
   primary, remainder = tracker.try_split(0.5)
   expected_pairs = (
       (OffsetRange(100, 150), primary),
       (OffsetRange(150, 200), remainder),
   )
   for expected, actual in expected_pairs:
      self.assertEqual(expected, actual)
   # The tracker itself must now report the primary part.
   self.assertEqual(primary, tracker.current_restriction())
  def test_create(self):
    """Check which (start, stop) pairs OffsetRange accepts.

    (0, 10), (10, 10) and (10, 100) must construct without error, while
    (10, 9) must raise ValueError.
    """
    for start, stop in ((0, 10), (10, 10), (10, 100)):
      OffsetRange(start, stop)

    self.assertRaises(ValueError, OffsetRange, 10, 9)
 def test_split_no_small_split_at_end(self):
   """Splitting [10, 90) with 25 offsets per split must give three ranges.

   The expected pieces are [10, 35), [35, 60) and [60, 90): the last piece is
   30 offsets wide rather than being followed by a separate 5-offset piece.
   """
   offset_range = OffsetRange(10, 90)
   pieces = list(offset_range.split(desired_num_offsets_per_split=25))
   self.assertEqual(3, len(pieces))
   for bounds in ((10, 35), (35, 60), (60, 90)):
     self.assertIn(OffsetRange(*bounds), pieces)
 def test_checkpoint_regular(self):
   """Checkpoint (try_split(0)) after successfully claiming 105 and 110.

   Expects the tracker to keep OffsetRange(100, 111) and the second element
   of the split result to be OffsetRange(111, 200).
   """
   tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
   for offset in (105, 110):
     self.assertTrue(tracker.try_claim(offset))
   _primary, residual = tracker.try_split(0)
   self.assertEqual(OffsetRange(100, 111), tracker.current_restriction())
   self.assertEqual(OffsetRange(111, 200), residual)
 def test_try_claim(self):
   """Claims of 100, 150 and 199 in [100, 200) succeed; a claim of 200 fails."""
   full_range = OffsetRange(100, 200)
   tracker = OffsetRestrictionTracker(full_range)
   self.assertEqual(OffsetRange(100, 200), tracker.current_restriction())
   for offset in (100, 150, 199):
     self.assertTrue(tracker.try_claim(offset))
   # The stop offset itself must be rejected.
   self.assertFalse(tracker.try_claim(200))
Example #6
0
 def testSyntheticStepSplitProviderUnevenChunks(self):
   """Run verify_random_splits over several restrictions for one provider.

   The provider is built with arguments (5, 4, True, False, None). Each case
   pairs a restriction with the value passed as the third argument of
   self.verify_random_splits.
   """
   bundles = 4
   provider = synthetic_pipeline.SyntheticSDFStepRestrictionProvider(
       5, bundles, True, False, None)
   cases = (
       (OffsetRange(4, 10), bundles),
       (OffsetRange(4, 4), 0),
       (OffsetRange(0, 1), 1),
       (OffsetRange(0, bundles - 2), bundles),
   )
   for restriction, third_arg in cases:
     self.verify_random_splits(provider, restriction, third_arg)
 def test_split_respects_min_num_splits(self):
   """Split [10, 100) with desired size 5 but minimum size 25.

   Expects exactly three pieces: [10, 35), [35, 60) and [60, 100).
   """
   offset_range = OffsetRange(10, 100)
   pieces = list(
       offset_range.split(
           desired_num_offsets_per_split=5, min_num_offsets_per_split=25))
   self.assertEqual(3, len(pieces))
   for bounds in ((10, 35), (35, 60), (60, 100)):
     self.assertIn(OffsetRange(*bounds), pieces)
 def test_checkpoint_claimed_last(self):
   """Checkpoint after claiming 105, 110 and finally 199 in [100, 200).

   Expects try_split(0) to return None and the tracker's restriction to stay
   OffsetRange(100, 200).
   """
   tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
   for offset in (105, 110, 199):
     self.assertTrue(tracker.try_claim(offset))
   split_result = tracker.try_split(0)
   self.assertEqual(OffsetRange(100, 200), tracker.current_restriction())
   self.assertEqual(None, split_result)
  def test_checkpoint_after_failed_claim(self):
    """Checkpoint after a claim beyond the end of the restriction has failed.

    Claims 105, 110 and 160 in [100, 200) succeed and a claim of 240 fails.
    After that, try_split(0) must return None and the tracker's restriction
    must still equal OffsetRange(100, 200).
    """
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    self.assertTrue(tracker.try_claim(105))
    self.assertTrue(tracker.try_claim(110))
    self.assertTrue(tracker.try_claim(160))
    self.assertFalse(tracker.try_claim(240))

    self.assertIsNone(tracker.try_split(0))
    # assertEqual, not assertTrue: assertTrue(a, b) uses b only as the failure
    # message and checks just the truthiness of a, so the restriction was
    # never actually compared.
    self.assertEqual(OffsetRange(100, 200), tracker.current_restriction())
Example #10
0
 def test_self_checkpoint_immediately(self):
   """Defer the remainder with no arguments before anything is claimed.

   Expects deferred_status() to report the whole OffsetRange(0, 10) as the
   residual and a timestamp.Duration equal to 0 as the deferred time.
   """
   threadsafe_tracker = ThreadsafeRestrictionTracker(
       OffsetRestrictionTracker(OffsetRange(0, 10)))
   threadsafe_tracker.defer_remainder()
   residual, delay = threadsafe_tracker.deferred_status()
   self.assertEqual(residual, OffsetRange(0, 10))
   self.assertIsInstance(delay, timestamp.Duration)
   self.assertEqual(delay, 0)
Example #11
0
 def create_split_across_windows(self, primary_windows, residual_windows):
     """Build the expected (primary, residual) pair for a window-level split.

     Both sides wrap the same payload -- key 'a', OffsetRange(0, 100) paired
     with self.watermark_estimator_state, and the number 100 -- in a
     WindowedValue whose second argument is 57 and whose windows are the ones
     given. A side is None when its windows argument is falsy.

     Returns:
       A tuple (SplitResultPrimary or None, SplitResultResidual or None).
     """
     def _windowed(windows):
         # Same payload for both sides; only the windows differ.
         payload = (
             ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
             100)
         return WindowedValue(payload, 57, windows)

     primary = None
     if primary_windows:
         primary = SplitResultPrimary(primary_value=_windowed(primary_windows))

     residual = None
     if residual_windows:
         residual = SplitResultResidual(
             residual_value=_windowed(residual_windows),
             current_watermark=None,
             deferred_timestamp=None)

     return primary, residual
Example #12
0
 def test_api_expose(self):
     """Exercise the methods a RestrictionTrackerView is expected to forward.

     Through the view: current_restriction() returns OffsetRange(0, 10),
     try_claim(0) succeeds and defer_remainder() can be called. The wrapped
     thread-safe tracker must then report OffsetRange(1, 10) as the deferred
     remainder together with a default timestamp.Duration().
     """
     inner_tracker = OffsetRestrictionTracker(OffsetRange(0, 10))
     threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(inner_tracker)
     view = iobase.RestrictionTrackerView(threadsafe_tracker)

     self.assertEqual(view.current_restriction(), OffsetRange(0, 10))
     self.assertTrue(view.try_claim(0))
     view.defer_remainder()

     remainder, watermark = threadsafe_tracker.deferred_status()
     self.assertEqual(remainder, OffsetRange(1, 10))
     self.assertEqual(watermark, timestamp.Duration())
Example #13
0
 def create_split_in_window(self, offset_index, windows):
     """Build the expected (primary, residual) pair for a split at offset_index.

     The primary covers OffsetRange(0, offset_index) with
     self.watermark_estimator_state and the number offset_index. The residual
     covers OffsetRange(offset_index, 100) with the estimator's current state
     and the number 100 - offset_index, and carries the estimator's current
     watermark. Both WindowedValues use 57 as their second argument and the
     given windows.

     Returns:
       A tuple (SplitResultPrimary, SplitResultResidual).
     """
     primary_payload = (
         ('a', (OffsetRange(0, offset_index), self.watermark_estimator_state)),
         offset_index)
     primary = SplitResultPrimary(
         primary_value=WindowedValue(primary_payload, 57, windows))

     # The estimator state is read before the current watermark, matching the
     # argument order of the SplitResultResidual call.
     residual_payload = (
         ('a', (OffsetRange(offset_index, 100),
                self.watermark_estimator.get_estimator_state())),
         100 - offset_index)
     residual_value = WindowedValue(residual_payload, 57, windows)
     residual = SplitResultResidual(
         residual_value=residual_value,
         current_watermark=self.watermark_estimator.current_watermark(),
         deferred_timestamp=None)

     return (primary, residual)
Example #14
0
    def test_synthetic_step_split_provider_no_liquid_sharding(self):
        """Compare try_split with the provider's fourth argument False vs True.

        With False, splitting OffsetRange(1, 6) at .5 after claiming 2 must
        give (OffsetRange(1, 4), OffsetRange(4, 6)). With True, try_split
        must return None.
        """
        make_provider = synthetic_pipeline.SyntheticSDFStepRestrictionProvider

        # Verify Liquid Sharding Works
        sharding_tracker = make_provider(
            5, 5, True, False, None).create_tracker(OffsetRange(1, 6))
        sharding_tracker.try_claim(2)
        split_result = sharding_tracker.try_split(.5)
        self.assertEqual(split_result,
                         (OffsetRange(1, 4), OffsetRange(4, 6)))

        # Verify No Liquid Sharding
        no_sharding_tracker = make_provider(
            5, 5, True, True, None).create_tracker(OffsetRange(1, 6))
        no_sharding_tracker.try_claim(2)
        self.assertEqual(no_sharding_tracker.try_split(3), None)
  def test_check_done_when_not_done(self):
    """check_done() must raise ValueError after claiming only 150 and 175.

    The tracker covers [100, 200), so the last successful claim (175) is
    well short of the end of the restriction.
    """
    tracker = OffsetRestrictionTracker(OffsetRange(100, 200))
    for offset in (150, 175):
      self.assertTrue(tracker.try_claim(offset))

    self.assertRaises(ValueError, tracker.check_done)
Example #16
0
    def process(self, element, *args, **kwargs):
        """Yield (metadata, OffsetRange) pairs for files matching `element`.

        `element` is passed to FileSystems.match as a single pattern. For
        each matched file:

        * if self._splittable is truthy and
          _determine_splittability_from_compression_type accepts the file's
          path and self._compression_type, one pair is yielded per piece of
          OffsetRange(0, size_in_bytes).split(desired, min);
        * otherwise a single pair is yielded whose range runs from 0 to
          OffsetRangeTracker.OFFSET_INFINITY.
        """
        first_match = FileSystems.match([element])[0]
        for file_metadata in first_match.metadata_list:
            can_split = (self._splittable
                         and _determine_splittability_from_compression_type(
                             file_metadata.path, self._compression_type))

            if not can_split:
                # One open-ended range for the whole file.
                yield (file_metadata,
                       OffsetRange(
                           0,
                           range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
                continue

            whole_file = OffsetRange(0, file_metadata.size_in_bytes)
            for piece in whole_file.split(self._desired_bundle_size,
                                          self._min_bundle_size):
                yield (file_metadata, piece)
Example #17
0
 def split(self, element, restriction):
   """Partition `restriction` into a list of OffsetRange bundles.

   `element` is a dict; the keys read here are 'key_size', 'value_size',
   'num_records', 'initial_splitting', 'initial_splitting_num_bundles',
   'initial_splitting_desired_bundle_size' and, for zipf splitting,
   'initial_splitting_distribution_parameter'.

   * 'initial_splitting' == 'zipf': bundle sizes are proportional to samples
     drawn from np.random.zipf. The last bundle (index num_bundles - 1)
     always runs to the end of the restriction; the loop also stops early
     once the running start reaches the end.
   * otherwise: fixed-size bundles. The size is num_records divided by the
     requested bundle count (at least 1) when a count is given, else the
     larger of ceil(desired_bundle_size / record_size) and
     floor(sqrt(num_records)).

   Returns:
     A list of OffsetRange objects.
   """
   range_start = restriction.start
   range_stop = restriction.stop
   record_size = element['key_size'] + element['value_size']
   total_size = record_size * element['num_records']
   ranges = []

   if element['initial_splitting'] == 'zipf':
     num_bundles = (
         element['initial_splitting_num_bundles'] or div_round_up(
             total_size, element['initial_splitting_desired_bundle_size']))
     draws = np.random.zipf(
         element['initial_splitting_distribution_parameter'], num_bundles)
     draw_total = sum(draws)
     fractions = [(float(draw) / draw_total) for draw in draws]

     cursor = range_start
     bundle_index = 0
     while cursor < range_stop:
       if bundle_index == num_bundles - 1:
         # Final bundle absorbs whatever is left of the restriction.
         ranges.append(OffsetRange(cursor, range_stop))
         break
       next_cursor = cursor + int(
           element['num_records'] * fractions[bundle_index])
       ranges.append(OffsetRange(cursor, next_cursor))
       cursor = next_cursor
       bundle_index += 1
     return ranges

   if element['initial_splitting_num_bundles']:
     records_per_bundle = max(
         1,
         int(
             element['num_records'] /
             element['initial_splitting_num_bundles']))
   else:
     size_based = div_round_up(
         element['initial_splitting_desired_bundle_size'], record_size)
     sqrt_based = int(math.floor(math.sqrt(element['num_records'])))
     records_per_bundle = max(size_based, sqrt_based)

   ranges.extend(
       OffsetRange(lo, min(lo + records_per_bundle, range_stop))
       for lo in range(range_start, range_stop, records_per_bundle))
   return ranges
Example #18
0
 def test_self_checkpoint_with_relative_time(self):
     """Defer the remainder by a relative timestamp.Duration(100).

     After sleeping 2 seconds, the deferred time reported by
     deferred_status() must be a timestamp.Duration no greater than 98.
     """
     inner_tracker = OffsetRestrictionTracker(OffsetRange(0, 10))
     threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(inner_tracker)
     threadsafe_tracker.defer_remainder(timestamp.Duration(100))
     time.sleep(2)
     _residual, remaining_delay = threadsafe_tracker.deferred_status()
     self.assertIsInstance(remaining_delay, timestamp.Duration)
     # The expectation = 100 - 2 - some_delta
     self.assertLessEqual(remaining_delay, 98)
 def test_split_respects_desired_num_splits(self):
   """Splitting [10, 100) with 25 offsets per split must give four ranges.

   The expected pieces are [10, 35), [35, 60), [60, 85) and [85, 100).
   """
   offset_range = OffsetRange(10, 100)
   pieces = list(offset_range.split(desired_num_offsets_per_split=25))
   self.assertEqual(4, len(pieces))
   for bounds in ((10, 35), (35, 60), (60, 85), (85, 100)):
     self.assertIn(OffsetRange(*bounds), pieces)
Example #20
0
 def setUp(self):
     """Create the shared fixtures used by the tests in this class.

     Sets three consecutive IntervalWindows ([0, 10), [10, 20), [20, 30)), a
     WindowedValue of 'a' spanning all three, an OffsetRange(0, 100)
     restriction, a watermark estimator state of Timestamp(21), a
     TestOffsetRestrictionProvider, and a ManualWatermarkEstimator seeded
     with Timestamp(42). maxDiff is disabled so assertion diffs are not
     truncated.
     """
     self.window1, self.window2, self.window3 = (
         IntervalWindow(0, 10),
         IntervalWindow(10, 20),
         IntervalWindow(20, 30),
     )
     all_windows = (self.window1, self.window2, self.window3)
     self.windowed_value = WindowedValue('a', 57, all_windows)
     self.restriction = OffsetRange(0, 100)
     self.watermark_estimator_state = Timestamp(21)
     self.restriction_provider = TestOffsetRestrictionProvider()
     self.watermark_estimator = ManualWatermarkEstimator(Timestamp(42))
     self.maxDiff = None
Example #21
0
    def initial_restriction(self, element):
        """Return OffsetRange(0, n) where n = ceil((end - start) / interval).

        `element` is unpacked as (start, end, interval). A start or end that
        is a Timestamp is first converted to a number via micros / 1000000.
        Asserts that start <= end and interval > 0.
        """
        def _as_number(value):
            # Timestamps expose microseconds; everything else is used as-is.
            if isinstance(value, Timestamp):
                return value.micros / 1000000
            return value

        first, last, period = element
        first = _as_number(first)
        last = _as_number(last)

        assert first <= last
        assert period > 0
        return OffsetRange(0, math.ceil((last - first) / period))
Example #22
0
 def test_non_expose_apis(self):
     """Methods the view does not expose must raise AttributeError.

     Checked, in order: check_done, current_progress, try_split and
     deferred_status, each looked up on the view and called with no
     arguments.
     """
     inner_tracker = OffsetRestrictionTracker(OffsetRange(0, 10))
     threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(inner_tracker)
     view = iobase.RestrictionTrackerView(threadsafe_tracker)
     hidden_methods = (
         'check_done', 'current_progress', 'try_split', 'deferred_status')
     for method_name in hidden_methods:
         with self.assertRaises(AttributeError):
             getattr(view, method_name)()
 def test_split_no_small_split_at_end(self):
   """Split [10, 90) into pieces of 25 offsets and expect exactly three.

   The pieces must be [10, 35), [35, 60) and [60, 90); the final piece spans
   30 offsets instead of leaving a separate 5-offset piece at the end.
   """
   offset_range = OffsetRange(10, 90)
   pieces = list(offset_range.split(desired_num_offsets_per_split=25))
   expected_pieces = [
       OffsetRange(10, 35),
       OffsetRange(35, 60),
       OffsetRange(60, 90),
   ]
   self.assertEqual(3, len(pieces))
   for expected in expected_pieces:
     self.assertIn(expected, pieces)
Example #24
0
 def test_self_checkpoint_with_absolute_time(self):
     """Defer the remainder until an absolute Timestamp 100 from now.

     After sleeping 2 seconds, the deferred time reported by
     deferred_status() must be a timestamp.Duration no greater than 98.
     """
     inner_tracker = OffsetRestrictionTracker(OffsetRange(0, 10))
     threadsafe_tracker = iobase.ThreadsafeRestrictionTracker(inner_tracker)
     resume_at = timestamp.Timestamp.now() + timestamp.Duration(100)
     self.assertIsInstance(resume_at, timestamp.Timestamp)
     threadsafe_tracker.defer_remainder(resume_at)
     time.sleep(2)
     _residual, remaining_delay = threadsafe_tracker.deferred_status()
     self.assertIsInstance(remaining_delay, timestamp.Duration)
     # The expectation =
     # schedule_time - the time when deferred_status is called - some_delta
     self.assertLessEqual(remaining_delay, 98)
Example #25
0
  def split(self, element, restriction):
    """Yield the initial sub-restrictions of `restriction`.

    When uneven chunks are enabled, more than one bundle is requested and the
    restriction holds more than one element, the (start, stop) pairs returned
    by initial_splitting_zipf (called with parameter 3.0) are yielded as
    OffsetRange objects. Otherwise the restriction is split evenly, aiming
    for size // num_bundles offsets per piece (at least 1) with half of that
    as the minimum piece size.
    """
    num_elems = restriction.size()
    use_uneven_chunks = (
        self._initial_splitting_uneven_chunks and
        self._initial_splitting_num_bundles > 1 and num_elems > 1)

    if not use_uneven_chunks:
      per_split = max(1, (num_elems // self._initial_splitting_num_bundles))
      for piece in restriction.split(per_split, per_split // 2):
        yield piece
      return

    zipf_bounds = initial_splitting_zipf(
        restriction.start, restriction.stop,
        self._initial_splitting_num_bundles, 3.0)
    for lo, hi in zipf_bounds:
      yield OffsetRange(lo, hi)
Example #26
0
 def split(self, element, restriction):
     """Return the initial sub-restrictions of a (start, stop) restriction.

     When uneven chunks are enabled, more than one bundle is requested and
     the restriction spans more than one element, the result of
     initial_splitting_zipf (called with parameter 3.0) is returned as-is.
     Otherwise the range is split evenly via OffsetRange.split and returned
     as a list of (start, stop) tuples.
     """
     lower = restriction[0]
     upper = restriction[1]
     span = upper - lower
     wants_uneven = (self._initial_splitting_uneven_chunks
                     and self._initial_splitting_num_bundles > 1
                     and span > 1)
     if wants_uneven:
         return initial_splitting_zipf(lower, upper,
                                       self._initial_splitting_num_bundles,
                                       3.0)

     per_split = max(1, (span // self._initial_splitting_num_bundles))
     pieces = OffsetRange(lower, upper).split(per_split, per_split // 2)
     return [(piece.start, piece.stop) for piece in pieces]
  def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
    """Yield iobase.SourceBundle objects covering [start_offset, stop_offset).

    Args:
      desired_bundle_size: target size passed to OffsetRange.split.
      start_offset: start of the range; defaults to self._start_offset.
      stop_offset: end of the range; defaults to self._stop_offset.

    A splittable source yields one bundle per piece of the offset range, each
    wrapping a _SingleFileSource built around a pickled-and-unpickled copy of
    the underlying file-based source. An unsplittable source yields a single
    bundle whose end offset is OFFSET_INFINITY.
    """
    first = self._start_offset if start_offset is None else start_offset
    last = self._stop_offset if stop_offset is None else stop_offset

    if not self._splittable:
      # Returning a single sub-source with end offset set to OFFSET_INFINITY (so
      # that all data of the source gets read) since this source is
      # unsplittable. Choosing size of the file as end offset will be wrong for
      # certain unsplittable source, e.g., compressed sources.
      weight = last - first
      whole_source = _SingleFileSource(
          self._file_based_source,
          self._file_name,
          first,
          range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
          min_bundle_size=self._min_bundle_size,
          splittable=self._splittable)
      yield iobase.SourceBundle(
          weight,
          whole_source,
          first,
          range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
      return

    pieces = OffsetRange(first, last).split(
        desired_bundle_size, self._min_bundle_size)
    for piece in pieces:
      weight = piece.stop - piece.start
      # Copying this so that each sub-source gets a fresh instance.
      source_copy = pickler.loads(pickler.dumps(self._file_based_source))
      sub_source = _SingleFileSource(
          source_copy,
          self._file_name,
          piece.start,
          piece.stop,
          min_bundle_size=self._min_bundle_size,
          splittable=self._splittable)
      yield iobase.SourceBundle(weight, sub_source, piece.start, piece.stop)
Example #28
0
    def test_synthetic_step_split_provider(self):
        """Check split, restriction_size, initial_restriction and try_split.

        Covers providers built with (5, 2), (10, 1) and (10, 3) as their
        first two arguments, plus one whose last argument is 45, for which
        restriction_size must return 45.
        """
        make_provider = synthetic_pipeline.SyntheticSDFStepRestrictionProvider

        def split_all(provider, restriction):
            # Materialize the provider's splits for element 'ab'.
            return list(provider.split('ab', restriction))

        provider = make_provider(5, 2, False, False, None)
        self.assertEqual(
            split_all(provider, OffsetRange(2, 15)),
            [OffsetRange(2, 8), OffsetRange(8, 15)])
        self.assertEqual(
            split_all(provider, OffsetRange(0, 8)),
            [OffsetRange(0, 4), OffsetRange(4, 8)])
        self.assertEqual(split_all(provider, OffsetRange(0, 0)), [])
        self.assertEqual(
            split_all(provider, OffsetRange(2, 3)), [OffsetRange(2, 3)])

        provider = make_provider(10, 1, False, False, None)
        self.assertEqual(
            split_all(provider, OffsetRange(1, 10)), [OffsetRange(1, 10)])
        self.assertEqual(
            provider.restriction_size('ab', OffsetRange(1, 10)), 9 * 2)

        provider = make_provider(10, 3, False, False, None)
        self.assertEqual(
            split_all(provider, OffsetRange(1, 10)),
            [OffsetRange(1, 4), OffsetRange(4, 7), OffsetRange(7, 10)])
        self.assertEqual(
            provider.initial_restriction('a'), OffsetRange(0, 10))

        provider = make_provider(10, 3, False, False, 45)
        self.assertEqual(
            provider.restriction_size('ab', OffsetRange(1, 3)), 45)

        tracker = provider.create_tracker(OffsetRange(1, 6))
        tracker.try_claim(1)  # Claim to allow splitting.
        split_result = tracker.try_split(.5)
        self.assertEqual(split_result,
                         (OffsetRange(1, 3), OffsetRange(3, 6)))
Example #29
0
 def test_initialization(self):
     """Wrapping a bare OffsetRestrictionTracker in a view raises ValueError."""
     with self.assertRaises(ValueError):
         bare_tracker = OffsetRestrictionTracker(OffsetRange(0, 10))
         iobase.RestrictionTrackerView(bare_tracker)
Example #30
0
 def initial_restriction(self, element):
   """Return OffsetRange(0, element['num_records'])."""
   num_records = element['num_records']
   return OffsetRange(0, num_records)
Example #31
0
 def initial_restriction(self, element):
     """Return OffsetRange(0, size), where size is os.path.getsize(element)."""
     return OffsetRange(0, os.path.getsize(element))
Example #32
0
 def initial_restriction(self, element):
     """Return OffsetRange(0, n), where n is the length of element[0]."""
     first_item = element[0]
     return OffsetRange(0, len(first_item))