def __enter__(self):
    """Open the source file and position it at the first full line to read.

    Opens ``self.source.file_path`` in binary mode (through ``gcsio`` when
    ``self.source.is_gcs_source`` is set, otherwise with the builtin
    ``open``), fills in defaults for ``start_offset`` and ``end_offset``,
    skips any partial line at the start of the range and finally creates
    ``self.range_tracker``.

    If anything fails after the file has been opened, the file is closed
    before the exception propagates: ``__exit__`` is not invoked when
    ``__enter__`` raises, so the handle would otherwise be leaked.

    Returns:
      This object.
    """
    if self.source.is_gcs_source:
        # pylint: disable=g-import-not-at-top
        from google.cloud.dataflow.io import gcsio
        self._file = gcsio.GcsIO().open(self.source.file_path, 'rb')
    else:
        self._file = open(self.source.file_path, 'rb')

    try:
        # Determine the real end_offset.
        # If not specified it will be the length of the file.
        if self.end_offset is None:
            self._file.seek(0, os.SEEK_END)
            self.end_offset = self._file.tell()

        if self.start_offset is None:
            self.start_offset = 0
        self.current_offset = self.start_offset

        if self.start_offset > 0:
            # Read one byte before. This operation will either consume a
            # previous newline if start_offset was at the beginning of a line
            # or consume the line if we were in the middle of it. Either way
            # we get the read position exactly where we wanted: at the
            # beginning of the first full line.
            self._file.seek(self.start_offset - 1)
            self.current_offset -= 1
            line = self._file.readline()
            self.current_offset += len(line)
        else:
            self._file.seek(self.start_offset)

        # Initializing range tracker after start and end offsets are
        # finalized.
        self.range_tracker = range_trackers.OffsetRangeTracker(
            self.start_offset, self.end_offset)
    except BaseException:
        # Do not leak the open handle when setup fails part-way through.
        self._file.close()
        raise
    return self
def test_split_at_offset(self):
    """Check which split positions a started tracker accepts and rejects."""
    tracker = range_trackers.OffsetRangeTracker(100, 200)
    self.assertTrue(tracker.try_return_record_at(True, 110))

    # Example positions we shouldn't split at, when last record starts at 110:
    for rejected in (109, 110, 200, 210):
        self.assertFalse(tracker.try_split_at_position(rejected))

    # Example positions we *should* split at (each tried on a fresh copy so
    # the tracker under test is left untouched):
    for accepted in (111, 129, 130, 131, 150, 199):
        self.assertTrue(copy.copy(tracker).try_split_at_position(accepted))

    # If we split at 170 and then at 150:
    for new_stop in (170, 150):
        self.assertTrue(tracker.try_split_at_position(new_stop))

    # Should be able to return a record starting before the new stop offset
    # (returning records starting at the same offset is ok), including one
    # starting right before the new stop offset.
    for record_start in (135, 135, 149):
        self.assertTrue(
            copy.copy(tracker).try_return_record_at(True, record_start))

    # Should not be able to return a record starting at or after the new stop
    # offset.
    for record_start in (150, 151):
        self.assertFalse(tracker.try_return_record_at(True, record_start))

    # Should accept non-splitpoint records starting after stop offset.
    for record_start in (135, 152, 160, 171):
        self.assertTrue(tracker.try_return_record_at(False, record_start))
def test_everything_with_unbounded_range(self):
    """Records are accepted and fraction lookup fails with an infinite stop."""
    unbounded_stop = range_trackers.OffsetRangeTracker.OFFSET_INFINITY
    tracker = range_trackers.OffsetRangeTracker(100, unbounded_stop)

    for record_start in (150, 250):
        self.assertTrue(tracker.try_return_record_at(True, record_start))

    # get_position_for_fraction_consumed should fail for an unbounded range
    with self.assertRaises(Exception):
        tracker.get_position_for_fraction_consumed(0.5)
def __init__(self, source):
    """Store ``source`` and create an offset range tracker for its indices.

    Args:
      source: object exposing ``start_index`` and ``end_index``; these are
        passed to ``OffsetRangeTracker`` as the start and stop offsets.
    """
    self._source = source

    # Index of the last item returned by InMemoryReader; None until an item
    # has been returned.
    self._current_index = None

    begin, end = source.start_index, source.end_index
    self._range_tracker = range_trackers.OffsetRangeTracker(begin, end)
def test_get_fraction_consumed_sparse(self):
    """fraction_consumed() follows each claimed position in a sparse range."""
    tracker = range_trackers.OffsetRangeTracker(100, 200)
    self.assertEqual(0, tracker.fraction_consumed())

    # E.g. consumed positions through 110 = total 10 positions of 100 done.
    claims = ((110, 0.10), (150, 0.50), (195, 0.95))
    for position, expected_fraction in claims:
        self.assertTrue(tracker.try_claim(position))
        self.assertEqual(expected_fraction, tracker.fraction_consumed())
def test_get_fraction_consumed_sparse(self):
    """fraction_consumed follows each returned split point in a sparse range."""
    tracker = range_trackers.OffsetRangeTracker(100, 200)
    self.assertEqual(0, tracker.fraction_consumed)

    # E.g. consumed positions through 110 = total 11 positions of 100.
    returned = ((110, 0.11), (150, 0.51), (195, 0.96))
    for record_start, expected_fraction in returned:
        self.assertTrue(tracker.try_return_record_at(True, record_start))
        self.assertEqual(expected_fraction, tracker.fraction_consumed)
def test_try_return_record_continuous_until_split_point(self):
    """Only a claim (split point) past the stop position is rejected."""
    tracker = range_trackers.OffsetRangeTracker(9, 18)

    # Return records with gaps of 2; every 3rd record is a split point.
    self.assertTrue(tracker.try_claim(10))
    for position in (12, 14):
        tracker.set_current_position(position)
    self.assertTrue(tracker.try_claim(16))

    # Out of range, but not a split point...
    for position in (18, 20):
        tracker.set_current_position(position)

    # Out of range AND a split point.
    self.assertFalse(tracker.try_claim(22))
def test_try_return_record_continuous_until_split_point(self):
    """Only a split-point record past the stop offset is rejected."""
    tracker = range_trackers.OffsetRangeTracker(9, 18)

    # Each step is (is_split_point, record_start, expected_result).
    steps = (
        # Return records with gaps of 2; every 3rd record is a split point.
        (True, 10, True),
        (False, 12, True),
        (False, 14, True),
        (True, 16, True),
        # Out of range, but not a split point...
        (False, 18, True),
        (False, 20, True),
        # Out of range AND a split point.
        (True, 22, False),
    )
    for is_split_point, record_start, expected in steps:
        check = self.assertTrue if expected else self.assertFalse
        check(tracker.try_return_record_at(is_split_point, record_start))
def test_get_position_for_fraction_dense(self):
    """position_at_fraction maps fractions onto the dense range 3, 4, 5."""
    # Represents positions 3, 4, 5.
    tracker = range_trackers.OffsetRangeTracker(3, 6)

    # Each case is (fraction, expected_position).
    cases = (
        # [3, 3) represents 0.0 of [3, 6)
        (0.0, 3),
        # [3, 4) represents up to 1/3 of [3, 6)
        (1.0 / 6, 4),
        (0.333, 4),
        # [3, 5) represents up to 2/3 of [3, 6)
        (0.334, 5),
        (0.666, 5),
        # Any fraction consumed over 2/3 means the whole [3, 6) has been
        # consumed.
        (0.667, 6),
    )
    for fraction, expected_position in cases:
        self.assertEqual(
            expected_position, tracker.position_at_fraction(fraction))
def test_get_fraction_consumed_dense(self):
    """fraction_consumed() advances by thirds over the dense range 3, 4, 5."""
    tracker = range_trackers.OffsetRangeTracker(3, 6)
    self.assertEqual(0, tracker.fraction_consumed())

    claims = ((3, 0.0), (4, 1.0 / 3), (5, 2.0 / 3))
    for position, expected_fraction in claims:
        self.assertTrue(tracker.try_claim(position))
        self.assertEqual(expected_fraction, tracker.fraction_consumed())

    # Moving the current position to the stop offset reports full consumption.
    tracker.set_current_position(6)
    self.assertEqual(1.0, tracker.fraction_consumed())

    # A claim past the stop offset is refused.
    tracker.set_current_position(7)
    self.assertFalse(tracker.try_claim(7))
def test_get_fraction_consumed_dense(self):
    """fraction_consumed advances by thirds over the dense range 3, 4, 5."""
    tracker = range_trackers.OffsetRangeTracker(3, 6)
    self.assertEqual(0, tracker.fraction_consumed)

    returned = ((3, 1.0 / 3), (4, 2.0 / 3), (5, 1.0))
    for record_start, expected_fraction in returned:
        self.assertTrue(tracker.try_return_record_at(True, record_start))
        self.assertEqual(expected_fraction, tracker.fraction_consumed)

    # Non-split-point records are still accepted past the stop offset...
    for record_start in (6, 7):
        self.assertTrue(tracker.try_return_record_at(False, record_start))

    # ...but a split point there is not.
    self.assertFalse(tracker.try_return_record_at(True, 7))
def get_range_tracker(self, start_position, stop_position):
    """Return a range tracker, preferring the test override when one is set.

    When ``self._test_range_tracker_fn`` is truthy it is called with no
    arguments and its result is returned; both positions are ignored in that
    case. Otherwise a new ``OffsetRangeTracker`` is built from
    ``start_position`` and ``stop_position``.
    """
    make_test_tracker = self._test_range_tracker_fn
    if not make_test_tracker:
        return range_trackers.OffsetRangeTracker(
            start_position, stop_position)
    return make_test_tracker()
def test_split_at_offset_fails_if_unstarted(self):
    """A tracker that has not returned any record refuses to split."""
    unstarted = range_trackers.OffsetRangeTracker(100, 200)
    did_split = unstarted.try_split_at_position(150)
    self.assertFalse(did_split)
def test_try_return_first_record_not_split_point(self):
    """Setting a position on a fresh tracker is expected to raise."""
    with self.assertRaises(Exception):
        fresh_tracker = range_trackers.OffsetRangeTracker(100, 200)
        fresh_tracker.set_current_position(120)
def test_try_return_record_simple_dense(self):
    """Split points 3, 4 and 5 are accepted; 6 (the stop offset) is not."""
    tracker = range_trackers.OffsetRangeTracker(3, 6)
    for record_start in (3, 4, 5):
        self.assertTrue(tracker.try_return_record_at(True, record_start))
    self.assertFalse(tracker.try_return_record_at(True, 6))
def test_try_return_record_non_monotonic(self):
    """Claiming a position lower than the previous claim is expected to raise."""
    tracker = range_trackers.OffsetRangeTracker(100, 200)
    first_claim = tracker.try_claim(120)
    self.assertTrue(first_claim)
    # Going backwards from 120 to 110 must be rejected with an exception.
    self.assertRaises(Exception, tracker.try_claim, 110)
def test_try_return_record_simple_sparse(self):
    """Split points inside the range are accepted; one past the stop is not."""
    tracker = range_trackers.OffsetRangeTracker(100, 200)
    for record_start in (110, 140, 183):
        self.assertTrue(tracker.try_return_record_at(True, record_start))
    self.assertFalse(tracker.try_return_record_at(True, 210))
def test_try_return_record_non_monotonic(self):
    """Returning a record before the previous one is expected to raise."""
    tracker = range_trackers.OffsetRangeTracker(100, 200)
    # The result of the first call is deliberately not checked here.
    tracker.try_return_record_at(True, 120)
    # Going backwards from 120 to 110 must be rejected with an exception.
    self.assertRaises(Exception, tracker.try_return_record_at, True, 110)
def test_try_return_first_record_not_split_point(self):
    """A non-split-point first record on a fresh tracker is expected to raise."""
    with self.assertRaises(Exception):
        fresh_tracker = range_trackers.OffsetRangeTracker(100, 200)
        fresh_tracker.try_return_record_at(False, 120)
def test_try_return_record_simple_dense(self):
    """Positions 3, 4 and 5 can be claimed; 6 (the stop offset) cannot."""
    tracker = range_trackers.OffsetRangeTracker(3, 6)
    for position in (3, 4, 5):
        self.assertTrue(tracker.try_claim(position))
    self.assertFalse(tracker.try_claim(6))
def create_dummy_tracker():
    """Build an ``OffsetRangeTracker`` with start offset 0 and stop offset 3."""
    start_offset, stop_offset = 0, 3
    return range_trackers.OffsetRangeTracker(start_offset, stop_offset)