Ejemplo n.º 1
0
    def __enter__(self):
        """Open the source file and position it at the first full line.

        The file is opened through ``GcsIO`` when the source is flagged as a
        GCS source and with the builtin ``open`` otherwise. Missing start/end
        offsets are then resolved, a partial leading line is skipped when the
        start offset is positive, and the range tracker is created from the
        final offsets.

        Returns:
            This object, so it can be bound by a ``with`` statement.
        """
        source = self.source
        if not source.is_gcs_source:
            self._file = open(source.file_path, 'rb')
        else:
            # The GCS module is imported only on the GCS path.
            # pylint: disable=g-import-not-at-top
            from google.cloud.dataflow.io import gcsio
            self._file = gcsio.GcsIO().open(source.file_path, 'rb')

        # No explicit end offset: use the file length, measured by seeking to
        # the end of the file and asking for the position.
        if self.end_offset is None:
            self._file.seek(0, os.SEEK_END)
            self.end_offset = self._file.tell()

        # No explicit start offset: begin at the start of the file.
        if self.start_offset is None:
            self.start_offset = 0
            self.current_offset = self.start_offset

        if self.start_offset <= 0:
            self._file.seek(self.start_offset)
        else:
            # Step back one byte and read a line. If start_offset sat at the
            # beginning of a line this only consumes the preceding newline;
            # if it sat mid-line it consumes the rest of that line. Both cases
            # leave the read position at the beginning of the first full line.
            self._file.seek(self.start_offset - 1)
            self.current_offset -= 1
            skipped = self._file.readline()
            self.current_offset += len(skipped)

        # Build the tracker only now that both offsets are final.
        self.range_tracker = range_trackers.OffsetRangeTracker(
            self.start_offset, self.end_offset)

        return self
Ejemplo n.º 2
0
    def test_split_at_offset(self):
        # Checks which split positions a tracker over [100, 200) accepts once
        # a record at 110 has been returned, and what can be returned after.
        tracker = range_trackers.OffsetRangeTracker(100, 200)
        self.assertTrue(tracker.try_return_record_at(True, 110))

        # With the last record starting at 110, these splits must be refused.
        for rejected in (109, 110, 200, 210):
            self.assertFalse(tracker.try_split_at_position(rejected))

        # These splits must succeed; each one runs on a shallow copy so the
        # original tracker is not split yet.
        for accepted in (111, 129, 130, 131, 150, 199):
            self.assertTrue(
                copy.copy(tracker).try_split_at_position(accepted))

        # Split the real tracker at 170 and then again at 150.
        for split_position in (170, 150):
            self.assertTrue(tracker.try_split_at_position(split_position))

        # Records starting before the new stop offset can still be returned:
        # the same offset (135) twice, and the offset right before the stop
        # offset (149).
        for offset in (135, 135, 149):
            self.assertTrue(
                copy.copy(tracker).try_return_record_at(True, offset))

        # Split-point records at or after the new stop offset are refused.
        for offset in (150, 151):
            self.assertFalse(tracker.try_return_record_at(True, offset))

        # Non-split-point records are accepted, including those starting
        # after the stop offset.
        for offset in (135, 152, 160, 171):
            self.assertTrue(tracker.try_return_record_at(False, offset))
Ejemplo n.º 3
0
 def test_everything_with_unbounded_range(self):
     # Exercises a tracker whose stop offset is OFFSET_INFINITY.
     infinity = range_trackers.OffsetRangeTracker.OFFSET_INFINITY
     tracker = range_trackers.OffsetRangeTracker(100, infinity)
     for offset in (150, 250):
         self.assertTrue(tracker.try_return_record_at(True, offset))
     # Asking for the position at a consumed fraction is expected to raise
     # when the range is unbounded.
     with self.assertRaises(Exception):
         tracker.get_position_for_fraction_consumed(0.5)
Ejemplo n.º 4
0
    def __init__(self, source):
        """Set up reader state for ``source``.

        Args:
            source: Object exposing ``start_index`` and ``end_index``, which
                become the bounds of the range tracker.
        """
        # Index of the last item returned by InMemoryReader; None until an
        # item has been returned.
        self._current_index = None

        self._source = source
        self._range_tracker = range_trackers.OffsetRangeTracker(
            source.start_index, source.end_index)
Ejemplo n.º 5
0
 def test_get_fraction_consumed_sparse(self):
     # Checks fraction_consumed() after sparse claims within [100, 200).
     tracker = range_trackers.OffsetRangeTracker(100, 200)
     self.assertEqual(0, tracker.fraction_consumed())
     # Each pair is (claimed offset, expected fraction); e.g. claiming 110
     # means 10 of the 100 positions are done.
     expectations = ((110, 0.10), (150, 0.50), (195, 0.95))
     for claimed, expected in expectations:
         self.assertTrue(tracker.try_claim(claimed))
         self.assertEqual(expected, tracker.fraction_consumed())
Ejemplo n.º 6
0
 def test_get_fraction_consumed_sparse(self):
     # Checks the fraction_consumed attribute after sparse records are
     # returned within [100, 200).
     tracker = range_trackers.OffsetRangeTracker(100, 200)
     self.assertEqual(0, tracker.fraction_consumed)
     # Each pair is (record offset, expected fraction); e.g. positions
     # through 110 count as 11 of the 100 positions.
     expectations = ((110, 0.11), (150, 0.51), (195, 0.96))
     for offset, expected in expectations:
         self.assertTrue(tracker.try_return_record_at(True, offset))
         self.assertEqual(expected, tracker.fraction_consumed)
Ejemplo n.º 7
0
 def test_try_return_record_continuous_until_split_point(self):
     # Positions advance in steps of 2 over [9, 18). Every third position
     # (10, 16, 22) is claimed as a split point; the others only move the
     # current position.
     tracker = range_trackers.OffsetRangeTracker(9, 18)
     self.assertTrue(tracker.try_claim(10))
     for position in (12, 14):
         tracker.set_current_position(position)
     self.assertTrue(tracker.try_claim(16))
     # 18 and 20 are out of range, but they are not split points.
     for position in (18, 20):
         tracker.set_current_position(position)
     # 22 is out of range and a split point, so the claim must fail.
     self.assertFalse(tracker.try_claim(22))
Ejemplo n.º 8
0
 def test_try_return_record_continuous_until_split_point(self):
     # Records advance in steps of 2 over [9, 18); every third record is a
     # split point. Each pair below is (is_split_point, offset).
     tracker = range_trackers.OffsetRangeTracker(9, 18)
     accepted_records = (
         (True, 10),
         (False, 12),
         (False, 14),
         (True, 16),
         # Out of range, but not split points.
         (False, 18),
         (False, 20),
     )
     for is_split_point, offset in accepted_records:
         self.assertTrue(
             tracker.try_return_record_at(is_split_point, offset))
     # Out of range and a split point: must be refused.
     self.assertFalse(tracker.try_return_record_at(True, 22))
Ejemplo n.º 9
0
 def test_get_position_for_fraction_dense(self):
     # The tracker covers positions 3, 4 and 5.
     tracker = range_trackers.OffsetRangeTracker(3, 6)
     # Each pair is (fraction, expected position).
     cases = (
         # [3, 3) represents 0.0 of [3, 6).
         (0.0, 3),
         # [3, 4) represents up to 1/3 of [3, 6).
         (1.0 / 6, 4),
         (0.333, 4),
         # [3, 5) represents up to 2/3 of [3, 6).
         (0.334, 5),
         (0.666, 5),
         # Anything over 2/3 means the whole of [3, 6) has been consumed.
         (0.667, 6),
     )
     for fraction, expected_position in cases:
         self.assertEqual(
             expected_position, tracker.position_at_fraction(fraction))
Ejemplo n.º 10
0
 def test_get_fraction_consumed_dense(self):
     # Checks fraction_consumed() while every position of [3, 6) is claimed.
     tracker = range_trackers.OffsetRangeTracker(3, 6)
     self.assertEqual(0, tracker.fraction_consumed())
     # Each pair is (claimed position, expected fraction afterwards).
     expectations = ((3, 0.0), (4, 1.0 / 3), (5, 2.0 / 3))
     for claimed, expected in expectations:
         self.assertTrue(tracker.try_claim(claimed))
         self.assertEqual(expected, tracker.fraction_consumed())
     # Moving the current position to the stop offset reports 1.0.
     tracker.set_current_position(6)
     self.assertEqual(1.0, tracker.fraction_consumed())
     # Once the position has moved past the range, a claim must fail.
     tracker.set_current_position(7)
     self.assertFalse(tracker.try_claim(7))
Ejemplo n.º 11
0
 def test_get_fraction_consumed_dense(self):
     # Checks the fraction_consumed attribute while every position of
     # [3, 6) is returned as a split-point record.
     tracker = range_trackers.OffsetRangeTracker(3, 6)
     self.assertEqual(0, tracker.fraction_consumed)
     # Each pair is (record offset, expected fraction afterwards).
     expectations = ((3, 1.0 / 3), (4, 2.0 / 3), (5, 1.0))
     for offset, expected in expectations:
         self.assertTrue(tracker.try_return_record_at(True, offset))
         self.assertEqual(expected, tracker.fraction_consumed)
     # Non-split-point records past the range are still accepted.
     for offset in (6, 7):
         self.assertTrue(tracker.try_return_record_at(False, offset))
     # A split-point record past the range is refused.
     self.assertFalse(tracker.try_return_record_at(True, 7))
Ejemplo n.º 12
0
 def get_range_tracker(self, start_position, stop_position):
   """Return the range tracker for the given positions.

   When ``self._test_range_tracker_fn`` is truthy, its result is returned
   and both positions are ignored. Otherwise an ``OffsetRangeTracker``
   is built from ``start_position`` and ``stop_position``.
   """
   tracker_factory = self._test_range_tracker_fn
   if not tracker_factory:
     return range_trackers.OffsetRangeTracker(start_position, stop_position)
   return tracker_factory()
Ejemplo n.º 13
0
 def test_split_at_offset_fails_if_unstarted(self):
     # A tracker on which nothing has been claimed yet must refuse a split.
     unstarted = range_trackers.OffsetRangeTracker(100, 200)
     split_succeeded = unstarted.try_split_at_position(150)
     self.assertFalse(split_succeeded)
Ejemplo n.º 14
0
 def test_try_return_first_record_not_split_point(self):
     # Expects an exception when a fresh tracker's position is set without
     # any prior claim. Construction stays inside the context so the same
     # statements are covered by assertRaises.
     with self.assertRaises(Exception):
         fresh_tracker = range_trackers.OffsetRangeTracker(100, 200)
         fresh_tracker.set_current_position(120)
Ejemplo n.º 15
0
 def test_try_return_record_simple_dense(self):
     # Every position of [3, 6) is returnable as a split-point record; the
     # stop offset itself is not.
     tracker = range_trackers.OffsetRangeTracker(3, 6)
     for offset in (3, 4, 5):
         self.assertTrue(tracker.try_return_record_at(True, offset))
     self.assertFalse(tracker.try_return_record_at(True, 6))
Ejemplo n.º 16
0
 def test_try_return_record_non_monotonic(self):
     # Claiming 110 after 120 has already been claimed is expected to raise.
     tracker = range_trackers.OffsetRangeTracker(100, 200)
     first_claim = tracker.try_claim(120)
     self.assertTrue(first_claim)
     with self.assertRaises(Exception):
         tracker.try_claim(110)
Ejemplo n.º 17
0
 def test_try_return_record_simple_sparse(self):
     # Sparse split-point records inside [100, 200) are accepted; one past
     # the stop offset is refused.
     tracker = range_trackers.OffsetRangeTracker(100, 200)
     for offset in (110, 140, 183):
         self.assertTrue(tracker.try_return_record_at(True, offset))
     self.assertFalse(tracker.try_return_record_at(True, 210))
Ejemplo n.º 18
0
 def test_try_return_record_non_monotonic(self):
     # Returning a record at 110 after one at 120 is expected to raise. The
     # result of the first call is deliberately not asserted.
     tracker = range_trackers.OffsetRangeTracker(100, 200)
     later_offset, earlier_offset = 120, 110
     tracker.try_return_record_at(True, later_offset)
     with self.assertRaises(Exception):
         tracker.try_return_record_at(True, earlier_offset)
Ejemplo n.º 19
0
 def test_try_return_first_record_not_split_point(self):
     # Expects an exception when the first record returned on a fresh
     # tracker is a non-split-point record. Construction stays inside the
     # context so the same statements are covered by assertRaises.
     with self.assertRaises(Exception):
         fresh_tracker = range_trackers.OffsetRangeTracker(100, 200)
         fresh_tracker.try_return_record_at(False, 120)
Ejemplo n.º 20
0
 def test_try_return_record_simple_dense(self):
     # Every position of [3, 6) can be claimed; the stop offset cannot.
     tracker = range_trackers.OffsetRangeTracker(3, 6)
     for position in (3, 4, 5):
         self.assertTrue(tracker.try_claim(position))
     self.assertFalse(tracker.try_claim(6))
Ejemplo n.º 21
0
 def create_dummy_tracker():
   """Build an ``OffsetRangeTracker`` with start offset 0 and stop offset 3."""
   dummy_tracker = range_trackers.OffsetRangeTracker(0, 3)
   return dummy_tracker