Esempio n. 1
0
    def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                                 records_to_read, file_path):
        """Split a partially read text reader and check that no data is lost.

        Reads ``records_to_read`` records from a TextFileSource built from
        ``file_path``, ``start_offset`` and ``end_offset``, requests a dynamic
        split at ``stop_offset`` and then drains the reader. When the split is
        accepted, the records of the first split plus the records of a second
        source starting at the returned stop position must equal the records of
        the original, unsplit source.

        Args:
          start_offset: start offset passed to the TextFileSource.
          end_offset: end offset passed to the TextFileSource.
          stop_offset: byte offset at which the dynamic split is requested.
          records_to_read: number of records to consume before splitting.
          file_path: path of the file to read.

        Returns:
          None. Returns early (without splitting) when the source holds fewer
          than ``records_to_read`` records.
        """
        source = fileio.TextFileSource(file_path, start_offset, end_offset)
        first_split_records = []

        with source.reader() as reader:
            record_iter = iter(reader)
            while len(first_split_records) < records_to_read:
                try:
                    first_split_records.append(next(record_iter))
                except StopIteration:
                    # Invalid case, given source does not contain this many
                    # records.
                    return

            last_record_start = reader.range_tracker.last_record_start

            # The split is expected to be refused when the stop offset is not
            # past the last record started, when it coincides with either end
            # of the range, or when the reader is unstarted.
            refusal_expected = (
                stop_offset <= last_record_start
                or stop_offset == start_offset
                or stop_offset == end_offset
                or records_to_read == 0)
            if refusal_expected:
                expected_split_response = None
            else:
                expected_split_response = iobase.DynamicSplitResultWithPosition(
                    stop_position=iobase.ReaderPosition(
                        byte_offset=stop_offset))

            split_request = iobase.DynamicSplitRequest(
                progress=iobase.ReaderProgress(
                    iobase.ReaderPosition(byte_offset=stop_offset)))
            split_response = self.try_splitting_reader_at(
                reader, split_request, expected_split_response)

            # Reading remaining records from the updated reader.
            first_split_records.extend(reader)

        if split_response is None:
            return

        # Total contents received by reading the two splits should be equal to
        # the result obtained by reading the original source.
        records_of_first_split = ''.join(first_split_records)

        with source.reader() as original_reader:
            records_of_original = ''.join(original_reader)

        remainder_source = fileio.TextFileSource(
            file_path, split_response.stop_position.byte_offset, end_offset)
        with remainder_source.reader() as remainder_reader:
            records_of_second_split = ''.join(remainder_reader)

        self.assertEqual(records_of_original,
                         records_of_first_split + records_of_second_split)
Esempio n. 2
0
    def request_dynamic_split(self, dynamic_split_request):
        """Try to split this reader as described by the request.

        The request's progress may carry an explicit position, whose
        ``byte_offset`` is used directly. Otherwise it must carry a
        ``percent_complete`` strictly between 0 and 1, which the range tracker
        converts to a byte offset.

        Args:
          dynamic_split_request: request whose ``progress`` provides either
            ``position`` or ``percent_complete``. Must not be None.

        Returns:
          An iobase.DynamicSplitResultWithPosition wrapping the split position
          when the range tracker accepts the split; None when the request is
          unusable or the range tracker refuses it.
        """
        assert dynamic_split_request is not None
        requested_progress = dynamic_split_request.progress
        stop_position = requested_progress.position

        if stop_position is None:
            fraction = requested_progress.percent_complete
            if fraction is None:
                logging.warning(
                    'TextReader requires either a position or a percentage of work to '
                    'be complete to perform a dynamic split request. Requested: %r',
                    dynamic_split_request)
                return None
            if fraction <= 0 or fraction >= 1:
                logging.warning(
                    'FileBasedReader cannot be split since the provided percentage '
                    'of work to be completed is out of the valid range (0, '
                    '1). Requested: %r', dynamic_split_request)
                return None
            # Translate the requested fraction into a concrete byte offset.
            stop_position = iobase.ReaderPosition()
            stop_position.byte_offset = (
                self.range_tracker.position_at_fraction(fraction))

        if not self.range_tracker.try_split(stop_position.byte_offset):
            return None
        return iobase.DynamicSplitResultWithPosition(stop_position)
Esempio n. 3
0
 def get_progress(self):
   """Report progress as the encoded start of the last group.

   Returns:
     An iobase.ReaderProgress whose position carries the URL-safe base64
     encoding of the range tracker's last group start as its
     shuffle_position, or None when the range tracker has no last group
     start.
   """
   group_start = self._range_tracker.last_group_start()
   if group_start is not None:
     position = iobase.ReaderPosition(
         shuffle_position=base64.urlsafe_b64encode(group_start))
     return iobase.ReaderProgress(position=position)
   return None
Esempio n. 4
0
    def request_dynamic_split(self, dynamic_split_request):
        """Try to split this reader at the requested shuffle position.

        Args:
          dynamic_split_request: request whose ``progress.position`` must carry
            an encoded ``shuffle_position``. Must not be None.

        Returns:
          An iobase.DynamicSplitResultWithPosition for the requested shuffle
          position when the range tracker accepts the split; None when the
          request has no position, the position has no shuffle position, or
          the range tracker refuses the split.
        """
        assert dynamic_split_request is not None
        requested_position = dynamic_split_request.progress.position
        if requested_position is None:
            logging.warning(
                'GroupingShuffleReader only supports split at a Position.'
                ' Requested: %r', dynamic_split_request)
            return None

        encoded_position = requested_position.shuffle_position
        if encoded_position is None:
            logging.warning(
                'GroupingShuffleReader only supports split at a shuffle'
                ' position. Requested: %r', requested_position)
            return None

        # The range tracker works on decoded positions; the result reports the
        # encoded form that was requested.
        accepted = self._range_tracker.try_split_at_position(
            _shuffle_decode(encoded_position))
        if not accepted:
            logging.info('Refusing to split GroupedShuffleReader %r at %s',
                         self, encoded_position)
            return None

        logging.info('Split GroupedShuffleReader at %s', encoded_position)
        return iobase.DynamicSplitResultWithPosition(
            iobase.ReaderPosition(shuffle_position=encoded_position))
Esempio n. 5
0
    def test_dynamic_split_result_with_position_to_cloud_stop_position(self):
        """A split result at a byte offset converts to a dataflow.Position."""
        split_result = iobase.DynamicSplitResultWithPosition(
            iobase.ReaderPosition(byte_offset=9999))

        cloud_stop_position = (
            apiclient.
            dynamic_split_result_with_position_to_cloud_stop_position(
                split_result))

        # The byte offset must survive the conversion unchanged.
        self.assertIsNotNone(cloud_stop_position)
        self.assertIsInstance(cloud_stop_position, dataflow.Position)
        self.assertEqual(9999, cloud_stop_position.byteOffset)
Esempio n. 6
0
    def test_reader_progress_to_cloud_progress_position(self):
        """A position-based ReaderProgress converts to ApproximateProgress.

        The converted progress must be a dataflow.ApproximateProgress whose
        position is a dataflow.Position carrying the original byte offset.
        """
        reader_position = iobase.ReaderPosition(byte_offset=9999)
        reader_progress = iobase.ReaderProgress(position=reader_position)

        cloud_progress = apiclient.reader_progress_to_cloud_progress(
            reader_progress)
        self.assertIsNotNone(cloud_progress)
        self.assertIsInstance(cloud_progress, dataflow.ApproximateProgress)
        self.assertIsNotNone(cloud_progress.position)
        self.assertIsInstance(cloud_progress.position, dataflow.Position)
        # assertEqual rather than the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(9999, cloud_progress.position.byteOffset)
Esempio n. 7
0
  def test_dynamic_splitting_with_range(self):
    """Splits of a range-restricted shuffle reader honour the range.

    The source is created with encoded start position '0' and end position
    '3'. After one record is read, split requests at '0', '3' and '4' must be
    refused, and a request at '2' must succeed.
    """
    def split_request_at(key):
      # Build a split request at the encoded form of the given key.
      return iobase.DynamicSplitRequest(iobase.ReaderProgress(
          position=iobase.ReaderPosition(
              shuffle_position=base64.urlsafe_b64encode(key))))

    source = GroupedShuffleSource(
        config_bytes='not used',
        coder=Base64Coder(),
        start_position=base64.urlsafe_b64encode('0'),
        end_position=base64.urlsafe_b64encode('3'))

    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
      record_iter = iter(reader)
      next(record_iter)

      # Cannot split if split request is out of range
      for rejected_key in ('0', '3', '4'):
        self.try_splitting_reader_at(
            reader, split_request_at(rejected_key), None)

      # Successful split.
      self.try_splitting_reader_at(
          reader,
          split_request_at('2'),
          iobase.DynamicSplitResultWithPosition(iobase.ReaderPosition(
              shuffle_position=base64.urlsafe_b64encode('2'))))
Esempio n. 8
0
  def test_dynamic_splitting(self):
    """Splits of an unrestricted shuffle reader depend on read progress.

    An unstarted reader refuses to split. After two records are read, split
    requests at '0' and '1' are refused and a request at '3' succeeds.
    """
    def split_request_at(key):
      # Build a split request at the encoded form of the given key.
      return iobase.DynamicSplitRequest(iobase.ReaderProgress(
          position=iobase.ReaderPosition(
              shuffle_position=base64.urlsafe_b64encode(key))))

    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder())

    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
      # Cannot split an unstarted reader
      self.try_splitting_reader_at(reader, split_request_at('1'), None)

      record_iter = iter(reader)
      next(record_iter)
      next(record_iter)

      # Cannot split since the provided split position is smaller than or
      # equal to the current position '1'.
      for rejected_key in ('0', '1'):
        self.try_splitting_reader_at(
            reader, split_request_at(rejected_key), None)

      # Successful split.
      self.try_splitting_reader_at(
          reader,
          split_request_at('3'),
          iobase.DynamicSplitResultWithPosition(iobase.ReaderPosition(
              shuffle_position=base64.urlsafe_b64encode('3'))))
Esempio n. 9
0
def cloud_position_to_reader_position(cloud_position):
    """Convert a cloud position message into an iobase.ReaderPosition.

    The scalar fields (end, key, byteOffset, recordIndex, shufflePosition) are
    copied across. A nested concatPosition, when present, is converted
    recursively into an iobase.ConcatPosition.

    Args:
      cloud_position: position message exposing ``end``, ``key``,
        ``byteOffset``, ``recordIndex``, ``shufflePosition`` and
        ``concatPosition`` attributes (presumably a dataflow.Position).

    Returns:
      The equivalent iobase.ReaderPosition.
    """
    concat_position = None
    cloud_concat = cloud_position.concatPosition
    if cloud_concat is not None:
        inner_position = cloud_position_to_reader_position(
            cloud_concat.position)
        # The sub-source index lives on the nested concat message, next to the
        # inner position -- not on the outer position message.
        concat_position = iobase.ConcatPosition(cloud_concat.index,
                                                inner_position)

    return iobase.ReaderPosition(cloud_position.end, cloud_position.key,
                                 cloud_position.byteOffset,
                                 cloud_position.recordIndex,
                                 cloud_position.shufflePosition,
                                 concat_position)
  def get_progress(self):
    """Report progress as a concat position within the current sub-reader.

    Returns:
      An iobase.ReaderProgress whose position wraps the current reader index
      and the current sub-reader's position in an iobase.ConcatPosition.
      None when there is no current reader (negative index or no reader
      object) or when the sub-reader reports no progress.

    Raises:
      ValueError: if the sub-reader reports progress without a position.
    """
    if self.current_reader_index < 0 or self.current_reader is None:
      return None

    index = self.current_reader_index
    sub_progress = self.current_reader.get_progress()
    if sub_progress is None:
      return None

    inner_position = sub_progress.position
    if inner_position is None:
      raise ValueError('A concat source should only be created with '
                       'sub-sources that create readers that perform '
                       'progress reporting and dynamic work rebalancing '
                       'using positions')

    concat_position = iobase.ConcatPosition(index, inner_position)
    return iobase.ReaderProgress(
        position=iobase.ReaderPosition(concat_position=concat_position))
Esempio n. 11
0
  def test_update_stop_position_for_percent_complete(self):
    """Percentage-based split requests on a text reader after three reads.

    With three of five lines read, requests at fractions 0, 1, 0.2 and 0.4
    must be refused, and a request at 0.6 must succeed with a stop position
    at byte offset 15.
    """
    def split_request_at(fraction):
      # Build a split request expressed as a fraction of work completed.
      return iobase.DynamicSplitRequest(
          iobase.ReaderProgress(percent_complete=fraction))

    lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
    source = fileio.TextFileSource(
        file_path=self.create_temp_file('\n'.join(lines)))
    with source.reader() as reader:
      # Reading three lines
      reader_iter = iter(reader)
      for _ in range(3):
        next(reader_iter)

      # 0 and 1: splitting at either end of the range should be unsuccessful.
      # 0.2 and 0.4: splitting at positions on or before the start offset of
      # the last record should be unsuccessful too.
      for rejected_fraction in (0, 1, 0.2, 0.4):
        self.try_splitting_reader_at(
            reader, split_request_at(rejected_fraction), None)

      # Splitting at a position after the start offset of the last record
      # should be successful
      self.try_splitting_reader_at(
          reader,
          split_request_at(0.6),
          iobase.DynamicSplitResultWithPosition(iobase.ReaderPosition(
              byte_offset=15)))
Esempio n. 12
0
  def test_in_memory_source_dynamic_split(self):
    """Record-index split requests against an in-memory source of six items.

    Covers an unstarted reader, out-of-range indices, indices at or before
    the records already read, and two successive successful splits.
    """
    def split_request_at(record_index):
      # Build a split request at the given record index.
      return iobase.DynamicSplitRequest(
          iobase.ReaderProgress(
              position=iobase.ReaderPosition(record_index=record_index)))

    def split_result_at(record_index):
      # Build the response expected from a successful split at the index.
      return iobase.DynamicSplitResultWithPosition(
          stop_position=iobase.ReaderPosition(record_index=record_index))

    source = inmemory.InMemorySource([10, 20, 30, 40, 50, 60],
                                     coder=FakeCoder())

    # Unstarted reader
    with source.reader() as reader:
      self.try_splitting_reader_at(reader, split_request_at(2), None)

    # Proposed split position out of range
    with source.reader() as reader:
      reader_iter = iter(reader)
      next(reader_iter)
      for out_of_range_index in (-1, 10):
        self.try_splitting_reader_at(
            reader, split_request_at(out_of_range_index), None)

    # Already read past proposed split position
    with source.reader() as reader:
      reader_iter = iter(reader)
      for _ in range(3):
        next(reader_iter)
      for consumed_index in (1, 2):
        self.try_splitting_reader_at(
            reader, split_request_at(consumed_index), None)

    # Successful split
    with source.reader() as reader:
      reader_iter = iter(reader)
      next(reader_iter)
      # A second split at a smaller index is requested after the first one.
      for split_index in (4, 2):
        self.try_splitting_reader_at(
            reader,
            split_request_at(split_index),
            split_result_at(split_index))
Esempio n. 13
0
    def get_progress(self):
        """Report progress as the current record index.

        Returns:
          An iobase.ReaderProgress whose position carries ``_current_index``
          as its record_index, or None when ``_current_index`` is None.
        """
        current_index = self._current_index
        if current_index is None:
            return None

        position = iobase.ReaderPosition(record_index=current_index)
        return iobase.ReaderProgress(position=position)
Esempio n. 14
0
 def get_progress(self):
     """Report progress using the range tracker's last_record_start.

     Returns:
       An iobase.ReaderProgress whose position carries that value as its
       byte_offset.
     """
     last_record_start = self.range_tracker.last_record_start
     position = iobase.ReaderPosition(byte_offset=last_record_start)
     return iobase.ReaderProgress(position=position)
 def get_progress(self):
     """Report progress using this reader's current_index.

     Returns:
       An iobase.ReaderProgress whose position carries ``current_index`` as
       its record_index.
     """
     position = iobase.ReaderPosition(record_index=self.current_index)
     return iobase.ReaderProgress(position=position)
Esempio n. 16
0
    def test_reader_position_to_cloud_position(self):
        reader_position = iobase.ReaderPosition(byte_offset=9999)

        cloud_position = apiclient.reader_position_to_cloud_position(
            reader_position)
        self.assertIsNotNone(cloud_position)