Esempio n. 1
0
    def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                                 records_to_read, file_path):
        """Exercise a dynamic split of a text reader at ``stop_offset``.

        Opens ``file_path`` over ``[start_offset, end_offset]`` as a
        ``fileio.TextFileSource``, reads ``records_to_read`` records, asks the
        reader to split at ``stop_offset`` via ``try_splitting_reader_at``, and
        then drains the reader. If the split produced a response, the records
        read before/after the split plus the records of a second source
        starting at the returned stop position must concatenate to the records
        of the unsplit source.

        Returns early (checking nothing) when the source holds fewer than
        ``records_to_read`` records.
        """
        source = fileio.TextFileSource(file_path, start_offset, end_offset)
        first_split_parts = []

        with source.reader() as reader:
            record_iter = iter(reader)
            try:
                for _ in range(records_to_read):
                    first_split_parts.append(next(record_iter))
            except StopIteration:
                # Invalid case: the source does not contain this many records.
                return

            last_start = reader.range_tracker.last_record_start

            # No split is expected when the stop offset is at or before the
            # start of the last record read, coincides with either end of the
            # range, or when the reader has not been started.
            split_expected_to_fail = (
                stop_offset <= last_start
                or stop_offset == start_offset
                or stop_offset == end_offset
                or records_to_read == 0)
            if split_expected_to_fail:
                expected_split_response = None
            else:
                expected_split_response = iobase.DynamicSplitResultWithPosition(
                    stop_position=iobase.ReaderPosition(
                        byte_offset=stop_offset))

            split_request = iobase.DynamicSplitRequest(
                progress=iobase.ReaderProgress(
                    iobase.ReaderPosition(byte_offset=stop_offset)))
            split_response = self.try_splitting_reader_at(
                reader, split_request, expected_split_response)

            # Drain whatever the (possibly shortened) reader still yields.
            for record in reader:
                first_split_parts.append(record)

        if split_response is None:
            return

        # Reading both splits must give the same contents as reading the
        # original, unsplit source.
        with source.reader() as original_reader:
            original_parts = [record for record in original_reader]

        residual_source = fileio.TextFileSource(
            file_path, split_response.stop_position.byte_offset, end_offset)
        with residual_source.reader() as residual_reader:
            second_split_parts = [record for record in residual_reader]

        self.assertEqual(
            ''.join(original_parts),
            ''.join(first_split_parts) + ''.join(second_split_parts))
Esempio n. 2
0
    def request_dynamic_split(self, dynamic_split_request):
        """Try to split this reader as described by ``dynamic_split_request``.

        The request's progress may carry an explicit position, whose
        ``byte_offset`` is used, or otherwise a ``percent_complete`` strictly
        between 0 and 1, which the range tracker converts to a byte offset.

        Returns:
          An ``iobase.DynamicSplitResultWithPosition`` wrapping the split
          position when the range tracker accepts the split; ``None`` when the
          request carries neither a position nor a usable percentage, or when
          the range tracker refuses the split.
        """
        assert dynamic_split_request is not None
        requested = dynamic_split_request.progress
        target = requested.position

        if target is None:
            # No explicit position: fall back to the requested fraction.
            fraction = requested.percent_complete
            if fraction is None:
                logging.warning(
                    'TextReader requires either a position or a percentage of work to '
                    'be complete to perform a dynamic split request. Requested: %r',
                    dynamic_split_request)
                return None
            if fraction <= 0 or fraction >= 1:
                logging.warning(
                    'FileBasedReader cannot be split since the provided percentage '
                    'of work to be completed is out of the valid range (0, '
                    '1). Requested: %r', dynamic_split_request)
                return None
            target = iobase.ReaderPosition()
            target.byte_offset = (
                self.range_tracker.position_at_fraction(fraction))

        if not self.range_tracker.try_split(target.byte_offset):
            return None
        return iobase.DynamicSplitResultWithPosition(target)
Esempio n. 3
0
    def request_dynamic_split(self, dynamic_split_request):
        """Try to split this reader at the requested shuffle position.

        Only requests whose progress carries a position with a non-None
        ``shuffle_position`` are honoured; anything else is logged and
        refused.

        Returns:
          An ``iobase.DynamicSplitResultWithPosition`` built from the encoded
          shuffle position when the range tracker accepts the split at the
          decoded position; ``None`` otherwise.
        """
        assert dynamic_split_request is not None
        requested = dynamic_split_request.progress

        if requested.position is None:
            logging.warning(
                'GroupingShuffleReader only supports split at a Position.'
                ' Requested: %r', dynamic_split_request)
            return None

        encoded_position = requested.position.shuffle_position
        if encoded_position is None:
            logging.warning(
                'GroupingShuffleReader only supports split at a shuffle'
                ' position. Requested: %r', requested.position)
            return None

        accepted = self._range_tracker.try_split_at_position(
            _shuffle_decode(encoded_position))
        if not accepted:
            logging.info('Refusing to split GroupedShuffleReader %r at %s',
                         self, encoded_position)
            return None

        logging.info('Split GroupedShuffleReader at %s', encoded_position)
        return iobase.DynamicSplitResultWithPosition(
            iobase.ReaderPosition(shuffle_position=encoded_position))
Esempio n. 4
0
    def test_dynamic_split_result_with_position_to_cloud_stop_position(self):
        """A byte-offset split result converts to a cloud Position.

        The converted object must be a ``dataflow.Position`` whose
        ``byteOffset`` equals the byte offset of the split result's position.
        """
        split_result = iobase.DynamicSplitResultWithPosition(
            iobase.ReaderPosition(byte_offset=9999))

        convert = (
            apiclient.dynamic_split_result_with_position_to_cloud_stop_position)
        cloud_position = convert(split_result)

        self.assertIsNotNone(cloud_position)
        self.assertIsInstance(cloud_position, dataflow.Position)
        self.assertEqual(9999, cloud_position.byteOffset)
Esempio n. 5
0
    def test_update_stop_position_percent_complete_for_position(self):
        """Position-based split requests against a partially read text reader.

        The file holds five 4-character lines joined by newlines (24
        characters), so records start at offsets 0, 5, 10, 15 and 20. After
        three records have been read, only a split at an offset past the start
        of the last record read is expected to succeed.
        """
        lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
        source = fileio.TextFileSource(
            file_path=self.create_temp_file('\n'.join(lines)))

        def split_request_at(byte_offset):
            # Build a split request targeting the given byte offset.
            return iobase.DynamicSplitRequest(
                iobase.ReaderProgress(position=iobase.ReaderPosition(
                    byte_offset=byte_offset)))

        with source.reader() as reader:
            # Read three lines.
            records = iter(reader)
            for _ in range(3):
                next(records)

            # Expected refusals, in order: the ends of the range (0 and 25),
            # then offsets on or before the start of the last record read
            # (5 and 10).
            for refused_offset in (0, 25, 5, 10):
                self.try_splitting_reader_at(
                    reader, split_request_at(refused_offset), None)

            # An offset after the start of the last record read should be
            # accepted and echoed back as the stop position.
            self.try_splitting_reader_at(
                reader,
                split_request_at(15),
                iobase.DynamicSplitResultWithPosition(
                    iobase.ReaderPosition(byte_offset=15)))
Esempio n. 6
0
    def request_dynamic_split(self, dynamic_split_request):
        """Try to split this reader at the requested record index.

        Only requests whose progress carries a position with a non-None
        ``record_index`` are honoured; anything else is logged at debug level
        and refused.

        Returns:
          An ``iobase.DynamicSplitResultWithPosition`` wrapping the requested
          position when the range tracker accepts the split; ``None``
          otherwise.
        """
        assert dynamic_split_request is not None
        target = dynamic_split_request.progress.position

        if target is None:
            logging.debug(
                'InMemory reader only supports split requests that are '
                'based on positions. Received : %r', dynamic_split_request)
            return None

        record_index = target.record_index
        if record_index is None:
            logging.debug(
                'InMemory reader only supports split requests that are '
                'based on index positions. Received : %r',
                dynamic_split_request)
            return None

        if not self._range_tracker.try_split(record_index):
            return None
        return iobase.DynamicSplitResultWithPosition(target)
Esempio n. 7
0
  def test_dynamic_splitting_with_range(self):
    """Split requests against a shuffle reader restricted to a key range.

    The source covers encoded positions '0' to '3'. After one record has been
    read, requests at '0', '3' and '4' are expected to be refused and a
    request at '2' is expected to succeed.
    """
    source = GroupedShuffleSource(
        config_bytes='not used',
        coder=Base64Coder(),
        start_position=base64.urlsafe_b64encode('0'),
        end_position=base64.urlsafe_b64encode('3'))

    def position_at(key):
      # Reader position for the url-safe base64 encoding of `key`.
      return iobase.ReaderPosition(
          shuffle_position=base64.urlsafe_b64encode(key))

    def split_request_at(key):
      return iobase.DynamicSplitRequest(
          iobase.ReaderProgress(position=position_at(key)))

    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
      records = iter(reader)
      next(records)

      # Cannot split if the split request is out of range.
      for refused_key in ('0', '3', '4'):
        self.try_splitting_reader_at(
            reader, split_request_at(refused_key), None)

      # Successful split.
      self.try_splitting_reader_at(
          reader,
          split_request_at('2'),
          iobase.DynamicSplitResultWithPosition(position_at('2')))
Esempio n. 8
0
  def test_dynamic_splitting(self):
    """Split requests against an unbounded shuffle reader.

    A request on an unstarted reader is expected to be refused. After two
    records have been read, requests at '0' and '1' are expected to be
    refused and a request at '3' is expected to succeed.
    """
    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder())

    def position_at(key):
      # Reader position for the url-safe base64 encoding of `key`.
      return iobase.ReaderPosition(
          shuffle_position=base64.urlsafe_b64encode(key))

    def split_request_at(key):
      return iobase.DynamicSplitRequest(
          iobase.ReaderProgress(position=position_at(key)))

    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
      # Cannot split an unstarted reader.
      self.try_splitting_reader_at(reader, split_request_at('1'), None)

      records = iter(reader)
      for _ in range(2):
        next(records)

      # Cannot split since the provided split position is smaller than or
      # equal to the current position '1'.
      for refused_key in ('0', '1'):
        self.try_splitting_reader_at(
            reader, split_request_at(refused_key), None)

      # Successful split.
      self.try_splitting_reader_at(
          reader,
          split_request_at('3'),
          iobase.DynamicSplitResultWithPosition(position_at('3')))
Esempio n. 9
0
  def test_in_memory_source_dynamic_split(self):
    """Record-index split requests against an in-memory source of six items.

    Covers four scenarios, each with a fresh reader: an unstarted reader,
    out-of-range indices, indices the reader has already passed, and two
    successive successful splits.
    """
    source = inmemory.InMemorySource([10, 20, 30, 40, 50, 60],
                                     coder=FakeCoder())

    def split_request_at(index):
      # Split request targeting the given record index.
      return iobase.DynamicSplitRequest(
          iobase.ReaderProgress(
              position=iobase.ReaderPosition(record_index=index)))

    def accepted_at(index):
      # Split result expected when a split at `index` is accepted.
      return iobase.DynamicSplitResultWithPosition(
          stop_position=iobase.ReaderPosition(record_index=index))

    # Unstarted reader
    with source.reader() as reader:
      self.try_splitting_reader_at(reader, split_request_at(2), None)

    # Proposed split position out of range
    with source.reader() as reader:
      records = iter(reader)
      next(records)
      for out_of_range_index in (-1, 10):
        self.try_splitting_reader_at(
            reader, split_request_at(out_of_range_index), None)

    # Already read past proposed split position
    with source.reader() as reader:
      records = iter(reader)
      for _ in range(3):
        next(records)
      for passed_index in (1, 2):
        self.try_splitting_reader_at(
            reader, split_request_at(passed_index), None)

    # Successful split: first at index 4, then narrowing further to index 2.
    with source.reader() as reader:
      records = iter(reader)
      next(records)
      for accepted_index in (4, 2):
        self.try_splitting_reader_at(
            reader,
            split_request_at(accepted_index),
            accepted_at(accepted_index))