def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                             records_to_read, file_path):
    """Split a partially read text reader at `stop_offset` and check the result.

    Reads `records_to_read` records from a `fileio.TextFileSource` built from
    `file_path`, `start_offset` and `end_offset`, then requests a dynamic
    split at byte offset `stop_offset` through `self.try_splitting_reader_at`.
    When a split response is returned, the records read through the (split)
    reader plus the records of a new source starting at the returned stop
    position must equal the records of the unsplit source.

    Args:
      start_offset: start offset handed to the source under test.
      end_offset: end offset handed to the source under test.
      stop_offset: byte offset at which the split is requested.
      records_to_read: number of records to consume before splitting.
      file_path: path of the text file to read.

    Returns:
      None. Returns early, without checking anything, when the source holds
      fewer than `records_to_read` records.
    """
    source = fileio.TextFileSource(file_path, start_offset, end_offset)
    first_split_parts = []

    with source.reader() as reader:
        reader_iter = iter(reader)
        try:
            for _ in range(records_to_read):
                first_split_parts.append(next(reader_iter))
        except StopIteration:
            # Invalid case, given source does not contain this many records.
            return

        last_record_start = reader.range_tracker.last_record_start

        # No split result is expected when the stop offset is not past the
        # start of the last record read, when it sits on either end of the
        # source range, or when the reader is unstarted (nothing read yet).
        expect_refusal = (
            stop_offset <= last_record_start
            or stop_offset == start_offset
            or stop_offset == end_offset
            or records_to_read == 0)
        if expect_refusal:
            expected_split_response = None
        else:
            expected_split_response = iobase.DynamicSplitResultWithPosition(
                stop_position=iobase.ReaderPosition(byte_offset=stop_offset))

        split_request = iobase.DynamicSplitRequest(
            progress=iobase.ReaderProgress(
                iobase.ReaderPosition(byte_offset=stop_offset)))
        split_response = self.try_splitting_reader_at(
            reader, split_request, expected_split_response)

        # Reading remaining records from the updated reader.
        first_split_parts.extend(reader)

    if split_response is None:
        return

    # Total contents received by reading the two splits should be equal to
    # the result obtained by reading the original source.
    with source.reader() as original_reader:
        records_of_original = ''.join(original_reader)

    second_source = fileio.TextFileSource(
        file_path, split_response.stop_position.byte_offset, end_offset)
    with second_source.reader() as second_reader:
        records_of_second_split = ''.join(second_reader)

    records_of_first_split = ''.join(first_split_parts)
    self.assertEqual(records_of_original,
                     records_of_first_split + records_of_second_split)
def request_dynamic_split(self, dynamic_split_request):
    """Try to split this reader at the requested position or fraction.

    The request's progress may carry an explicit position, whose
    `byte_offset` is used, or a `percent_complete` fraction strictly between
    0 and 1, which is turned into a byte offset by
    `self.range_tracker.position_at_fraction`.

    Args:
      dynamic_split_request: the split request; must not be None.

    Returns:
      An `iobase.DynamicSplitResultWithPosition` wrapping the split position
      when `self.range_tracker.try_split` accepts it, otherwise None (also
      for requests that carry neither a position nor a usable fraction).
    """
    assert dynamic_split_request is not None
    requested = dynamic_split_request.progress
    split_position = requested.position

    if split_position is None:
        fraction = requested.percent_complete
        if fraction is None:
            logging.warning(
                'TextReader requires either a position or a percentage of work to '
                'be complete to perform a dynamic split request. Requested: %r',
                dynamic_split_request)
            return None
        if fraction <= 0 or fraction >= 1:
            logging.warning(
                'FileBasedReader cannot be split since the provided percentage '
                'of work to be completed is out of the valid range (0, '
                '1). Requested: %r', dynamic_split_request)
            return None
        # Translate the fraction into a concrete byte offset.
        split_position = iobase.ReaderPosition()
        split_position.byte_offset = (
            self.range_tracker.position_at_fraction(fraction))

    if not self.range_tracker.try_split(split_position.byte_offset):
        return None
    return iobase.DynamicSplitResultWithPosition(split_position)
def get_progress(self):
    """Report progress as the start of the last group seen by the tracker.

    Returns:
      An `iobase.ReaderProgress` whose position carries the url-safe base64
      encoding of `self._range_tracker.last_group_start()` as its shuffle
      position, or None when the tracker reports no last group start.
    """
    group_start = self._range_tracker.last_group_start()
    if group_start is not None:
        encoded_start = base64.urlsafe_b64encode(group_start)
        return iobase.ReaderProgress(
            position=iobase.ReaderPosition(shuffle_position=encoded_start))
    return None
def request_dynamic_split(self, dynamic_split_request):
    """Try to split this reader at the requested shuffle position.

    Only requests whose progress carries a position with a
    `shuffle_position` are considered; anything else is logged and refused.

    Args:
      dynamic_split_request: the split request; must not be None.

    Returns:
      An `iobase.DynamicSplitResultWithPosition` at the requested (still
      encoded) shuffle position when
      `self._range_tracker.try_split_at_position` accepts the decoded
      position, otherwise None.
    """
    assert dynamic_split_request is not None
    requested_position = dynamic_split_request.progress.position
    if requested_position is None:
        logging.warning(
            'GroupingShuffleReader only supports split at a Position.'
            ' Requested: %r', dynamic_split_request)
        return None

    encoded_position = requested_position.shuffle_position
    if encoded_position is None:
        logging.warning(
            'GroupingShuffleReader only supports split at a shuffle'
            ' position. Requested: %r', requested_position)
        return None

    # The tracker works on decoded positions; the result keeps the encoded one.
    accepted = self._range_tracker.try_split_at_position(
        _shuffle_decode(encoded_position))
    if not accepted:
        logging.info('Refusing to split GroupedShuffleReader %r at %s', self,
                     encoded_position)
        return None

    logging.info('Split GroupedShuffleReader at %s', encoded_position)
    return iobase.DynamicSplitResultWithPosition(
        iobase.ReaderPosition(shuffle_position=encoded_position))
def test_dynamic_split_result_with_position_to_cloud_stop_position(self):
    """A split result at byte offset 9999 converts to a cloud Position."""
    split_result = iobase.DynamicSplitResultWithPosition(
        iobase.ReaderPosition(byte_offset=9999))

    convert = apiclient.dynamic_split_result_with_position_to_cloud_stop_position
    cloud_stop_position = convert(split_result)

    # The converted value must exist, have the cloud type and keep the offset.
    self.assertIsNotNone(cloud_stop_position)
    self.assertIsInstance(cloud_stop_position, dataflow.Position)
    self.assertEqual(9999, cloud_stop_position.byteOffset)
def test_reader_progress_to_cloud_progress_position(self):
    """A position-based ReaderProgress converts to cloud ApproximateProgress.

    Builds a `ReaderProgress` at byte offset 9999, converts it with
    `apiclient.reader_progress_to_cloud_progress` and checks the type of the
    result, the type of its nested position and the preserved byte offset.
    """
    reader_position = iobase.ReaderPosition(byte_offset=9999)
    reader_progress = iobase.ReaderProgress(position=reader_position)

    cloud_progress = apiclient.reader_progress_to_cloud_progress(
        reader_progress)

    self.assertIsNotNone(cloud_progress)
    self.assertIsInstance(cloud_progress, dataflow.ApproximateProgress)
    self.assertIsNotNone(cloud_progress.position)
    self.assertIsInstance(cloud_progress.position, dataflow.Position)
    # assertEqual rather than the deprecated assertEquals alias, which newer
    # unittest versions no longer provide.
    self.assertEqual(9999, cloud_progress.position.byteOffset)
def test_dynamic_splitting_with_range(self):
    """Split requests against a shuffle source limited to positions '0'..'3'.

    After one record has been read, requests at '0', '3' and '4' are expected
    to be refused and a request at '2' is expected to succeed.
    """

    def encoded(raw_position):
        # Shuffle positions are exchanged in url-safe base64 form.
        return base64.urlsafe_b64encode(raw_position)

    def split_request_at(raw_position):
        # Split request targeting the encoded form of raw_position.
        return iobase.DynamicSplitRequest(iobase.ReaderProgress(
            position=iobase.ReaderPosition(
                shuffle_position=encoded(raw_position))))

    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder(),
        start_position=encoded('0'),
        end_position=encoded('3'))
    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
        # Keep the iterator referenced for the whole block, as reading must
        # have started before any split can be attempted.
        reader_iter = iter(reader)
        next(reader_iter)

        # Cannot split if split request is out of range
        for out_of_range in ('0', '3', '4'):
            self.try_splitting_reader_at(
                reader, split_request_at(out_of_range), None)

        # Successful split.
        self.try_splitting_reader_at(
            reader,
            split_request_at('2'),
            iobase.DynamicSplitResultWithPosition(iobase.ReaderPosition(
                shuffle_position=encoded('2'))))
def test_dynamic_splitting(self):
    """Split requests against an unbounded shuffle source.

    A request on an unstarted reader is expected to be refused; after two
    records have been read, requests at '0' and '1' are expected to be
    refused and a request at '3' is expected to succeed.
    """

    def encoded(raw_position):
        # Shuffle positions are exchanged in url-safe base64 form.
        return base64.urlsafe_b64encode(raw_position)

    def split_request_at(raw_position):
        # Split request targeting the encoded form of raw_position.
        return iobase.DynamicSplitRequest(iobase.ReaderProgress(
            position=iobase.ReaderPosition(
                shuffle_position=encoded(raw_position))))

    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder())
    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
        # Cannot split an unstarted reader
        self.try_splitting_reader_at(reader, split_request_at('1'), None)

        reader_iter = iter(reader)
        next(reader_iter)
        next(reader_iter)

        # Cannot split since the provided split position is smaller than or
        # equal to the current position '1'.
        for not_ahead in ('0', '1'):
            self.try_splitting_reader_at(
                reader, split_request_at(not_ahead), None)

        # Successful split.
        self.try_splitting_reader_at(
            reader,
            split_request_at('3'),
            iobase.DynamicSplitResultWithPosition(iobase.ReaderPosition(
                shuffle_position=encoded('3'))))
def cloud_position_to_reader_position(cloud_position):
    """Convert a cloud (service API) position into an `iobase.ReaderPosition`.

    The scalar fields (`end`, `key`, `byteOffset`, `recordIndex`,
    `shufflePosition`) are copied over as-is. A nested `concatPosition` is
    converted recursively into an `iobase.ConcatPosition`.

    Args:
      cloud_position: the cloud position message to convert.

    Returns:
      The equivalent `iobase.ReaderPosition`; its concat position is None
      when `cloud_position.concatPosition` is None.
    """
    concat_position = None
    cloud_concat = cloud_position.concatPosition
    if cloud_concat is not None:
        inner_position = cloud_position_to_reader_position(
            cloud_concat.position)
        # The sub-source index is a field of the nested concat position
        # (next to its inner `position`), not of the enclosing position, so
        # it has to be read from `cloud_concat`.
        concat_position = iobase.ConcatPosition(cloud_concat.index,
                                                inner_position)

    return iobase.ReaderPosition(cloud_position.end,
                                 cloud_position.key,
                                 cloud_position.byteOffset,
                                 cloud_position.recordIndex,
                                 cloud_position.shufflePosition,
                                 concat_position)
def get_progress(self):
    """Report progress as a concat position over the current sub-reader.

    Returns:
      None when no sub-reader is active (`self.current_reader_index` is
      negative or `self.current_reader` is None). Otherwise an
      `iobase.ReaderProgress` whose position is a concat position made of
      the current reader index and the sub-reader's own position (None when
      the sub-reader reports no progress at all).

    Raises:
      ValueError: if the sub-reader reports progress without a position.
    """
    if self.current_reader_index < 0 or self.current_reader is None:
        return None

    index = self.current_reader_index
    sub_reader_progress = self.current_reader.get_progress()

    inner_position = None
    if sub_reader_progress is not None:
        inner_position = sub_reader_progress.position
        if inner_position is None:
            raise ValueError('A concat source should only be created with '
                             'sub-sources that create readers that perform '
                             'progress reporting and dynamic work rebalancing '
                             'using positions')

    concat_position = iobase.ConcatPosition(index, inner_position)
    return iobase.ReaderProgress(
        position=iobase.ReaderPosition(concat_position=concat_position))
def test_update_stop_position_for_percent_complete(self):
    """Fraction-based split requests on a five-line text file.

    After three records have been read, requests at fractions 0, 1, 0.2 and
    0.4 are expected to be refused and a request at 0.6 is expected to
    succeed with a stop position at byte offset 15.
    """

    def split_request_at(fraction):
        # Split request expressed as a fraction of the work completed.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(percent_complete=fraction))

    lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
    source = fileio.TextFileSource(
        file_path=self.create_temp_file('\n'.join(lines)))

    with source.reader() as reader:
        # Reading three lines
        reader_iter = iter(reader)
        next(reader_iter)
        next(reader_iter)
        next(reader_iter)

        # Splitting at end of the range should be unsuccessful
        for boundary in (0, 1):
            self.try_splitting_reader_at(
                reader, split_request_at(boundary), None)

        # Splitting at positions on or before start offset of the last record
        for already_passed in (0.2, 0.4):
            self.try_splitting_reader_at(
                reader, split_request_at(already_passed), None)

        # Splitting at a position after the start offset of the last record
        # should be successful
        self.try_splitting_reader_at(
            reader,
            split_request_at(0.6),
            iobase.DynamicSplitResultWithPosition(
                iobase.ReaderPosition(byte_offset=15)))
def test_in_memory_source_dynamic_split(self):
    """Record-index split requests against a six-element in-memory source.

    Covers four scenarios, each on a fresh reader: an unstarted reader, split
    indices outside the source, split indices the reader has already passed,
    and two successive successful splits.
    """

    def split_request_at(index):
        # Split request targeting the given record index.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=index)))

    def split_result_at(index):
        # Split result expected for a successful split at the given index.
        return iobase.DynamicSplitResultWithPosition(
            stop_position=iobase.ReaderPosition(record_index=index))

    source = inmemory.InMemorySource([10, 20, 30, 40, 50, 60],
                                     coder=FakeCoder())

    # Unstarted reader
    with source.reader() as reader:
        self.try_splitting_reader_at(reader, split_request_at(2), None)

    # Proposed split position out of range
    with source.reader() as reader:
        reader_iter = iter(reader)
        next(reader_iter)
        for out_of_range in (-1, 10):
            self.try_splitting_reader_at(
                reader, split_request_at(out_of_range), None)

    # Already read past proposed split position
    with source.reader() as reader:
        reader_iter = iter(reader)
        next(reader_iter)
        next(reader_iter)
        next(reader_iter)
        for already_read in (1, 2):
            self.try_splitting_reader_at(
                reader, split_request_at(already_read), None)

    # Successful split
    with source.reader() as reader:
        reader_iter = iter(reader)
        next(reader_iter)
        for target in (4, 2):
            self.try_splitting_reader_at(
                reader, split_request_at(target), split_result_at(target))
def get_progress(self):
    """Report progress as the record index in `self._current_index`.

    Returns:
      An `iobase.ReaderProgress` positioned at that record index, or None
      when `self._current_index` is None.
    """
    index = self._current_index
    if index is not None:
        return iobase.ReaderProgress(
            position=iobase.ReaderPosition(record_index=index))
    return None
def get_progress(self):
    """Report progress as the start offset of the last record.

    Returns:
      An `iobase.ReaderProgress` whose position carries
      `self.range_tracker.last_record_start` as its byte offset.
    """
    last_record_start = self.range_tracker.last_record_start
    position = iobase.ReaderPosition(byte_offset=last_record_start)
    return iobase.ReaderProgress(position=position)
def get_progress(self):
    """Report progress as the record index in `self.current_index`.

    Returns:
      An `iobase.ReaderProgress` whose position carries `self.current_index`
      as its record index.
    """
    position = iobase.ReaderPosition(record_index=self.current_index)
    return iobase.ReaderProgress(position=position)
def test_reader_position_to_cloud_position(self):
    """Converting a byte-offset ReaderPosition yields a non-None result."""
    converted = apiclient.reader_position_to_cloud_position(
        iobase.ReaderPosition(byte_offset=9999))
    self.assertIsNotNone(converted)