def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                             records_to_read, file_path):
    """Split a partially read text source and check that no data is lost.

    Builds a ``fileio.TextFileSource`` from ``file_path``, ``start_offset``
    and ``end_offset``, consumes ``records_to_read`` records from its reader
    and then requests a dynamic split at byte offset ``stop_offset``. The
    expected outcome of that request is derived from the reader state and
    handed to ``self.try_splitting_reader_at``. When that helper returns a
    split result, the records read through the first reader followed by the
    records of a new source starting at the returned stop position must
    equal the records of the original, unsplit source.

    Args:
      start_offset: start offset passed to the source.
      end_offset: end offset passed to the source.
      stop_offset: byte offset at which the split is requested.
      records_to_read: number of records to consume before splitting.
      file_path: path of the file backing the source.

    Returns:
      None. Returns early, without checking anything, when the source yields
      fewer than ``records_to_read`` records.
    """
    source = fileio.TextFileSource(file_path, start_offset, end_offset)
    first_part = ''

    with source.reader() as reader:
        record_iter = iter(reader)
        remaining = records_to_read
        try:
            while remaining > 0:
                first_part += next(record_iter)
                remaining -= 1
        except StopIteration:
            # Invalid case: the source does not hold this many records, so
            # there is nothing meaningful to verify.
            return

        last_start = reader.range_tracker.last_record_start

        # No split result is expected when the requested offset is at or
        # before the start of the last record read, when it coincides with
        # either end of the source range, or when nothing has been read yet
        # (unstarted reader).
        no_split_expected = (
            stop_offset <= last_start
            or stop_offset == start_offset
            or stop_offset == end_offset
            or records_to_read == 0)
        if no_split_expected:
            expected = None
        else:
            expected = iobase.DynamicSplitResultWithPosition(
                stop_position=iobase.ReaderPosition(byte_offset=stop_offset))

        request = iobase.DynamicSplitRequest(progress=iobase.ReaderProgress(
            iobase.ReaderPosition(byte_offset=stop_offset)))
        split_response = self.try_splitting_reader_at(
            reader, request, expected)

        # Drain whatever the reader still yields after the split attempt.
        for record in reader:
            first_part += record

    if split_response is None:
        return

    # The two parts produced by the split, read back to back, must reproduce
    # exactly what the original source yields on its own.
    with source.reader() as original_reader:
        whole = ''.join(original_reader)

    residual_source = fileio.TextFileSource(
        file_path, split_response.stop_position.byte_offset, end_offset)
    with residual_source.reader() as residual_reader:
        second_part = ''.join(residual_reader)

    self.assertEqual(whole, first_part + second_part)
def test_update_stop_position_percent_complete_for_position(self):
    """Exercise byte-offset split requests on a partially read text source.

    The temp file is written from five 4-character lines joined by newlines
    (24 characters in total), so the lines start at offsets 0, 5, 10, 15
    and 20. Three records are consumed before any split is requested.
    """
    lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
    source = fileio.TextFileSource(
        file_path=self.create_temp_file('\n'.join(lines)))

    def split_request_at(offset):
        # Wrap a byte offset into a dynamic split request.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(byte_offset=offset)))

    with source.reader() as reader:
        # Read the first three lines.
        record_iter = iter(reader)
        for _ in range(3):
            next(record_iter)

        # Offsets at the boundaries of the range (the start, and a position
        # past the written content) must be rejected.
        for boundary_offset in (0, 25):
            self.try_splitting_reader_at(
                reader, split_request_at(boundary_offset), None)

        # Offsets on or before the start of the last record read must be
        # rejected as well.
        for consumed_offset in (5, 10):
            self.try_splitting_reader_at(
                reader, split_request_at(consumed_offset), None)

        # An offset after the start of the last record read is accepted and
        # echoed back as the new stop position.
        self.try_splitting_reader_at(
            reader,
            split_request_at(15),
            iobase.DynamicSplitResultWithPosition(
                iobase.ReaderPosition(byte_offset=15)))
def test_dynamic_splitting_with_range(self):
    """Check split requests against a shuffle source with explicit bounds.

    The source is created with start position '0' and end position '3'
    (both base64-encoded); one record is read before splits are requested.
    """
    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder(),
        start_position=base64.urlsafe_b64encode('0'),
        end_position=base64.urlsafe_b64encode('3'))

    def position_at(key):
        # Reader position for the base64-encoded shuffle key.
        return iobase.ReaderPosition(
            shuffle_position=base64.urlsafe_b64encode(key))

    def split_request_at(key):
        # Dynamic split request targeting the given shuffle key.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=position_at(key)))

    fake_reader = FakeShuffleReader([TEST_CHUNK1, TEST_CHUNK2])
    with source.reader(test_reader=fake_reader) as reader:
        record_iter = iter(reader)
        next(record_iter)

        # Requests that fall outside the splittable range are refused.
        for rejected_key in ('0', '3', '4'):
            self.try_splitting_reader_at(
                reader, split_request_at(rejected_key), None)

        # A request inside the range succeeds and reports the same position.
        self.try_splitting_reader_at(
            reader,
            split_request_at('2'),
            iobase.DynamicSplitResultWithPosition(position_at('2')))
def test_dynamic_splitting(self):
    """Check split requests against a shuffle source without bounds.

    Covers three situations: a reader that has not started, split positions
    at or before the current position, and a position ahead of it.
    """
    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder())

    def position_at(key):
        # Reader position for the base64-encoded shuffle key.
        return iobase.ReaderPosition(
            shuffle_position=base64.urlsafe_b64encode(key))

    def split_request_at(key):
        # Dynamic split request targeting the given shuffle key.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=position_at(key)))

    fake_reader = FakeShuffleReader([TEST_CHUNK1, TEST_CHUNK2])
    with source.reader(test_reader=fake_reader) as reader:
        # A reader that has not produced any record cannot be split.
        self.try_splitting_reader_at(reader, split_request_at('1'), None)

        record_iter = iter(reader)
        for _ in range(2):
            next(record_iter)

        # Split positions smaller than or equal to the current position '1'
        # are refused.
        for stale_key in ('0', '1'):
            self.try_splitting_reader_at(
                reader, split_request_at(stale_key), None)

        # A position ahead of the current one is accepted.
        self.try_splitting_reader_at(
            reader,
            split_request_at('3'),
            iobase.DynamicSplitResultWithPosition(position_at('3')))
def test_in_memory_source_dynamic_split(self):
    """Check record-index split requests on an in-memory source.

    A fresh reader over the same six-element source is opened for each
    scenario: unstarted reader, out-of-range index, index already consumed,
    and successful splits.
    """
    source = inmemory.InMemorySource(
        [10, 20, 30, 40, 50, 60], coder=FakeCoder())

    def split_request_at(index):
        # Dynamic split request targeting the given record index.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=index)))

    # A reader that has not been started cannot be split.
    with source.reader() as reader:
        self.try_splitting_reader_at(reader, split_request_at(2), None)

    # Proposed split positions outside the source are refused.
    with source.reader() as reader:
        record_iter = iter(reader)
        next(record_iter)
        for out_of_range_index in (-1, 10):
            self.try_splitting_reader_at(
                reader, split_request_at(out_of_range_index), None)

    # Proposed split positions the reader has already passed are refused.
    with source.reader() as reader:
        record_iter = iter(reader)
        for _ in range(3):
            next(record_iter)
        for consumed_index in (1, 2):
            self.try_splitting_reader_at(
                reader, split_request_at(consumed_index), None)

    # Positions ahead of the reader are accepted; a second, earlier split
    # on the same reader is accepted too.
    with source.reader() as reader:
        record_iter = iter(reader)
        next(record_iter)
        for accepted_index in (4, 2):
            self.try_splitting_reader_at(
                reader,
                split_request_at(accepted_index),
                iobase.DynamicSplitResultWithPosition(
                    stop_position=iobase.ReaderPosition(
                        record_index=accepted_index)))
def approximate_progress_to_dynamic_split_request(approximate_progress):
    """Build a dynamic split request from an approximate-progress value.

    Args:
      approximate_progress: value accepted by
        ``cloud_progress_to_reader_progress``.

    Returns:
      An ``iobase.DynamicSplitRequest`` wrapping the result of
      ``cloud_progress_to_reader_progress(approximate_progress)``.
    """
    reader_progress = cloud_progress_to_reader_progress(approximate_progress)
    return iobase.DynamicSplitRequest(reader_progress)