def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                             records_to_read, file_path):
  source = fileio.TextFileSource(file_path, start_offset, end_offset)

  records_of_first_split = ''

  with source.reader() as reader:
    reader_iter = iter(reader)
    i = 0
    try:
      while i < records_to_read:
        records_of_first_split += next(reader_iter)
        i += 1
    except StopIteration:
      # Invalid case, given source does not contain this many records.
      return

    last_record_start_after_reading = reader.range_tracker.last_record_start

    if stop_offset <= last_record_start_after_reading:
      expected_split_response = None
    elif stop_offset == start_offset or stop_offset == end_offset:
      expected_split_response = None
    elif records_to_read == 0:
      expected_split_response = None  # unstarted
    else:
      expected_split_response = iobase.DynamicSplitResultWithPosition(
          stop_position=iobase.ReaderPosition(byte_offset=stop_offset))

    split_response = self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(progress=iobase.ReaderProgress(
            iobase.ReaderPosition(byte_offset=stop_offset))),
        expected_split_response)

    # Reading remaining records from the updated reader.
    for line in reader:
      records_of_first_split += line

  if split_response is not None:
    # Total contents received by reading the two splits should be equal to
    # the result obtained by reading the original source.
    records_of_original = ''
    records_of_second_split = ''

    with source.reader() as original_reader:
      for line in original_reader:
        records_of_original += line

    new_source = fileio.TextFileSource(
        file_path, split_response.stop_position.byte_offset, end_offset)
    with new_source.reader() as reader:
      for line in reader:
        records_of_second_split += line

    self.assertEqual(records_of_original,
                     records_of_first_split + records_of_second_split)
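
# A minimal sketch of how the helper above might be invoked; the offsets,
# record count and file layout are illustrative assumptions, not calls taken
# from this suite. For a 24-byte file of five 4-character lines, records
# start at byte offsets 0, 5, 10, 15 and 20; after reading two records the
# last record read starts at offset 5, so a split at offset 15 should be
# accepted, while a split at offset 5 or at either end of the range should
# be rejected:
#
#   self.run_update_stop_position(
#       start_offset=0, end_offset=24, stop_offset=15,
#       records_to_read=2, file_path=path_to_24_byte_file)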

def test_read_do_shuffle_write(self):
  input_path = self.create_temp_file('a\nb\nc\nd\n')
  work_spec = [
      maptask.WorkerRead(
          fileio.TextFileSource(file_path=input_path,
                                start_offset=0,
                                end_offset=8,
                                strip_trailing_newlines=True,
                                coder=coders.StrUtf8Coder()),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                 shuffle_writer_config='none',
                                 input=(1, 0),
                                 output_coders=(self.SHUFFLE_CODER,))
  ]
  shuffle_sink_mock = mock.MagicMock()
  executor.MapTaskExecutor().execute(
      make_map_task(work_spec), test_shuffle_sink=shuffle_sink_mock)
  # Make sure we have seen all the (k, v) writes.
  shuffle_sink_mock.writer().Write.assert_has_calls(
      [mock.call('a', '', 1), mock.call('b', '', 1),
       mock.call('c', '', 1), mock.call('d', '', 1)])

def test_read_do_write_with_start_bundle(self):
  input_path = self.create_temp_file('01234567890123456789\n0123456789')
  output_path = '%s.out' % input_path
  finish_path = '%s.finish' % input_path
  executor.MapTaskExecutor().execute(
      make_map_task([
          maptask.WorkerRead(
              fileio.TextFileSource(file_path=input_path,
                                    start_offset=0,
                                    end_offset=15,
                                    strip_trailing_newlines=True,
                                    coder=coders.StrUtf8Coder()),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  DoFnUsingStartBundle(finish_path)),
              output_tags=['out'],
              output_coders=[self.OUTPUT_CODER],
              input=(0, 0),
              side_inputs=None),
          make_text_sink(output_path, input=(1, 0))
      ]))
  with open(output_path) as f:
    self.assertEqual('XYZ: 01234567890123456789\n', f.read())
  # Check that the finish_bundle method of the custom DoFn object left the
  # expected side effect by writing a file with the expected content.
  with open(finish_path) as f:
    self.assertEqual('finish called.', f.read())
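
# A note on the read range used above (an inference about TextFileSource
# semantics, not stated in the test itself): the first line,
# '01234567890123456789\n', occupies bytes 0-20, so end_offset=15 falls
# inside it. The reader is expected to finish the record that straddles its
# end offset and to skip any record starting at or after it, which is why
# only the first line reaches the DoFn.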

def test_create_do_with_side_text_file_write(self):
  input_path = self.create_temp_file('x\ny\n')
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(
      make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(
                  elements=[pickler.dumps(e) for e in elements],
                  start_index=0,
                  end_index=2),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, s) for s in side]),
                  tag_and_type=('textfile',
                                pvalue.IterablePCollectionView, ())),
              output_tags=['out'],
              input=(0, 0),
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      fileio.TextFileSource(
                          file_path=input_path,
                          start_offset=None,
                          end_offset=None,
                          strip_trailing_newlines=True,
                          coder=coders.StrUtf8Coder()),
                      tag='textfile')],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                      input=(1, 0),
                                      output_coders=(self.OUTPUT_CODER,))
      ]))
  # The side source was specified as a collection, therefore we should see
  # all elements of the side source.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))
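
# The expected output above is the cross product of the main input
# ['aa', 'bb'] and the side input lines ['x', 'y']: the DoFn emits one
# '%s:%s' element per (main, side) pair, giving four elements in total.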

def test_read_do_write(self):
  input_path = self.create_temp_file('01234567890123456789\n0123456789')
  output_path = '%s.out' % input_path
  executor.MapTaskExecutor().execute(
      make_map_task([
          maptask.WorkerRead(
              fileio.TextFileSource(file_path=input_path,
                                    start_offset=0,
                                    end_offset=15,
                                    strip_trailing_newlines=True,
                                    coder=coders.StrUtf8Coder()),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x: ['XYZ: %s' % x])),
              output_tags=['out'],
              output_coders=[self.OUTPUT_CODER],
              input=(0, 0),
              side_inputs=None),
          maptask.WorkerWrite(
              fileio.TextFileSink(file_path_prefix=output_path,
                                  append_trailing_newlines=True,
                                  coder=coders.ToStringCoder()),
              input=(1, 0),
              output_coders=(coders.ToStringCoder(),))
      ]))
  with open(output_path) as f:
    self.assertEqual('XYZ: 01234567890123456789\n', f.read())

def test_read_entire_file(self):
  lines = ['First', 'Second', 'Third']
  source = fileio.TextFileSource(
      file_path=self.create_temp_file('\n'.join(lines)))
  read_lines = []
  with source.reader() as reader:
    for line in reader:
      read_lines.append(line)
  self.assertEqual(read_lines, lines)

def test_create_do_with_side_avro_file_write(self):
  input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
  input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(
      make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(
                  elements=[pickler.dumps(e) for e in elements],
                  start_index=0,
                  end_index=2),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, s) for s in side]),
                  tag_and_type=('sometag',
                                pvalue.IterablePCollectionView, ())),
              output_tags=['out'],
              input=(0, 0),
              # Note that the two side inputs have the same tag. This is
              # quite common for intermediary PCollections used as side
              # inputs that are saved as AVRO files. The files will contain
              # the sharded PCollection.
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      fileio.TextFileSource(
                          file_path=input_path1,
                          coder=coders.Base64PickleCoder()),
                      tag='sometag'),
                  maptask.WorkerSideInputSource(
                      fileio.TextFileSource(
                          file_path=input_path2,
                          coder=coders.Base64PickleCoder()),
                      tag='sometag')
              ],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                      input=(1, 0),
                                      output_coders=(self.OUTPUT_CODER,))
      ]))
  # The side source was specified as a collection, therefore we should see
  # all elements of the side source (here, both 'x' and 'y').
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def read_with_offsets(self, input_lines, output_lines,
                      start_offset=None, end_offset=None):
  source = fileio.TextFileSource(
      file_path=self.create_temp_file('\n'.join(input_lines)),
      start_offset=start_offset,
      end_offset=end_offset)
  read_lines = []
  with source.reader() as reader:
    for line in reader:
      read_lines.append(line)
  self.assertEqual(read_lines, output_lines)

def test_progress_entire_file(self):
  lines = ['First', 'Second', 'Third']
  source = fileio.TextFileSource(
      file_path=self.create_temp_file('\n'.join(lines)))
  progress_record = []
  with source.reader() as reader:
    self.assertEqual(-1, reader.get_progress().position.byte_offset)
    for line in reader:
      self.assertIsNotNone(line)
      progress_record.append(reader.get_progress().position.byte_offset)
    self.assertEqual(13, reader.get_progress().position.byte_offset)
  self.assertEqual(len(progress_record), 3)
  self.assertEqual(progress_record, [0, 6, 13])
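
# The offsets asserted above follow from the file layout: the contents are
# 'First\nSecond\nThird', so records start at bytes 0, 6 ('First' plus its
# newline) and 13 (6 plus 'Second' and its newline). This assumes
# get_progress() reports the start offset of the most recently returned
# record, which is what the assertions encode.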

def progress_with_offsets(self, input_lines,
                          start_offset=None, end_offset=None):
  source = fileio.TextFileSource(
      file_path=self.create_temp_file('\n'.join(input_lines)),
      start_offset=start_offset,
      end_offset=end_offset)
  progress_record = []
  with source.reader() as reader:
    self.assertEqual(reader.get_progress().position.byte_offset, -1)
    for line in reader:
      self.assertIsNotNone(line)
      progress_record.append(reader.get_progress().position.byte_offset)
  previous = 0
  for current in progress_record:
    self.assertGreater(current, previous)
    previous = current
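
# Note: the monotonicity loop above starts with previous = 0 and asserts
# strictly increasing offsets, so this helper implicitly assumes the first
# record the reader reports starts past byte 0, i.e. it is meant to be
# called with a start_offset that lands inside the file rather than at its
# very beginning.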

def test_update_stop_position_percent_complete_for_position(self):
  lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
  source = fileio.TextFileSource(
      file_path=self.create_temp_file('\n'.join(lines)))
  with source.reader() as reader:
    # Reading three lines
    reader_iter = iter(reader)
    next(reader_iter)
    next(reader_iter)
    next(reader_iter)

    # Splitting at the start of the range or beyond its end should be
    # unsuccessful
    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=iobase.ReaderPosition(
                byte_offset=0))),
        None)
    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=iobase.ReaderPosition(
                byte_offset=25))),
        None)

    # Splitting at positions on or before the start offset of the last
    # record read should be unsuccessful
    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=iobase.ReaderPosition(
                byte_offset=5))),
        None)
    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=iobase.ReaderPosition(
                byte_offset=10))),
        None)

    # Splitting at a position after the start offset of the last record
    # should be successful
    self.try_splitting_reader_at(
        reader,
        iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=iobase.ReaderPosition(
                byte_offset=15))),
        iobase.DynamicSplitResultWithPosition(
            iobase.ReaderPosition(byte_offset=15)))
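
# For reference: the five 4-character lines above form a 24-byte file with
# records starting at byte offsets 0, 5, 10, 15 and 20. After three reads
# the last record read starts at offset 10, which is why splits at offsets
# 5 and 10 are rejected while a split at offset 15 succeeds.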

def test_read_do_write_with_undeclared_output(self):
  input_path = self.create_temp_file('01234567890123456789\n0123456789')
  output_path = '%s.out' % input_path
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(
          fileio.TextFileSource(file_path=input_path,
                                start_offset=0,
                                end_offset=15,
                                strip_trailing_newlines=True,
                                coder=coders.StrUtf8Coder()),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              DoFnUsingWithUndeclaredSideOutput()),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      make_text_sink(output_path, input=(1, 0))
  ])
  # The map task is passed to execute(), consistent with the calling
  # convention used by the other tests in this file.
  executor.MapTaskExecutor().execute(work_item.map_task)
  with open(output_path) as f:
    self.assertEqual('01234567890123456789\n', f.read())