def test_read_do_shuffle_write(self):
    """Reads four lines, maps each to (line, 1), and shuffle-writes the pairs.

    The shuffle sink is mocked so the test can assert on the exact
    (key, secondary_key, value) triples handed to the writer.
    """
    source_path = self.create_temp_file('a\nb\nc\nd\n')
    read_op = maptask.WorkerRead(
        fileio.TextFileSource(
            file_path=source_path,
            start_offset=0,
            end_offset=8,
            strip_trailing_newlines=True,
            coder=coders.StrUtf8Coder()),
        output_coders=[self.OUTPUT_CODER])
    pair_op = maptask.WorkerDoFn(
        serialized_fn=pickle_with_side_inputs(
            ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
        output_tags=['out'],
        output_coders=[self.OUTPUT_CODER],
        input=(0, 0),
        side_inputs=None)
    shuffle_op = maptask.WorkerShuffleWrite(
        shuffle_kind='group_keys',
        shuffle_writer_config='none',
        input=(1, 0),
        output_coders=(self.SHUFFLE_CODER,))
    sink_mock = mock.MagicMock()
    executor.MapTaskExecutor().execute(
        make_map_task([read_op, pair_op, shuffle_op]),
        test_shuffle_sink=sink_mock)
    # Make sure we have seen all the (k, v) writes.
    sink_mock.writer().Write.assert_has_calls(
        [mock.call(key, '', 1) for key in ('a', 'b', 'c', 'd')])
def test_create_do_with_side_text_file_write(self):
    """Pairs every main-input element with every line of a text side input."""
    side_path = self.create_temp_file('x\ny\n')
    main_elements = ['aa', 'bb']
    results = []
    read_op = maptask.WorkerRead(
        inmemory.InMemorySource(
            elements=[pickler.dumps(e) for e in main_elements],
            start_index=0,
            end_index=2),
        output_coders=[self.OUTPUT_CODER])
    do_op = maptask.WorkerDoFn(
        serialized_fn=pickle_with_side_inputs(
            ptransform.CallableWrapperDoFn(
                lambda x, side: ['%s:%s' % (x, s) for s in side]),
            tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
        output_tags=['out'],
        input=(0, 0),
        side_inputs=[
            maptask.WorkerSideInputSource(
                fileio.TextFileSource(
                    file_path=side_path,
                    start_offset=None,
                    end_offset=None,
                    strip_trailing_newlines=True,
                    coder=coders.StrUtf8Coder()),
                tag='textfile')],
        output_coders=[self.OUTPUT_CODER])
    write_op = maptask.WorkerInMemoryWrite(
        output_buffer=results,
        input=(1, 0),
        output_coders=(self.OUTPUT_CODER,))
    executor.MapTaskExecutor().execute(
        make_map_task([read_op, do_op, write_op]))
    # The side source was specified as collection therefore we should see
    # all elements of the side source.
    self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'], sorted(results))
def test_read_do_write_with_start_bundle(self):
    """Checks start_bundle/finish_bundle side effects of a custom DoFn."""
    source_path = self.create_temp_file('01234567890123456789\n0123456789')
    sink_path = '%s.out' % source_path
    finish_marker_path = '%s.finish' % source_path
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            fileio.TextFileSource(
                file_path=source_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                DoFnUsingStartBundle(finish_marker_path)),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        make_text_sink(sink_path, input=(1, 0)),
    ]))
    # end_offset=15 falls inside the first line, so only that whole line is
    # read; the DoFn is expected to prefix it with 'XYZ: '.
    with open(sink_path) as f:
        self.assertEqual('XYZ: 01234567890123456789\n', f.read())
    # Check that the finish_bundle method of the custom DoFn object left the
    # expected side-effect by writing a file with a specific content.
    with open(finish_marker_path) as f:
        self.assertEqual('finish called.', f.read())
def test_read_do_write(self):
    """End-to-end read -> map -> text-file write over a byte sub-range."""
    source_path = self.create_temp_file('01234567890123456789\n0123456789')
    sink_path = '%s.out' % source_path
    operations = [
        maptask.WorkerRead(
            fileio.TextFileSource(
                file_path=source_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        maptask.WorkerWrite(
            fileio.TextFileSink(
                file_path_prefix=sink_path,
                append_trailing_newlines=True,
                coder=coders.ToStringCoder()),
            input=(1, 0),
            output_coders=(coders.ToStringCoder(),)),
    ]
    executor.MapTaskExecutor().execute(make_map_task(operations))
    # end_offset=15 splits the first line, so the whole first line (and
    # nothing else) is read and written back with the 'XYZ: ' prefix.
    with open(sink_path) as f:
        self.assertEqual('XYZ: 01234567890123456789\n', f.read())
def __init__(self, topic, subscription=None, id_label=None,
             coder=coders.StrUtf8Coder()):
    """Initialize the source.

    Args:
      topic: Topic to read from (presumably a Cloud Pub/Sub topic, given
        the attribute names — confirm against callers).
      subscription: Optional subscription name; defaults to None.
      id_label: Optional label used for record ids; defaults to None.
      coder: Coder used to decode each payload. NOTE(review): the default
        is evaluated once at definition time and shared across all
        instances — presumably coders are stateless; confirm.
    """
    self.topic = topic
    self.subscription = subscription
    self.id_label = id_label
    self.coder = coder
def __init__(self,
             file_path,
             start_offset=None,
             end_offset=None,
             compression_type='AUTO',
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()):
    """Initialize a TextSource.

    Args:
      file_path: The file path to read from as a local file path or a GCS
        gs:// path. The path can contain glob characters (*, ?, and [...]
        sets).
      start_offset: The byte offset in the source text file that the reader
        should start reading. By default is 0 (beginning of file).
      end_offset: The byte offset in the file that the reader should stop
        reading. By default it is the end of the file.
      compression_type: Used to handle compressed input files. Typical value
        is 'AUTO'.
      strip_trailing_newlines: Indicates whether this source should remove
        the newline char in each line it reads before decoding that line.
      coder: Coder used to decode each line.

    Raises:
      TypeError: if file_path is not a string.

    If the file_path contains glob characters then the start_offset and
    end_offset must not be specified.

    The 'start_offset' and 'end_offset' pair provide a mechanism to divide
    the text file into multiple pieces for individual sources. Because the
    offset is measured by bytes, some complication arises when the offset
    splits in the middle of a text line. To avoid the scenario where two
    adjacent sources each get a fraction of a line we adopt the following
    rules:

    If start_offset falls inside a line (any character except the first one)
    then the source will skip the line and start with the next one.

    If end_offset falls inside a line (any character except the first one)
    then the source will contain that entire line.
    """
    # Python 2 type check: basestring covers both str and unicode paths.
    if not isinstance(file_path, basestring):
        raise TypeError('%s: file_path must be a string; got %r instead' %
                        (self.__class__.__name__, file_path))
    self.file_path = file_path
    self.start_offset = start_offset
    self.end_offset = end_offset
    self.compression_type = compression_type
    self.strip_trailing_newlines = strip_trailing_newlines
    self.coder = coder
    # Cheap prefix test; GCS paths get different handling downstream.
    self.is_gcs_source = file_path.startswith('gs://')
def test_read_do_write_with_undeclared_output(self):
    """A DoFn with an undeclared side output still writes its main output."""
    source_path = self.create_temp_file('01234567890123456789\n0123456789')
    sink_path = '%s.out' % source_path
    work_item = workitem.BatchWorkItem(None)
    work_item.map_task = make_map_task([
        maptask.WorkerRead(
            fileio.TextFileSource(
                file_path=source_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                DoFnUsingWithUndeclaredSideOutput()),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        make_text_sink(sink_path, input=(1, 0)),
    ])
    # NOTE(review): unlike the sibling tests, the executor here receives the
    # map task through its constructor and execute() takes no arguments —
    # presumably an alternate supported invocation; confirm.
    executor.MapTaskExecutor(work_item.map_task).execute()
    with open(sink_path) as f:
        self.assertEqual('01234567890123456789\n', f.read())
def __init__(self, topic, coder=coders.StrUtf8Coder()):
    """Initialize the sink/source endpoint.

    Args:
      topic: Topic to connect to (presumably a Cloud Pub/Sub topic, given
        the attribute name — confirm against callers).
      coder: Coder used for each payload. NOTE(review): the default is
        evaluated once at definition time and shared across all instances —
        presumably coders are stateless; confirm.
    """
    self.topic = topic
    self.coder = coder