Esempio n. 1
0
 def test_read_do_shuffle_write(self):
     """Execute a read -> DoFn -> shuffle-write map task with a mocked sink.

     The input file holds the lines 'a', 'b', 'c' and 'd' and is read with
     start_offset=0 and end_offset=8. The DoFn turns each element x into
     the pair (x, 1). The test then asserts that the mocked shuffle
     writer received Write(key, '', 1) for each key, in order.
     """
     temp_path = self.create_temp_file('a\nb\nc\nd\n')

     # Operation 0: read the text file line by line.
     read_op = maptask.WorkerRead(
         fileio.TextFileSource(
             file_path=temp_path,
             start_offset=0,
             end_offset=8,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()),
         output_coders=[self.OUTPUT_CODER])

     # Operation 1: consume output 0 of operation 0 and emit (x, 1).
     pair_with_one_op = maptask.WorkerDoFn(
         serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
         output_tags=['out'],
         output_coders=[self.OUTPUT_CODER],
         input=(0, 0),
         side_inputs=None)

     # Operation 2: consume output 0 of operation 1 and write to shuffle.
     shuffle_write_op = maptask.WorkerShuffleWrite(
         shuffle_kind='group_keys',
         shuffle_writer_config='none',
         input=(1, 0),
         output_coders=(self.SHUFFLE_CODER, ))

     sink = mock.MagicMock()
     map_task = make_map_task([read_op, pair_with_one_op, shuffle_write_op])
     executor.MapTaskExecutor().execute(map_task, test_shuffle_sink=sink)

     # Every (k, v) pair must have reached the shuffle writer.
     expected_writes = [mock.call(key, '', 1) for key in ('a', 'b', 'c', 'd')]
     sink.writer().Write.assert_has_calls(expected_writes)
    def test_concat_source_to_shuffle_sink(self):
        """Check the work item parsed from the concat-source message.

        The work item must be non-None, carry proto id 1234, and hold three
        operations: a read from a ConcatSource over two text-file
        sub-sources, a DoFn, and a shuffle write.
        """
        work = workitem.get_work_items(
            get_concat_source_to_shuffle_sink_message())
        self.assertIsNotNone(work)

        # The two sub-sources differ only in their file path.
        sub_source_paths = (
            'gs://sort_g/input_small_files/ascii_sort_1MB_input.0000006',
            'gs://sort_g/input_small_files/ascii_sort_1MB_input.0000007',
        )
        expected_sub_sources = [
            io.TextFileSource(file_path=path,
                              start_offset=0,
                              end_offset=1000000,
                              strip_trailing_newlines=True,
                              coder=CODER)
            for path in sub_source_paths
        ]
        expected_concat_source = concat_reader.ConcatSource(
            expected_sub_sources)

        actual = (work.proto.id, work.map_task.operations)

        read_op = maptask.WorkerRead(expected_concat_source,
                                     output_coders=[CODER])
        do_fn_op = maptask.WorkerDoFn(serialized_fn='code',
                                      output_tags=['out'],
                                      input=(1, 0),
                                      side_inputs=[],
                                      output_coders=[CODER])
        shuffle_write_op = maptask.WorkerShuffleWrite(
            shuffle_kind='group_keys',
            shuffle_writer_config='opaque',
            input=(1, 0),
            output_coders=(CODER, ))

        self.assertEqual(
            actual, (1234, [read_op, do_fn_op, shuffle_write_op]))
 def test_text_source_to_shuffle_sink(self):
     """Check the work item parsed from the text-source message.

     The work item must carry proto id 1234 and hold three operations: a
     read from a single TextFileSource, a DoFn, and a shuffle write.
     """
     work = workitem.get_work_items(
         get_text_source_to_shuffle_sink_message())
     actual = (work.proto.id, work.map_task.operations)

     text_source = io.TextFileSource(file_path='gs://somefile',
                                     start_offset=123,
                                     end_offset=123123,
                                     strip_trailing_newlines=True,
                                     coder=CODER)
     expected_operations = [
         maptask.WorkerRead(text_source, output_coders=[CODER]),
         maptask.WorkerDoFn(serialized_fn='code',
                            output_tags=['out'],
                            input=(1, 0),
                            side_inputs=[],
                            output_coders=[CODER]),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='opaque',
                                    input=(1, 0),
                                    output_coders=(CODER, )),
     ]

     self.assertEqual(actual, (1234, expected_operations))