コード例 #1
0
 def test_read_do_shuffle_write(self):
     """Reads four lines, maps each to (line, 1), and writes to shuffle."""
     path = self.create_temp_file('a\nb\nc\nd\n')
     # end_offset=8 covers all four two-byte lines of the temp file.
     source = fileio.TextFileSource(
         file_path=path,
         start_offset=0,
         end_offset=8,
         strip_trailing_newlines=True,
         coder=coders.StrUtf8Coder())
     pair_with_one = pickle_with_side_inputs(
         ptransform.CallableWrapperDoFn(lambda x: [(x, 1)]))
     operations = [
         maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(serialized_fn=pair_with_one,
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='none',
                                    input=(1, 0),
                                    output_coders=(self.SHUFFLE_CODER,)),
     ]
     sink_mock = mock.MagicMock()
     executor.MapTaskExecutor().execute(make_map_task(operations),
                                        test_shuffle_sink=sink_mock)
     # Make sure we have seen all the (k, v) writes.
     expected_writes = [mock.call(key, '', 1) for key in ('a', 'b', 'c', 'd')]
     sink_mock.writer().Write.assert_has_calls(expected_writes)
コード例 #2
0
 def test_create_do_with_side_text_file_write(self):
   """Runs read -> DoFn -> in-memory write with a text file as side input.

   The DoFn pairs every main-input element with every line of the side
   input file, so the collected output must contain the full cross
   product of main elements and side-input lines.
   """
   input_path = self.create_temp_file('x\ny\n')
   elements = ['aa', 'bb']
   output_buffer = []
   # Work spec: in-memory source -> DoFn (with 'textfile' side input) ->
   # in-memory sink that collects results into output_buffer.
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               # Elements are serialized with pickler.dumps before being
               # handed to the source; indices select the [0, 2) range,
               # i.e. both elements.
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               # The side input is declared as an iterable view under the
               # tag 'textfile', matching the WorkerSideInputSource below.
               tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(fileio.TextFileSource(
                   file_path=input_path, start_offset=None, end_offset=None,
                   strip_trailing_newlines=True,
                   coder=coders.StrUtf8Coder()),
                                             tag='textfile')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a collection, therefore we should see
   # all elements of the side source paired with each main input element.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
コード例 #3
0
 def test_read_do_write_with_start_bundle(self):
     """A DoFn with bundle hooks produces output plus a finish-side-effect."""
     source_path = self.create_temp_file('01234567890123456789\n0123456789')
     sink_path = '%s.out' % source_path
     finish_marker = '%s.finish' % source_path
     operations = [
         maptask.WorkerRead(fileio.TextFileSource(
             file_path=source_path,
             start_offset=0,
             end_offset=15,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             DoFnUsingStartBundle(finish_marker)),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         make_text_sink(sink_path, input=(1, 0)),
     ]
     executor.MapTaskExecutor().execute(make_map_task(operations))
     # end_offset=15 falls inside the first line, so exactly that one line
     # (with the DoFn's 'XYZ: ' prefix) must appear in the sink.
     with open(sink_path) as f:
         self.assertEqual('XYZ: 01234567890123456789\n', f.read())
     # Check that the finish_bundle method of the custom DoFn object left the
     # expected side-effect by writing a file with a specific content.
     with open(finish_marker) as f:
         self.assertEqual('finish called.', f.read())
コード例 #4
0
 def test_read_do_write(self):
   """Full pipeline: text read -> 'XYZ: ' prefixing DoFn -> text sink."""
   src = self.create_temp_file('01234567890123456789\n0123456789')
   dst = '%s.out' % src
   read_op = maptask.WorkerRead(
       fileio.TextFileSource(file_path=src,
                             start_offset=0,
                             end_offset=15,
                             strip_trailing_newlines=True,
                             coder=coders.StrUtf8Coder()),
       output_coders=[self.OUTPUT_CODER])
   do_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)
   write_op = maptask.WorkerWrite(
       fileio.TextFileSink(file_path_prefix=dst,
                           append_trailing_newlines=True,
                           coder=coders.ToStringCoder()),
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))
   executor.MapTaskExecutor().execute(
       make_map_task([read_op, do_op, write_op]))
   # end_offset=15 lands inside the first line, so only that line is read
   # and the sink ends up with the single prefixed line.
   with open(dst) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
コード例 #5
0
 def __init__(self,
              topic,
              subscription=None,
              id_label=None,
              coder=coders.StrUtf8Coder()):
     """Remembers the source configuration as instance attributes.

     Args:
       topic: Required topic identifier to read from.
       subscription: Optional subscription identifier.
       id_label: Optional label; None by default.
       coder: Coder for this source (defaults to a UTF-8 string coder).
     """
     # NOTE(review): the default coder instance is created once at def time
     # and shared by every instance constructed without an explicit coder;
     # presumably coder objects are stateless -- confirm before relying on it.
     self.coder = coder
     self.id_label = id_label
     self.subscription = subscription
     self.topic = topic
コード例 #6
0
ファイル: fileio.py プロジェクト: volnt/DataflowPythonSDK
    def __init__(self,
                 file_path,
                 start_offset=None,
                 end_offset=None,
                 compression_type='AUTO',
                 strip_trailing_newlines=True,
                 coder=coders.StrUtf8Coder()):
        """Initialize a TextFileSource.

    Args:
      file_path: The file path to read from as a local file path or a GCS
        gs:// path. The path can contain glob characters (*, ?, and [...]
        sets).
      start_offset: The byte offset in the source text file that the reader
        should start reading. By default is 0 (beginning of file).
      end_offset: The byte offset in the file that the reader should stop
        reading. By default it is the end of the file.
      compression_type: Used to handle compressed input files. Typical value
          is 'AUTO'.
      strip_trailing_newlines: Indicates whether this source should remove
          the newline char in each line it reads before decoding that line.
      coder: Coder used to decode each line.

    Raises:
      TypeError: if file_path is not a string.

    If the file_path contains glob characters then the start_offset and
    end_offset must not be specified.

    The 'start_offset' and 'end_offset' pair provide a mechanism to divide the
    text file into multiple pieces for individual sources. Because the offset
    is measured by bytes, some complication arises when the offset splits in
    the middle of a text line. To avoid the scenario where two adjacent sources
    each get a fraction of a line we adopt the following rules:

    If start_offset falls inside a line (any character except the first one)
    then the source will skip the line and start with the next one.

    If end_offset falls inside a line (any character except the first one) then
    the source will contain that entire line.
    """
        # Fail fast on non-string paths so the error surfaces at construction
        # time rather than when the source is first read.
        if not isinstance(file_path, basestring):
            raise TypeError('%s: file_path must be a string;  got %r instead' %
                            (self.__class__.__name__, file_path))

        self.file_path = file_path
        self.start_offset = start_offset
        self.end_offset = end_offset
        self.compression_type = compression_type
        self.strip_trailing_newlines = strip_trailing_newlines
        self.coder = coder

        # True when the path points at Google Cloud Storage rather than the
        # local filesystem.
        self.is_gcs_source = file_path.startswith('gs://')
コード例 #7
0
    def test_read_do_write_with_undeclared_output(self):
        """A DoFn emitting to an undeclared output still writes its main output."""
        src_path = self.create_temp_file('01234567890123456789\n0123456789')
        dst_path = '%s.out' % src_path
        operations = [
            maptask.WorkerRead(fileio.TextFileSource(
                file_path=src_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                DoFnUsingWithUndeclaredSideOutput()),
                               output_tags=['out'],
                               output_coders=[self.OUTPUT_CODER],
                               input=(0, 0),
                               side_inputs=None),
            make_text_sink(dst_path, input=(1, 0)),
        ]
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task(operations)

        executor.MapTaskExecutor(work_item.map_task).execute()
        # Only the first line fits in the [0, 15) byte range and it must
        # arrive at the sink via the declared 'out' output.
        with open(dst_path) as f:
            self.assertEqual('01234567890123456789\n', f.read())
コード例 #8
0
 def __init__(self, topic, coder=coders.StrUtf8Coder()):
     """Remembers the target topic and the coder as instance attributes.

     Args:
       topic: Required topic identifier to write to.
       coder: Coder for this sink (defaults to a UTF-8 string coder).
     """
     self.coder = coder
     self.topic = topic