Example #1
    def __init__(self, fn_or_label, *args, **kwargs):
        if fn_or_label is None or isinstance(fn_or_label, basestring):
            label = fn_or_label
            fn, args = args[0], args[1:]
        else:
            label = None
            fn = fn_or_label
        if isinstance(fn, type) and issubclass(fn, typehints.WithTypeHints):
            # Don't treat Fn class objects as callables.
            raise ValueError('Use %s() not %s.' % (fn.__name__, fn.__name__))
        self.fn = self.make_fn(fn)
        # Now that we have figured out the label, initialize the super-class.
        super(PTransformWithSideInputs, self).__init__(label=label)

        if (any([isinstance(v, pvalue.PCollection) for v in args]) or
            any([isinstance(v, pvalue.PCollection)
                 for v in kwargs.itervalues()])):
            raise error.SideInputError(
                'PCollection used directly as side input argument. Specify '
                'AsIter(pcollection) or AsSingleton(pcollection) to indicate how the '
                'PCollection is to be used.')
        self.args, self.kwargs, self.side_inputs = util.remove_objects_from_args(
            args, kwargs, pvalue.PCollectionView)
        self.raw_side_inputs = args, kwargs

        # Prevent name collisions with fns of the form '<function <lambda> at ...>'
        self._cached_fn = self.fn

        # Ensure fn and side inputs are picklable for remote execution.
        self.fn = pickler.loads(pickler.dumps(self.fn))
        self.args = pickler.loads(pickler.dumps(self.args))
        self.kwargs = pickler.loads(pickler.dumps(self.kwargs))

        # For type hints, because loads(dumps(class)) != class.
        self.fn = self._cached_fn
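The dumps/loads round-trip above exists to fail fast when a user function cannot be shipped to remote workers. A minimal sketch of the same fail-fast idea using only the standard library; the SDK's pickler is more capable than plain pickle (it round-trips lambdas and nested classes, as later examples on this page show), so this is only an approximation:

import pickle

def check_picklable(obj):
    # Round-trip obj so serialization errors surface at pipeline-construction
    # time instead of on a remote worker.
    return pickle.loads(pickle.dumps(obj))

def add_one(x):
    return x + 1

check_picklable(add_one)  # module-level functions round-trip fine
try:
    check_picklable(i for i in range(3))
except TypeError as e:
    print('not picklable:', e)  # generators cannot be serialized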
Example #2
  def __init__(self, fn_or_label, *args, **kwargs):
    if fn_or_label is None or isinstance(fn_or_label, basestring):
      label = fn_or_label
      fn, args = args[0], args[1:]
    else:
      label = None
      fn = fn_or_label
    if isinstance(fn, type) and issubclass(fn, typehints.WithTypeHints):
      # Don't treat Fn class objects as callables.
      raise ValueError('Use %s() not %s.' % (fn.__name__, fn.__name__))
    self.fn = self.make_fn(fn)
    # Now that we have figured out the label, initialize the super-class.
    super(PTransformWithSideInputs, self).__init__(label=label)

    if (any([isinstance(v, pvalue.PCollection) for v in args]) or
        any([isinstance(v, pvalue.PCollection) for v in kwargs.itervalues()])):
      raise error.SideInputError(
          'PCollection used directly as side input argument. Specify '
          'AsIter(pcollection) or AsSingleton(pcollection) to indicate how the '
          'PCollection is to be used.')
    self.args, self.kwargs, self.side_inputs = util.remove_objects_from_args(
        args, kwargs, pvalue.PCollectionView)
    self.raw_side_inputs = args, kwargs

    # Prevent name collisions with fns of the form '<function <lambda> at ...>'
    self._cached_fn = self.fn

    # Ensure fn and side inputs are picklable for remote execution.
    self.fn = pickler.loads(pickler.dumps(self.fn))
    self.args = pickler.loads(pickler.dumps(self.args))
    self.kwargs = pickler.loads(pickler.dumps(self.kwargs))

    # For type hints, because loads(dumps(class)) != class.
    self.fn = self._cached_fn
Example #3
 def test_create_do_with_side_in_memory_write(self):
   elements = ['abc', 'def', 'ghi']
   side_elements = ['x', 'y', 'z']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=3),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, side)]),
               tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                             (False, None))),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(
                   inmemory.InMemorySource(
                       elements=[pickler.dumps(e) for e in side_elements],
                       start_index=None,
                       end_index=None),
                   tag='inmemory')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(
           output_buffer=output_buffer,
           input=(1, 0),
           output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a singleton, therefore we should see
   # only the first element appended.
   self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
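The contrast between this test and the iterable side-input tests further down is purely singleton-versus-collection semantics. A plain-Python sketch of what the two views deliver to the DoFn (the helper names here are illustrative, not SDK API):

def with_singleton_side(main, side):
    first = next(iter(side))  # singleton view: only the first element is used
    return ['%s:%s' % (x, first) for x in main]

def with_iterable_side(main, side):
    # iterable view: the DoFn sees every side element for every main element
    return ['%s:%s' % (x, s) for x in main for s in side]

assert with_singleton_side(['abc', 'def', 'ghi'], ['x', 'y', 'z']) == [
    'abc:x', 'def:x', 'ghi:x']
assert sorted(with_iterable_side(['aa', 'bb'], ['x', 'y'])) == [
    'aa:x', 'aa:y', 'bb:x', 'bb:y']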
Example #4
 def test_nested_class(self):
   """Tests that a nested class object is pickled correctly."""
   self.assertEquals(
       'X:abc',
       loads(dumps(module_test.TopClass.NestedClass('abc'))).datum)
   self.assertEquals(
       'Y:abc',
       loads(dumps(module_test.TopClass.MiddleClass.NestedClass('abc'))).datum)
Example #5
 def test_get_coder_with_composite_custom_coder(self):
   typecoders.registry.register_coder(CustomClass, CustomCoder)
   coder = typecoders.registry.get_coder(typehints.KV[CustomClass, str])
   revived_coder = pickler.loads(pickler.dumps(coder))
   self.assertEqual(
       (CustomClass(123), 'abc'),
       revived_coder.decode(revived_coder.encode((CustomClass(123), 'abc'))))
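CustomClass and CustomCoder are defined in the test module and are not shown on this page. A self-contained toy version of the property being tested (a coder still works after a pickling round-trip), using the standard pickle module as a stand-in for the SDK pickler:

import pickle

class Point(object):  # stand-in for CustomClass
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        return isinstance(other, Point) and self.value == other.value

class PointCoder(object):  # stand-in for CustomCoder
    def encode(self, obj):
        return str(obj.value).encode('ascii')

    def decode(self, data):
        return Point(int(data))

revived = pickle.loads(pickle.dumps(PointCoder()))
assert revived.decode(revived.encode(Point(123))) == Point(123)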
Example #6
 def test_get_coder_with_composite_custom_coder(self):
     typecoders.registry.register_coder(CustomClass, CustomCoder)
     coder = typecoders.registry.get_coder(typehints.KV[CustomClass, str])
     revived_coder = pickler.loads(pickler.dumps(coder))
     self.assertEqual((CustomClass(123), 'abc'),
                      revived_coder.decode(
                          revived_coder.encode((CustomClass(123), 'abc'))))
Example #7
  def run_GroupByKey(self, transform_node):
     input_tag = transform_node.inputs[0].tag
     input_step = self._cache.get_pvalue(transform_node.inputs[0])
     step = self._add_step(TransformNames.GROUP, transform_node.full_label,
                           transform_node)
     step.add_property(
         PropertyNames.PARALLEL_INPUT, {
             '@type': 'OutputReference',
             PropertyNames.STEP_NAME: input_step.proto.name,
             PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)
         })
     step.encoding = self._get_typehint_based_encoding(
         self._get_transform_type_hint(transform_node))
     step.add_property(PropertyNames.OUTPUT_INFO, [{
         PropertyNames.USER_NAME:
         ('%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
         PropertyNames.ENCODING:
         step.encoding,
         PropertyNames.OUTPUT_NAME:
         PropertyNames.OUT
     }])
     windowing = transform_node.transform.get_windowing(
         transform_node.inputs)
     step.add_property(PropertyNames.SERIALIZED_FN,
                       pickler.dumps(windowing))
Example #8
    def test_create_do_avro_write(self):
        output_path = self.create_temp_file('n/a')
        elements = ['abc', 'def', 'ghi']
        work_item = workitem.BatchWorkItem(None)

        work_item.map_task = make_map_task([
            maptask.WorkerRead(
                inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=2,  # Start at the last element.
                    end_index=3),
                output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=None,
                               output_coders=[self.OUTPUT_CODER]),
            make_text_sink(output_path,
                           input=(1, 0),
                           coder=coders.Base64PickleCoder())
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
Example #9
 def test_create_do_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               # Start at the last element.
               start_index=2,
               # Go beyond the end to test that case is handled.
               end_index=15),
           output_coders=[coders.ToStringCoder()]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi\n', f.read())
Example #10
 def test_create_do_with_side_text_file_write(self):
   input_path = self.create_temp_file('x\ny\n')
   elements = ['aa', 'bb']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(fileio.TextFileSource(
                   file_path=input_path, start_offset=None, end_offset=None,
                   strip_trailing_newlines=True,
                   coder=coders.StrUtf8Coder()),
                                             tag='textfile')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a collection, therefore we should see
   # all elements of the side source.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
Example #11
  def run_CombineValues(self, transform_node):
    transform = transform_node.transform
    input_tag = transform_node.inputs[0].tag
    input_step = self._cache.get_pvalue(transform_node.inputs[0])
    step = self._add_step(
        TransformNames.COMBINE, transform_node.full_label, transform_node)
    # Combiner functions do not take deferred side inputs (i.e. PValues), so
    # the code to handle extra args/kwargs is simpler than for the DoFns of
    # the ParDo transform. The last, empty element is where side-input
    # information would go.
    fn_data = (transform.fn, transform.args, transform.kwargs, ())
    step.add_property(PropertyNames.SERIALIZED_FN,
                      pickler.dumps(fn_data))
    step.add_property(
        PropertyNames.PARALLEL_INPUT,
        {'@type': 'OutputReference',
         PropertyNames.STEP_NAME: input_step.proto.name,
         PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
    # Note that the accumulator must not have a WindowedValue encoding, while
    # the output of this step does in fact have a WindowedValue encoding.
    accumulator_encoding = self._get_encoded_output_coder(transform_node,
                                                          window_value=False)
    output_encoding = self._get_encoded_output_coder(transform_node)

    step.encoding = output_encoding
    step.add_property(PropertyNames.ENCODING, accumulator_encoding)
    # Generate description for main output 'out.'
    outputs = []
    # Add the main output to the description.
    outputs.append(
        {PropertyNames.USER_NAME: (
            '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
         PropertyNames.ENCODING: step.encoding,
         PropertyNames.OUTPUT_NAME: PropertyNames.OUT})
    step.add_property(PropertyNames.OUTPUT_INFO, outputs)
Example #12
  def test_lambda_with_globals(self):
    """Tests that the globals of a function are preserved."""

    # The point of the test is that the lambda being called after unpickling
    # relies on having the re module being loaded.
    self.assertEquals(
        ['abc', 'def'],
        loads(dumps(module_test.get_lambda_with_globals()))('abc def'))
Example #13
def pickle_with_side_inputs(fn, tag_and_type=None):
    tags_and_types = []
    args = []
    if tag_and_type is not None:
        args.append(util.ArgumentPlaceholder())
        tags_and_types.append(tag_and_type)
    return pickler.dumps(
        (fn, args, {}, tags_and_types, core.Windowing(window.GlobalWindows())))
Example #14
def pickle_with_side_inputs(fn, tag_and_type=None):
  tags_and_types = []
  args = []
  if tag_and_type is not None:
    args.append(util.ArgumentPlaceholder())
    tags_and_types.append(tag_and_type)
  return pickler.dumps((fn, args, {}, tags_and_types,
                        core.Windowing(window.GlobalWindows())))
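The helper packs five things into a single pickled tuple: the DoFn, positional args (with placeholders standing in for side inputs), kwargs, the side-input tag/type list, and a global windowing strategy. An illustrative round-trip of that payload (not the SDK's actual worker code path):

serialized_fn = pickle_with_side_inputs(
    ptransform.CallableWrapperDoFn(lambda x: [x]))
fn, args, kwargs, tags_and_types, windowing = pickler.loads(serialized_fn)
assert args == [] and kwargs == {} and tags_and_types == []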
Example #15
 def test_create_do_with_side_avro_file_write(self):
     input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
     input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
     elements = ['aa', 'bb']
     output_buffer = []
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(inmemory.InMemorySource(
                 elements=[pickler.dumps(e) for e in elements],
                 start_index=0,
                 end_index=2),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(
                 serialized_fn=pickle_with_side_inputs(
                     ptransform.CallableWrapperDoFn(
                         lambda x, side: ['%s:%s' % (x, s) for s in side]),
                     tag_and_type=('sometag',
                                   pvalue.IterablePCollectionView, ())),
                 output_tags=['out'],
                 input=(0, 0),
                 # Note that the two side inputs have the same tag. This is quite
                 # common for intermediary PCollections used as side inputs that
                 # are saved as AVRO files. The files will contain the sharded
                 # PCollection.
                 side_inputs=[
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path1,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag'),
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path2,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag')
                 ],
                 output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerInMemoryWrite(
                 output_buffer=output_buffer,
                 input=(1, 0),
                 output_coders=(self.OUTPUT_CODER, ))
         ]))
     # The side source was specified as a collection, therefore we should
     # see all elements of the side source.
     self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                      sorted(output_buffer))
Example #16
 def test_create_do_with_side_avro_file_write(self):
   input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
   input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
   elements = ['aa', 'bb']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               tag_and_type=(
                   'sometag', False)),  # False => type is collection.
           output_tags=['out'], input=(0, 0),
           # Note that the two side inputs have the same tag. This is quite
           # common for intermediary PCollections used as side inputs that
           # are saved as AVRO files. The files will contain the sharded
           # PCollection.
           side_inputs=[
               maptask.WorkerSideInputSource(
                   fileio.TextFileSource(
                       file_path=input_path1,
                       coder=coders.Base64PickleCoder()),
                   tag='sometag'),
               maptask.WorkerSideInputSource(
                   fileio.TextFileSource(file_path=input_path2,
                                         coder=coders.Base64PickleCoder()),
                   tag='sometag')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(
           output_buffer=output_buffer,
           input=(1, 0),
           output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a collection, therefore we should see
   # all elements of the side source.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
Example #17
 def test_pgbk(self):
   elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0,
                end_index=100),
            tag=None),
        maptask.WorkerPartialGroupByKey(input=(0, 0)),
        maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                    input=(1, 0))
   ]))
   self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))
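The expected output is simply an in-memory group-by-key over the input pairs; the equivalent computation in plain Python:

from collections import defaultdict

def group_by_key(pairs):
    grouped = defaultdict(list)
    for key, value in pairs:
        grouped[key].append(value)
    return sorted(grouped.items())

assert group_by_key([('a', 1), ('b', 2), ('a', 3), ('a', 4)]) == [
    ('a', [1, 3, 4]), ('b', [2])]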
Example #18
    def test_create_do_with_collection_side_bigquery_write(self):
        elements = ['aa', 'bb']
        side_elements = ['x', 'y']
        output_buffer = []
        patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
        with mock.patch(target=patch_target) as mock_class:
            # Setup the reader so it will yield the values in 'side_elements'.
            reader_mock = mock_class.return_value
            reader_mock.__enter__.return_value = reader_mock
            # Use a lambda so that multiple readers can be created, each reading the
            # entirety of the side elements.
            reader_mock.__iter__.side_effect = (
                lambda: (x for x in side_elements))

            executor.MapTaskExecutor().execute(
                make_map_task([
                    maptask.WorkerRead(inmemory.InMemorySource(
                        elements=[pickler.dumps(e) for e in elements],
                        start_index=0,
                        end_index=3),
                                       output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                        ptransform.CallableWrapperDoFn(
                            lambda x, side: ['%s:%s' % (x, s) for s in side]),
                        tag_and_type=('bigquery',
                                      pvalue.IterablePCollectionView, ())),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=[
                                   maptask.WorkerSideInputSource(
                                       bigquery.BigQuerySource(
                                           project='project',
                                           dataset='dataset',
                                           table='table',
                                           coder=get_bigquery_source_coder()),
                                       tag='bigquery')
                               ],
                               output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerInMemoryWrite(
                        output_buffer=output_buffer,
                        input=(1, 0),
                        output_coders=(self.OUTPUT_CODER, ))
                ]))
        # The side source was specified as a collection, therefore we should
        # see all elements of the side source.
        self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'],
                         sorted(output_buffer))
Example #19
    def test_in_memory_source_progress_reporting(self):
        elements = [101, 201, 301, 401, 501, 601, 701]
        output_buffer = []
        source = ProgressRequestRecordingInMemorySource(
            elements=[pickler.dumps(e) for e in elements])
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                        input=(0, 0),
                                        output_coders=(self.OUTPUT_CODER, ))
        ])
        executor.MapTaskExecutor(work_item.map_task).execute()
        self.assertEqual(elements, output_buffer)

        expected_progress_record = range(len(elements))
        self.assertEqual(expected_progress_record,
                         source.last_reader.progress_record)
Example #20
 def test_combine(self):
   elements = [('a', [1, 2, 3]), ('b', [10])]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=100),
           tag=None),
        maptask.WorkerCombineFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CombineFn.from_callable(sum)),
            phase='all',
            input=(0, 0)),
        maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                    input=(1, 0))
   ]))
   self.assertEqual([('a', 6), ('b', 10)], output_buffer)
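With phase='all' and sum as the combiner, each key's value list collapses to its sum, which reproduces the expected result directly:

elements = [('a', [1, 2, 3]), ('b', [10])]
assert [(key, sum(values)) for key, values in elements] == [('a', 6), ('b', 10)]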
Example #21
  def test_create_do_with_collection_side_bigquery_write(self):
    elements = ['aa', 'bb']
    side_elements = ['x', 'y']
    output_buffer = []
    patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
    with mock.patch(target=patch_target) as mock_class:
      # Setup the reader so it will yield the values in 'side_elements'.
      reader_mock = mock_class.return_value
      reader_mock.__enter__.return_value = reader_mock
      # Use a lambda so that multiple readers can be created, each reading the
      # entirety of the side elements.
      reader_mock.__iter__.side_effect = lambda: (x for x in side_elements)

      executor.MapTaskExecutor().execute(make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(
                  elements=[pickler.dumps(e) for e in elements],
                  start_index=0,
                  end_index=3),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, s) for s in side]),
                  tag_and_type=('bigquery', pvalue.IterablePCollectionView,
                                ())),
              output_tags=['out'], input=(0, 0),
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      bigquery.BigQuerySource(
                          project='project',
                          dataset='dataset',
                          table='table',
                          coder=get_bigquery_source_coder()),
                      tag='bigquery')],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(
              output_buffer=output_buffer,
              input=(1, 0),
              output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a collection, therefore we should see
    # all elements of the side source.
    self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'],
                     sorted(output_buffer))
Example #22
 def test_pgbk(self):
   elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0,
                end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerPartialGroupByKey(
           combine_fn=None,
           input=(0, 0),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))
Example #23
 def test_create_do_avro_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=2,  # Start at the last element.
               end_index=3),
           tag=None),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
           output_tags=['out'], input=(0, 0), side_inputs=None),
       maptask.WorkerWrite(fileio.TextFileSink(
           file_path_prefix=output_path,
           append_trailing_newlines=True,
           coder=coders.Base64PickleCoder()), input=(1, 0))]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
Example #24
 def test_combine(self):
   elements = [('a', [1, 2, 3]), ('b', [10])]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerCombineFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CombineFn.from_callable(sum)),
                               phase='all',
                               input=(0, 0),
                               output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', 6), ('b', 10)], output_buffer)
Example #25
  def test_in_memory_source_progress_reporting(self):
    elements = [101, 201, 301, 401, 501, 601, 701]
    output_buffer = []
    source = ProgressRequestRecordingInMemorySource(
        elements=[pickler.dumps(e) for e in elements])
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                    input=(0, 0),
                                    output_coders=(self.OUTPUT_CODER,))
    ]))
    self.assertEqual(elements, output_buffer)

    expected_progress_record = []
    len_elements = len(elements)
    for i in range(len_elements):
      expected_progress_record.append(float(i + 1) / len_elements)

    self.assertEqual(expected_progress_record,
                     source.last_reader.progress_record)
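The loop that builds expected_progress_record is equivalent to a one-line comprehension over completed fractions:

n = len(elements)
expected_progress_record = [float(i + 1) / n for i in range(n)]
# For the 7 elements above: [1/7, 2/7, ..., 7/7]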
Example #26
 def run_GroupByKey(self, transform_node):
   input_tag = transform_node.inputs[0].tag
   input_step = self._cache.get_pvalue(transform_node.inputs[0])
   step = self._add_step(
       TransformNames.GROUP, transform_node.full_label, transform_node)
   step.add_property(
       PropertyNames.PARALLEL_INPUT,
       {'@type': 'OutputReference',
        PropertyNames.STEP_NAME: input_step.proto.name,
        PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
   step.encoding = self._get_encoded_output_coder(transform_node)
   step.add_property(
       PropertyNames.OUTPUT_INFO,
       [{PropertyNames.USER_NAME: (
           '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
         PropertyNames.ENCODING: step.encoding,
         PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])
   windowing = transform_node.transform.get_windowing(
       transform_node.inputs)
   step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(windowing))
Example #27
  def test_create_do_with_singleton_side_bigquery_write(self):
    elements = ['abc', 'def', 'ghi']
    side_elements = ['x', 'y', 'z']
    output_buffer = []
    patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
    with mock.patch(target=patch_target) as mock_class:
      # Setup the reader so it will yield the values in 'side_elements'.
      reader_mock = mock_class.return_value
      reader_mock.__enter__.return_value = reader_mock
      reader_mock.__iter__.return_value = (x for x in side_elements)

      pickled_elements = [pickler.dumps(e) for e in elements]
      executor.MapTaskExecutor().execute(make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(elements=pickled_elements,
                                      start_index=0,
                                      end_index=3),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, side)]),
                  tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                                (False, None))),
              output_tags=['out'], input=(0, 0),
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      bigquery.BigQuerySource(
                          project='project',
                          dataset='dataset',
                          table='table',
                          coder=get_bigquery_source_coder()),
                      tag='bigquery')],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(
              output_buffer=output_buffer,
              input=(1, 0),
              output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a singleton, therefore we should see
    # only the first element appended.
    self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
Example #28
def splits_to_split_response(bundles):
    """Generates a response to a custom source split request.

  Args:
    bundles: a set of bundles generated by a BoundedSource.split() invocation.
  Returns:
   a SourceOperationResponse object.
  """
    derived_sources = []
    for bundle in bundles:
        derived_source = dataflow.DerivedSource()
        derived_source.derivationMode = (
            dataflow.DerivedSource.DerivationModeValueValuesEnum.
            SOURCE_DERIVATION_MODE_INDEPENDENT)
        derived_source.source = dataflow.Source()
        derived_source.source.doesNotNeedSplitting = True

        derived_source.source.spec = dataflow.Source.SpecValue()
        derived_source.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key=names.SERIALIZED_SOURCE_KEY,
                value=to_json_value(pickler.dumps(
                    (bundle.source, bundle.start_position,
                     bundle.stop_position)),
                                    with_type=True)))
        derived_source.source.spec.additionalProperties.append(
            dataflow.Source.SpecValue.AdditionalProperty(
                key='@type', value=to_json_value(names.SOURCE_TYPE)))
        derived_sources.append(derived_source)

    split_response = dataflow.SourceSplitResponse()
    split_response.bundles = derived_sources
    split_response.outcome = (
        dataflow.SourceSplitResponse.OutcomeValueValuesEnum.
        SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)

    response = dataflow.SourceOperationResponse()
    response.split = split_response
    return response
Example #29
def splits_to_split_response(bundles):
  """Generates a response to a custom source split request.

  Args:
    bundles: a set of bundles generated by a BoundedSource.split() invocation.
  Returns:
   a SourceOperationResponse object.
  """
  derived_sources = []
  for bundle in bundles:
    derived_source = dataflow.DerivedSource()
    derived_source.derivationMode = (
        dataflow.DerivedSource.DerivationModeValueValuesEnum
        .SOURCE_DERIVATION_MODE_INDEPENDENT)
    derived_source.source = dataflow.Source()
    derived_source.source.doesNotNeedSplitting = True

    derived_source.source.spec = dataflow.Source.SpecValue()
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=names.SERIALIZED_SOURCE_KEY,
            value=to_json_value(pickler.dumps(
                (bundle.source, bundle.start_position, bundle.stop_position)),
                                with_type=True)))
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(key='@type',
                                                     value=to_json_value(
                                                         names.SOURCE_TYPE)))
    derived_sources.append(derived_source)

  split_response = dataflow.SourceSplitResponse()
  split_response.bundles = derived_sources
  split_response.outcome = (
      dataflow.SourceSplitResponse.OutcomeValueValuesEnum
      .SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)

  response = dataflow.SourceOperationResponse()
  response.split = split_response
  return response
Example #30
    def run_CombineValues(self, transform_node):
        transform = transform_node.transform
        input_tag = transform_node.inputs[0].tag
        input_step = self._cache.get_pvalue(transform_node.inputs[0])
        step = self._add_step(TransformNames.COMBINE,
                              transform_node.full_label, transform_node)
        # Combiner functions do not take deferred side inputs (i.e. PValues),
        # so the code to handle extra args/kwargs is simpler than for the
        # DoFns of the ParDo transform. The last, empty element is where
        # side-input information would go.
        fn_data = (transform.fn, transform.args, transform.kwargs, ())
        step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(fn_data))
        step.add_property(
            PropertyNames.PARALLEL_INPUT, {
                '@type': 'OutputReference',
                PropertyNames.STEP_NAME: input_step.proto.name,
                PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)
            })
        # Note that the accumulator must not have a WindowedValue encoding, while
        # the output of this step does in fact have a WindowedValue encoding.
        accumulator_encoding = self._get_typehint_based_encoding(
            self._get_transform_type_hint(transform_node), window_value=False)
        output_encoding = self._get_typehint_based_encoding(
            self._get_transform_type_hint(transform_node))

        step.encoding = output_encoding
        step.add_property(PropertyNames.ENCODING, accumulator_encoding)
        # Generate description for main output 'out.'
        outputs = []
        # Add the main output to the description.
        outputs.append({
            PropertyNames.USER_NAME:
            ('%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
            PropertyNames.ENCODING:
            step.encoding,
            PropertyNames.OUTPUT_NAME:
            PropertyNames.OUT
        })
        step.add_property(PropertyNames.OUTPUT_INFO, outputs)
Example #31
  def build_split_proto(self, bounded_source, desired_bundle_size):
    split_proto = dataflow.SourceSplitRequest()
    split_proto.options = dataflow.SourceSplitOptions()
    split_proto.options.desiredBundleSizeBytes = desired_bundle_size

    source = dataflow.Source()
    spec = dataflow.Source.SpecValue()

    if bounded_source:
      spec.additionalProperties.append(
          dataflow.Source.SpecValue.AdditionalProperty(
              key=names.SERIALIZED_SOURCE_KEY,
              value=to_json_value({'value': pickler.dumps(bounded_source),
                                   '@type': 'http://schema.org/Text'})))
    spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key='@type',
            value=to_json_value('CustomSourcesType')))
    source.spec = spec
    split_proto.source = source

    return split_proto
Example #32
 def test_dynamic_class(self):
   """Tests that a nested class object is pickled correctly."""
   self.assertEquals(
       'Z:abc',
       loads(dumps(module_test.create_class('abc'))).get())
Example #33
 def test_generators(self):
   with self.assertRaises(TypeError):
     dumps((_ for _ in range(10)))
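The same restriction holds for the standard pickle module, so the test is asserting that the SDK pickler does not silently paper over it:

import pickle

try:
    pickle.dumps(_ for _ in range(10))
except TypeError as e:
    print('generators are not serializable:', e)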
Example #34
 def test_object(self):
   """Tests that a class instance is pickled correctly."""
   self.assertEquals(
       ['abc', 'def'],
       loads(dumps(module_test.XYZ_OBJECT)).foo('abc def'))
Example #35
 def test_get_coder_can_be_pickled(self):
     coder = typecoders.registry.get_coder(typehints.Tuple[str, int])
     revived_coder = pickler.loads(pickler.dumps(coder))
     self.assertEqual(
         ('abc', 123),
         revived_coder.decode(revived_coder.encode(('abc', 123))))
Example #36
 def test_lambda_with_closure(self):
   """Tests that the closure of a function is preserved."""
   self.assertEquals(
       'closure: abc',
       loads(dumps(module_test.get_lambda_with_closure('abc')))())
Example #37
 def test_class(self):
   """Tests that a class object is pickled correctly."""
   self.assertEquals(
       ['abc', 'def'],
       loads(dumps(module_test.Xyz))().foo('abc def'))
Example #38
 def test_basics(self):
   self.assertEquals([1, 'a', (u'z',)], loads(dumps([1, 'a', (u'z',)])))
   fun = lambda x: 'xyz-%s' % x
   self.assertEquals('xyz-abc', loads(dumps(fun))('abc'))
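The lambda round-trip is exactly what the standard pickle module cannot do, which is the motivation for a custom pickler; a quick comparison (plain pickle typically raises PicklingError, or AttributeError for locally defined functions):

import pickle

try:
    pickle.dumps(lambda x: 'xyz-%s' % x)
except (pickle.PicklingError, AttributeError) as e:
    print('plain pickle cannot handle lambdas:', e)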
Example #39
 def test_get_coder_can_be_pickled(self):
   coder = typecoders.registry.get_coder(typehints.Tuple[str, int])
   revived_coder = pickler.loads(pickler.dumps(coder))
   self.assertEqual(('abc', 123),
                    revived_coder.decode(revived_coder.encode(('abc', 123))))
Example #40
  def run_Read(self, transform_node):
    transform = transform_node.transform
    step = self._add_step(
        TransformNames.READ, transform_node.full_label, transform_node)
    # TODO(mairbek): refactor if-else tree to use registerable functions.
    # Initialize the source specific properties.

    if not hasattr(transform.source, 'format'):
      # If a format is not set, we assume the source to be a custom source.
      source_dict = dict()
      spec_dict = dict()

      spec_dict[names.SERIALIZED_SOURCE_KEY] = pickler.dumps(transform.source)
      spec_dict['@type'] = names.SOURCE_TYPE
      source_dict['spec'] = spec_dict
      step.add_property(PropertyNames.SOURCE_STEP_INPUT,
                        source_dict)
    elif transform.source.format == 'text':
      step.add_property(PropertyNames.FILE_PATTERN, transform.source.path)
    elif transform.source.format == 'bigquery':
      # TODO(silviuc): Add table validation if transform.source.validate.
      if transform.source.table_reference is not None:
        step.add_property(PropertyNames.BIGQUERY_DATASET,
                          transform.source.table_reference.datasetId)
        step.add_property(PropertyNames.BIGQUERY_TABLE,
                          transform.source.table_reference.tableId)
        # If project owning the table was not specified then the project owning
        # the workflow (current project) will be used.
        if transform.source.table_reference.projectId is not None:
          step.add_property(PropertyNames.BIGQUERY_PROJECT,
                            transform.source.table_reference.projectId)
      elif transform.source.query is not None:
        step.add_property(PropertyNames.BIGQUERY_QUERY, transform.source.query)
      else:
        raise ValueError('BigQuery source %r must specify either a table or'
                         ' a query' % transform.source)
    elif transform.source.format == 'pubsub':
      standard_options = (
          transform_node.inputs[0].pipeline.options.view_as(StandardOptions))
      if not standard_options.streaming:
        raise ValueError('PubSubSource is currently available for use only in '
                         'streaming pipelines.')
      step.add_property(PropertyNames.PUBSUB_TOPIC, transform.source.topic)
      if transform.source.subscription:
        step.add_property(PropertyNames.PUBSUB_SUBSCRIPTION,
                          transform.source.subscription)
      if transform.source.id_label:
        step.add_property(PropertyNames.PUBSUB_ID_LABEL,
                          transform.source.id_label)
    else:
      raise ValueError(
          'Source %r has unexpected format %s.' % (
              transform.source, transform.source.format))

    if not hasattr(transform.source, 'format'):
      step.add_property(PropertyNames.FORMAT, names.SOURCE_FORMAT)
    else:
      step.add_property(PropertyNames.FORMAT, transform.source.format)

    if isinstance(transform.source, iobase.BoundedSource):
      coder = transform.source.default_output_coder()
    else:
      coder = transform.source.coder

    step.encoding = self._get_cloud_encoding(coder)
    step.add_property(
        PropertyNames.OUTPUT_INFO,
        [{PropertyNames.USER_NAME: (
            '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
          PropertyNames.ENCODING: step.encoding,
          PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])
Example #41
  def run_ParDo(self, transform_node):
    transform = transform_node.transform
    input_tag = transform_node.inputs[0].tag
    input_step = self._cache.get_pvalue(transform_node.inputs[0])

    # Attach side inputs.
    si_dict = {}
    si_tags_and_types = []
    for side_pval in transform_node.side_inputs:
      assert isinstance(side_pval, PCollectionView)
      side_input_step = self._cache.get_pvalue(side_pval)
      si_label = side_input_step.step_name
      si_dict[si_label] = {
          '@type': 'OutputReference',
          PropertyNames.STEP_NAME: si_label,
          PropertyNames.OUTPUT_NAME: PropertyNames.OUT}
      # The label for the side input step will appear as a 'tag' property for
      # the side input source specification. Its type (singleton or iterator)
      # will also be used to read the entire source or just first element.
      si_tags_and_types.append((si_label, side_pval.__class__,
                                side_pval._view_options()))  # pylint: disable=protected-access

    # Now create the step for the ParDo transform being handled.
    step = self._add_step(
        TransformNames.DO, transform_node.full_label, transform_node,
        transform_node.transform.side_output_tags)
    fn_data = (transform.fn, transform.args, transform.kwargs,
               si_tags_and_types, transform_node.inputs[0].windowing)
    step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(fn_data))
    step.add_property(
        PropertyNames.PARALLEL_INPUT,
        {'@type': 'OutputReference',
         PropertyNames.STEP_NAME: input_step.proto.name,
         PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
    # Add side inputs if any.
    step.add_property(PropertyNames.NON_PARALLEL_INPUTS, si_dict)

    # Generate description for main output and side outputs. The output names
    # will be 'out' for main output and 'out_<tag>' for a tagged output.
    # Using 'out' as a tag will not clash with the name for main since it will
    # be transformed into 'out_out' internally.
    outputs = []
    step.encoding = self._get_encoded_output_coder(transform_node)

    # Add the main output to the description.
    outputs.append(
        {PropertyNames.USER_NAME: (
            '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
         PropertyNames.ENCODING: step.encoding,
         PropertyNames.OUTPUT_NAME: PropertyNames.OUT})
    for side_tag in transform.side_output_tags:
      # The assumption here is that side outputs will have the same typehint
      # and coder as the main output. This is certainly the case right now
      # but conceivably it could change in the future.
      outputs.append(
          {PropertyNames.USER_NAME: (
              '%s.%s' % (transform_node.full_label, side_tag)),
           PropertyNames.ENCODING: step.encoding,
           PropertyNames.OUTPUT_NAME: (
               '%s_%s' % (PropertyNames.OUT, side_tag))})
    step.add_property(PropertyNames.OUTPUT_INFO, outputs)
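The SERIALIZED_FN payload built here has the same five-element shape the worker tests construct with pickle_with_side_inputs. An illustrative unpacking of fn_data after a round-trip (not part of the runner code):

payload = pickler.dumps(fn_data)
fn, args, kwargs, si_tags_and_types, windowing = pickler.loads(payload)
# si_tags_and_types pairs each side-input step label with its view class and
# view options, mirroring tag_and_type in the worker tests earlier on this page.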
Example #42
    def run_ParDo(self, transform_node):
        transform = transform_node.transform
        input_tag = transform_node.inputs[0].tag
        input_step = self._cache.get_pvalue(transform_node.inputs[0])

        # Attach side inputs.
        si_dict = {}
        si_tags_and_types = []
        for side_pval in transform_node.side_inputs:
            assert isinstance(side_pval, PCollectionView)
            side_input_step = self._cache.get_pvalue(side_pval)
            si_label = side_input_step.step_name
            si_dict[si_label] = {
                '@type': 'OutputReference',
                PropertyNames.STEP_NAME: si_label,
                PropertyNames.OUTPUT_NAME: PropertyNames.OUT
            }
            # The label for the side input step will appear as a 'tag' property for
            # the side input source specification. Its type (singleton or iterator)
            # will also be used to read the entire source or just first element.
            si_tags_and_types.append(
                (si_label, side_pval.__class__, side_pval._view_options()))  # pylint: disable=protected-access

        # Now create the step for the ParDo transform being handled.
        step = self._add_step(TransformNames.DO, transform_node.full_label,
                              transform_node,
                              transform_node.transform.side_output_tags)
        fn_data = (transform.fn, transform.args, transform.kwargs,
                   si_tags_and_types, transform_node.inputs[0].windowing)
        step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(fn_data))
        step.add_property(
            PropertyNames.PARALLEL_INPUT, {
                '@type': 'OutputReference',
                PropertyNames.STEP_NAME: input_step.proto.name,
                PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)
            })
        # Add side inputs if any.
        step.add_property(PropertyNames.NON_PARALLEL_INPUTS, si_dict)

        # Generate description for main output and side outputs. The output names
        # will be 'out' for main output and 'out_<tag>' for a tagged output.
        # Using 'out' as a tag will not clash with the name for main since it will
        # be transformed into 'out_out' internally.
        outputs = []
        step.encoding = self._get_encoded_output_coder(transform_node)

        # Add the main output to the description.
        outputs.append({
            PropertyNames.USER_NAME:
            ('%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
            PropertyNames.ENCODING:
            step.encoding,
            PropertyNames.OUTPUT_NAME:
            PropertyNames.OUT
        })
        for side_tag in transform.side_output_tags:
            # The assumption here is that side outputs will have the same typehint
            # and coder as the main output. This is certainly the case right now
            # but conceivably it could change in the future.
            outputs.append({
                PropertyNames.USER_NAME:
                ('%s.%s' % (transform_node.full_label, side_tag)),
                PropertyNames.ENCODING:
                step.encoding,
                PropertyNames.OUTPUT_NAME:
                ('%s_%s' % (PropertyNames.OUT, side_tag))
            })
        step.add_property(PropertyNames.OUTPUT_INFO, outputs)
Example #43
def serialize_coder(coder):
  from google.cloud.dataflow.internal import pickler
  return '%s$%s' % (coder.__class__.__name__, pickler.dumps(coder))
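A hypothetical inverse, assuming the 'ClassName$payload' format above; the actual SDK may implement this elsewhere, and the helper name here is ours:

def deserialize_coder(serialized):
  from google.cloud.dataflow.internal import pickler
  # The class-name prefix is informational; only the pickled payload matters.
  _, payload = serialized.split('$', 1)
  return pickler.loads(payload)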
Example #44
    def run_Read(self, transform_node):
        transform = transform_node.transform
        step = self._add_step(TransformNames.READ, transform_node.full_label,
                              transform_node)
        # TODO(mairbek): refactor if-else tree to use registerable functions.
        # Initialize the source specific properties.

        if not hasattr(transform.source, 'format'):
            # If a format is not set, we assume the source to be a custom source.
            source_dict = dict()
            spec_dict = dict()

            spec_dict[names.SERIALIZED_SOURCE_KEY] = pickler.dumps(
                transform.source)
            spec_dict['@type'] = names.SOURCE_TYPE
            source_dict['spec'] = spec_dict
            step.add_property(PropertyNames.SOURCE_STEP_INPUT, source_dict)
        elif transform.source.format == 'text':
            step.add_property(PropertyNames.FILE_PATTERN,
                              transform.source.path)
        elif transform.source.format == 'bigquery':
            # TODO(silviuc): Add table validation if transform.source.validate.
            if transform.source.table_reference is not None:
                step.add_property(PropertyNames.BIGQUERY_DATASET,
                                  transform.source.table_reference.datasetId)
                step.add_property(PropertyNames.BIGQUERY_TABLE,
                                  transform.source.table_reference.tableId)
                # If project owning the table was not specified then the project owning
                # the workflow (current project) will be used.
                if transform.source.table_reference.projectId is not None:
                    step.add_property(
                        PropertyNames.BIGQUERY_PROJECT,
                        transform.source.table_reference.projectId)
            elif transform.source.query is not None:
                step.add_property(PropertyNames.BIGQUERY_QUERY,
                                  transform.source.query)
            else:
                raise ValueError(
                    'BigQuery source %r must specify either a table or'
                    ' a query' % transform.source)
        elif transform.source.format == 'pubsub':
            standard_options = (transform_node.inputs[0].pipeline.options.
                                view_as(StandardOptions))
            if not standard_options.streaming:
                raise ValueError(
                    'PubSubSource is currently available for use only in '
                    'streaming pipelines.')
            step.add_property(PropertyNames.PUBSUB_TOPIC,
                              transform.source.topic)
            if transform.source.subscription:
                step.add_property(PropertyNames.PUBSUB_SUBSCRIPTION,
                                  transform.source.subscription)
            if transform.source.id_label:
                step.add_property(PropertyNames.PUBSUB_ID_LABEL,
                                  transform.source.id_label)
        else:
            raise ValueError('Source %r has unexpected format %s.' %
                             (transform.source, transform.source.format))

        if not hasattr(transform.source, 'format'):
            step.add_property(PropertyNames.FORMAT, names.SOURCE_FORMAT)
        else:
            step.add_property(PropertyNames.FORMAT, transform.source.format)

        if isinstance(transform.source, iobase.BoundedSource):
            coder = transform.source.default_output_coder()
        else:
            coder = transform.source.coder

        step.encoding = self._get_cloud_encoding(coder)
        step.add_property(PropertyNames.OUTPUT_INFO, [{
            PropertyNames.USER_NAME:
            ('%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
            PropertyNames.ENCODING:
            step.encoding,
            PropertyNames.OUTPUT_NAME:
            PropertyNames.OUT
        }])