def __init__(self, fn_or_label, *args, **kwargs):
    if fn_or_label is None or isinstance(fn_or_label, basestring):
      label = fn_or_label
      fn, args = args[0], args[1:]
    else:
      label = None
      fn = fn_or_label
    if isinstance(fn, type) and issubclass(fn, typehints.WithTypeHints):
      # Don't treat Fn class objects as callables.
      raise ValueError('Use %s() not %s.' % (fn.__name__, fn.__name__))
    self.fn = self.make_fn(fn)
    # Now that we have figured out the label, initialize the super-class.
    super(PTransformWithSideInputs, self).__init__(label=label)

    if (any([isinstance(v, pvalue.PCollection) for v in args]) or
        any([isinstance(v, pvalue.PCollection) for v in kwargs.itervalues()])):
      raise error.SideInputError(
          'PCollection used directly as side input argument. Specify '
          'AsIter(pcollection) or AsSingleton(pcollection) to indicate how the '
          'PCollection is to be used.')
    self.args, self.kwargs, self.side_inputs = util.remove_objects_from_args(
        args, kwargs, pvalue.PCollectionView)
    self.raw_side_inputs = args, kwargs

    # Prevent name collisions with fns of the form '<function <lambda> at ...>'
    self._cached_fn = self.fn

    # Ensure fn and side inputs are picklable for remote execution.
    self.fn = pickler.loads(pickler.dumps(self.fn))
    self.args = pickler.loads(pickler.dumps(self.args))
    self.kwargs = pickler.loads(pickler.dumps(self.kwargs))

    # For type hints, because loads(dumps(class)) != class.
    self.fn = self._cached_fn
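For context, a minimal sketch of the side-input usage that the SideInputError above asks for. The pipeline and transform names are hypothetical; only the pvalue.AsIter/AsSingleton wrappers come from the snippet, and the label-first argument style mirrors the fn_or_label convention shown above.

side = pipeline | SomeCreateTransform('MakeSide', [1, 2, 3])
main = pipeline | SomeCreateTransform('MakeMain', ['a', 'b'])

# Passing `side` directly as an extra argument would raise the SideInputError
# above.  Wrapping it states how the PCollection is materialized per element:
main | SomeFlatMapTransform('Cross',
                            lambda x, nums: ['%s:%d' % (x, n) for n in nums],
                            pvalue.AsIter(side))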
 def test_nested_class(self):
   """Tests that a nested class object is pickled correctly."""
   self.assertEquals(
       'X:abc',
       loads(dumps(module_test.TopClass.NestedClass('abc'))).datum)
   self.assertEquals(
       'Y:abc',
       loads(dumps(module_test.TopClass.MiddleClass.NestedClass('abc'))).datum)
 def __init__(self, spec):
   super(BatchGroupAlsoByWindowsOperation, self).__init__(spec)
   self.windowing = pickler.loads(self.spec.window_fn)
   if self.spec.combine_fn:
     # Combiners do not accept deferred side-inputs (the ignored fourth
     # argument) and therefore the code to handle the extra args/kwargs is
     # simpler than for the DoFn's of ParDo.
     fn, args, kwargs = pickler.loads(self.spec.combine_fn)[:3]
     self.phased_combine_fn = (
         PhasedCombineFnExecutor(self.spec.phase, fn, args, kwargs))
   else:
     self.phased_combine_fn = None
 def __init__(self, spec, counter_factory):
     super(BatchGroupAlsoByWindowsOperation,
           self).__init__(spec, counter_factory)
     self.windowing = pickler.loads(self.spec.window_fn)
     if self.spec.combine_fn:
         # Combiners do not accept deferred side-inputs (the ignored fourth
         # argument) and therefore the code to handle the extra args/kwargs is
         # simpler than for the DoFn's of ParDo.
         fn, args, kwargs = pickler.loads(self.spec.combine_fn)[:3]
         self.phased_combine_fn = (PhasedCombineFnExecutor(
             self.spec.phase, fn, args, kwargs))
     else:
         self.phased_combine_fn = None
  def start(self):
    super(DoOperation, self).start()

    # See fn_data in dataflow_runner.py
    fn, args, kwargs, tags_and_types, window_fn = (
        pickler.loads(self.spec.serialized_fn))

    self.state.step_name = self.step_name

    # TODO(silviuc): What is the proper label here? PCollection being processed?
    self.context = ptransform.DoFnProcessContext('label', state=self.state)
    # Tag to output index map used to dispatch the side output values emitted
    # by the DoFn function to the appropriate receivers. The main output is
    # tagged with None and is associated with its corresponding index.
    tagged_receivers = {}
    output_tag_prefix = PropertyNames.OUT + '_'
    for index, tag in enumerate(self.spec.output_tags):
      if tag == PropertyNames.OUT:
        original_tag = None
      elif tag.startswith(output_tag_prefix):
        original_tag = tag[len(output_tag_prefix):]
      else:
        raise ValueError('Unexpected output name for operation: %s' % tag)
      tagged_receivers[original_tag] = self.receivers[index]
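    # Illustrative result of the loop above (assuming PropertyNames.OUT is
    # 'out'): for output_tags == ['out', 'out_errors'] it produces
    #   tagged_receivers[None]     -> self.receivers[0]   (main output)
    #   tagged_receivers['errors'] -> self.receivers[1]   (side output)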

    self.dofn_runner = common.DoFnRunner(
        fn, args, kwargs, self._read_side_inputs(tags_and_types),
        window_fn, self.context, tagged_receivers,
        logger, self.step_name)

    self.dofn_runner.start()
 def test_get_coder_with_composite_custom_coder(self):
   typecoders.registry.register_coder(CustomClass, CustomCoder)
   coder = typecoders.registry.get_coder(typehints.KV[CustomClass, str])
   revived_coder = pickler.loads(pickler.dumps(coder))
   self.assertEqual(
       (CustomClass(123), 'abc'),
       revived_coder.decode(revived_coder.encode((CustomClass(123), 'abc'))))
    def test_create_do_avro_write(self):
        output_path = self.create_temp_file('n/a')
        elements = ['abc', 'def', 'ghi']
        work_item = workitem.BatchWorkItem(None)

        work_item.map_task = make_map_task([
            maptask.WorkerRead(
                inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=2,  # Start at the last element.
                    end_index=3),
                output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=None,
                               output_coders=[self.OUTPUT_CODER]),
            make_text_sink(output_path,
                           input=(1, 0),
                           coder=coders.Base64PickleCoder())
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
  def test_lambda_with_globals(self):
    """Tests that the globals of a function are preserved."""

    # The point of the test is that the lambda being called after unpickling
    # relies on having the re module being loaded.
    self.assertEquals(
        ['abc', 'def'],
        loads(dumps(module_test.get_lambda_with_globals()))('abc def'))
 def __init__(self, spec, counter_factory):
   super(CombineOperation, self).__init__(spec, counter_factory)
   # Combiners do not accept deferred side-inputs (the ignored fourth argument)
   # and therefore the code to handle the extra args/kwargs is simpler than for
   # the DoFn's of ParDo.
   fn, args, kwargs = pickler.loads(self.spec.serialized_fn)[:3]
   self.phased_combine_fn = (
       PhasedCombineFnExecutor(self.spec.phase, fn, args, kwargs))
def get_custom_source_read_spec(source_spec):
  source_info = pickler.loads(source_spec['serialized_source']['value'])
  assert isinstance(source_info, tuple)
  assert len(source_info) == 3
  assert isinstance(source_info[0], iobase.BoundedSource)
  return WorkerRead(
      workercustomsources.NativeBoundedSource(source_info[0],
                                              source_info[1],
                                              source_info[2]),
      [source_info[0].default_output_coder()])
 def __init__(self, spec, counter_factory):
   super(PGBKCVOperation, self).__init__(spec, counter_factory)
   # Combiners do not accept deferred side-inputs (the ignored fourth
   # argument) and therefore the code to handle the extra args/kwargs is
   # simpler than for the DoFn's of ParDo.
   fn, args, kwargs = pickler.loads(self.spec.combine_fn)[:3]
   self.combine_fn = curry_combine_fn(fn, args, kwargs)
   # Optimization for the (known tiny accumulator, often wide keyspace)
   # count function.
   # TODO(robertwb): Bound by in-memory size rather than key count.
   self.max_keys = (
       1000000 if isinstance(fn, combiners.CountCombineFn) else 10000)
   self.key_count = 0
   self.table = {}
  def __init__(self, spec):
    super(CombineOperation, self).__init__(spec)
    # Combiners do not accept deferred side-inputs (the ignored fourth argument)
    # and therefore the code to handle the extra args/kwargs is simpler than for
    # the DoFn's of ParDo.
    fn, args, kwargs = pickler.loads(self.spec.serialized_fn)[:3]

    if not args and not kwargs:
      self.combine_fn = fn
    else:

      class CurriedFn(ptransform.CombineFn):

        def create_accumulator(self):
          return fn.create_accumulator(*args, **kwargs)

        def add_input(self, accumulator, element):
          return fn.add_input(accumulator, element, *args, **kwargs)

        def add_inputs(self, accumulator, elements):
          return fn.add_inputs(accumulator, elements, *args, **kwargs)

        def merge_accumulators(self, accumulators):
          return fn.merge_accumulators(accumulators, *args, **kwargs)

        def extract_output(self, accumulator):
          return fn.extract_output(accumulator, *args, **kwargs)

        def apply(self, elements):
          return fn.apply(elements, *args, **kwargs)

      self.combine_fn = CurriedFn()

    if self.spec.phase == 'all':
      self.apply = self.full_combine
    elif self.spec.phase == 'add':
      self.apply = self.add_only
    elif self.spec.phase == 'merge':
      self.apply = self.merge_only
    elif self.spec.phase == 'extract':
      self.apply = self.extract_only
    else:
      raise ValueError('Unexpected phase: %s' % self.spec.phase)
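A toy illustration of what the currying above buys (hypothetical combine fn, not taken from the SDK): once the extra arguments are bound, the phased code can drive the fn through the usual no-extra-argument interface.

class ScaledSumFn(ptransform.CombineFn):
  # Every phase takes an extra 'factor' argument, normally supplied through
  # the serialized args/kwargs.
  def create_accumulator(self, factor):
    return 0

  def add_input(self, accumulator, element, factor):
    return accumulator + element * factor

  def merge_accumulators(self, accumulators, factor):
    return sum(accumulators)

  def extract_output(self, accumulator, factor):
    return accumulator

# With fn = ScaledSumFn() and args = (10,), the CurriedFn wrapper above allows:
#   acc = combine_fn.create_accumulator()
#   acc = combine_fn.add_input(acc, 3)   # 0 + 3 * 10
#   combine_fn.extract_output(acc)       # -> 30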
 def test_create_do_avro_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=2,  # Start at the last element.
               end_index=3),
           tag=None),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
           output_tags=['out'], input=(0, 0), side_inputs=None),
       maptask.WorkerWrite(fileio.TextFileSink(
           file_path_prefix=output_path,
           append_trailing_newlines=True,
           coder=coders.Base64PickleCoder()), input=(1, 0))]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
  def __init__(self, source_operation_split_proto):
    source_spec = {p.key: from_json_value(p.value) for p in
                   source_operation_split_proto.source.spec
                   .additionalProperties}
    if names.SERIALIZED_SOURCE_KEY not in source_spec:
      raise ValueError(
          'Source split spec must contain a serialized source. Received: %r' %
          source_operation_split_proto)
    self.source = pickler.loads(
        source_spec[names.SERIALIZED_SOURCE_KEY]['value'])

    assert self.source is not None
    assert isinstance(self.source, iobase.BoundedSource)

    desired_bundle_size_bytes = (
        source_operation_split_proto.options.desiredBundleSizeBytes)
    if not desired_bundle_size_bytes:
      self.desired_bundle_size_bytes = DEFAULT_DESIRED_BUNDLE_SIZE
    else:
      self.desired_bundle_size_bytes = long(desired_bundle_size_bytes)
  def start(self):
    super(DoOperation, self).start()

    # See fn_data in dataflow_runner.py
    fn, args, kwargs, tags_and_types, window_fn = (
        pickler.loads(self.spec.serialized_fn))

    self.state.step_name = self.step_name

    # TODO(silviuc): What is the proper label here? PCollection being processed?
    self.context = ptransform.DoFnProcessContext('label', state=self.state)
    # Tag to output index map used to dispatch the side output values emitted
    # by the DoFn function to the appropriate receivers. The main output is
    # tagged with None and is associated with its corresponding index.
    tagged_receivers = {}
    tagged_counters = {}
    self._tag_map = {}
    output_tag_prefix = PropertyNames.OUT + '_'
    for index, tag in enumerate(self.spec.output_tags):
      if tag == PropertyNames.OUT:
        original_tag = None
      elif tag.startswith(output_tag_prefix):
        original_tag = tag[len(output_tag_prefix):]
      else:
        raise ValueError('Unexpected output name for operation: %s' % tag)
      # There may be no receiver for this output, in which case the
      # lookup will create one, and this value will be processed
      # for any side effect.  This is desirable.  There are two (known)
      # cases where there is no receiver for an output:
      #  1. ParDo without anything following it, executed for side effect.
      #  2. Partition (shows up here in the worker as Flatten), which
      #     generates a default output that isn't used.
      tagged_receivers[original_tag] = self.receivers[index]
      tagged_counters[original_tag] = self.counters[index]

    self.dofn_runner = common.DoFnRunner(
        fn, args, kwargs, self._read_side_inputs(tags_and_types),
        window_fn, self.context, tagged_receivers, tagged_counters,
        logger, self.step_name)

    self.dofn_runner.start()
 def __init__(self, spec):
   super(BatchGroupAlsoByWindowsOperation, self).__init__(spec)
   self.windowing = pickler.loads(self.spec.window_fn)
 def test_dynamic_class(self):
   """Tests that a nested class object is pickled correctly."""
   self.assertEquals(
       'Z:abc',
       loads(dumps(module_test.create_class('abc'))).get())
 def test_class(self):
   """Tests that a class object is pickled correctly."""
   self.assertEquals(
       ['abc', 'def'],
       loads(dumps(module_test.Xyz))().foo('abc def'))
 def test_object(self):
   """Tests that a class instance is pickled correctly."""
   self.assertEquals(
       ['abc', 'def'],
       loads(dumps(module_test.XYZ_OBJECT)).foo('abc def'))
 def test_lambda_with_closure(self):
   """Tests that the closure of a function is preserved."""
   self.assertEquals(
       'closure: abc',
       loads(dumps(module_test.get_lambda_with_closure('abc')))())
 def test_basics(self):
   self.assertEquals([1, 'a', (u'z',)], loads(dumps([1, 'a', (u'z',)])))
   fun = lambda x: 'xyz-%s' % x
   self.assertEquals('xyz-abc', loads(dumps(fun))('abc'))
 def __init__(self, spec, counter_factory):
   super(StreamingGroupAlsoByWindowsOperation, self).__init__(
       spec, counter_factory)
   self.windowing = pickler.loads(self.spec.window_fn)
 def test_get_coder_can_be_pickled(self):
   coder = typecoders.registry.get_coder(typehints.Tuple[str, int])
   revived_coder = pickler.loads(pickler.dumps(coder))
   self.assertEqual(('abc', 123),
                    revived_coder.decode(revived_coder.encode(('abc', 123))))
def deserialize_coder(serialized):
  from google.cloud.dataflow.internal import pickler
  return pickler.loads(serialized.split('$', 1)[1])
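A minimal round-trip sketch. The assumption (not shown in the snippet) is that the text before the '$' separator is only an informational prefix, such as the coder's class name, which deserialize_coder discards.

def serialize_coder(coder):
  from google.cloud.dataflow.internal import pickler
  # Hypothetical counterpart: a readable prefix, '$', then the pickled coder.
  return '%s$%s' % (coder.__class__.__name__, pickler.dumps(coder))

# For any picklable coder instance, deserialize_coder(serialize_coder(coder))
# should reconstruct an equivalent coder.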