def __init__(self, fn_or_label, *args, **kwargs):
    """Set up the transform's callable, label and side-input arguments.

    Args:
      fn_or_label: Either the callable itself, or a label (None or a string).
        When a label is given, the callable is taken from the first
        positional argument instead.
      *args: Positional arguments kept alongside the callable.
      **kwargs: Keyword arguments kept alongside the callable.

    Raises:
      ValueError: If the callable is a class deriving from
        typehints.WithTypeHints rather than an instance.
      error.SideInputError: If a pvalue.PCollection appears directly in
        args or kwargs.
    """
    if fn_or_label is not None and not isinstance(fn_or_label, basestring):
        label, fn = None, fn_or_label
    else:
        # A label came first, so the callable is the next positional argument.
        label = fn_or_label
        fn = args[0]
        args = args[1:]

    if isinstance(fn, type) and issubclass(fn, typehints.WithTypeHints):
        # Don't treat Fn class objects as callables.
        class_name = fn.__name__
        raise ValueError('Use %s() not %s.' % (class_name, class_name))
    self.fn = self.make_fn(fn)

    # The label is known by now, so the super-class can be initialized.
    super(PTransformWithSideInputs, self).__init__(label=label)

    in_positional = any(isinstance(v, pvalue.PCollection) for v in args)
    if in_positional or any(
            isinstance(v, pvalue.PCollection) for v in kwargs.itervalues()):
        raise error.SideInputError(
            'PCollection used directly as side input argument. Specify '
            'AsIter(pcollection) or AsSingleton(pcollection) to indicate how the '
            'PCollection is to be used.')

    stripped = util.remove_objects_from_args(
        args, kwargs, pvalue.PCollectionView)
    self.args, self.kwargs, self.side_inputs = stripped
    self.raw_side_inputs = args, kwargs

    # Prevent name collisions with fns of the form '<function <lambda> at ...>'
    self._cached_fn = self.fn

    # Ensure fn and side inputs are picklable for remote execution.
    self.fn = pickler.loads(pickler.dumps(self.fn))
    self.args = pickler.loads(pickler.dumps(self.args))
    self.kwargs = pickler.loads(pickler.dumps(self.kwargs))

    # For type hints, because loads(dumps(class)) != class.
    self.fn = self._cached_fn
def __init__(self, fn_or_label, *args, **kwargs):
    """Record the callable, its label, and its extra (side-input) arguments.

    Args:
      fn_or_label: The callable, or a label (None or a string) in which case
        the first element of ``args`` is used as the callable.
      *args: Extra positional arguments for the callable.
      **kwargs: Extra keyword arguments for the callable.

    Raises:
      ValueError: If a typehints.WithTypeHints subclass (the class object,
        not an instance) is passed as the callable.
      error.SideInputError: If any extra argument is a pvalue.PCollection.
    """
    label_given = fn_or_label is None or isinstance(fn_or_label, basestring)
    if label_given:
        label, fn, args = fn_or_label, args[0], args[1:]
    else:
        label, fn = None, fn_or_label

    if isinstance(fn, type) and issubclass(fn, typehints.WithTypeHints):
        # Don't treat Fn class objects as callables.
        raise ValueError('Use %s() not %s.' % (fn.__name__, fn.__name__))
    self.fn = self.make_fn(fn)

    # Now that the label is figured out, initialize the super-class.
    super(PTransformWithSideInputs, self).__init__(label=label)

    def passed_directly(values):
        # True when at least one value is a raw PCollection.
        return any(isinstance(value, pvalue.PCollection) for value in values)

    if passed_directly(args) or passed_directly(kwargs.itervalues()):
        raise error.SideInputError(
            'PCollection used directly as side input argument. Specify '
            'AsIter(pcollection) or AsSingleton(pcollection) to indicate how the '
            'PCollection is to be used.')

    self.args, self.kwargs, self.side_inputs = (
        util.remove_objects_from_args(args, kwargs, pvalue.PCollectionView))
    self.raw_side_inputs = (args, kwargs)

    # Prevent name collisions with fns of the form '<function <lambda> at ...>'
    self._cached_fn = self.fn

    # Round-trip fn and side inputs through the pickler so that anything
    # unpicklable fails here rather than at remote execution time.
    self.fn = pickler.loads(pickler.dumps(self.fn))
    self.args = pickler.loads(pickler.dumps(self.args))
    self.kwargs = pickler.loads(pickler.dumps(self.kwargs))

    # For type hints, because loads(dumps(class)) != class.
    self.fn = self._cached_fn
def test_nested_class(self):
    """Tests that a nested class object is pickled correctly."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        'X:abc',
        loads(dumps(module_test.TopClass.NestedClass('abc'))).datum)
    self.assertEqual(
        'Y:abc',
        loads(dumps(
            module_test.TopClass.MiddleClass.NestedClass('abc'))).datum)
def __init__(self, spec):
    """Unpickle the window fn and, when present, the combine fn of the spec.

    Args:
      spec: Operation spec; its window_fn, combine_fn and phase attributes
        are read here.
    """
    super(BatchGroupAlsoByWindowsOperation, self).__init__(spec)
    self.windowing = pickler.loads(self.spec.window_fn)

    if not self.spec.combine_fn:
        self.phased_combine_fn = None
        return

    # Combiners do not accept deferred side-inputs (the ignored fourth
    # argument) and therefore the code to handle the extra args/kwargs is
    # simpler than for the DoFn's of ParDo.
    unpickled = pickler.loads(self.spec.combine_fn)
    fn, args, kwargs = unpickled[:3]
    self.phased_combine_fn = PhasedCombineFnExecutor(
        self.spec.phase, fn, args, kwargs)
def __init__(self, spec, counter_factory):
    """Unpickle the window fn and build the optional phased combine executor.

    Args:
      spec: Operation spec; its window_fn, combine_fn and phase attributes
        are read here.
      counter_factory: Passed through unchanged to the super-class.
    """
    super(BatchGroupAlsoByWindowsOperation, self).__init__(
        spec, counter_factory)
    self.windowing = pickler.loads(self.spec.window_fn)

    phased_fn = None
    if self.spec.combine_fn:
        # Combiners do not accept deferred side-inputs (the ignored fourth
        # argument) and therefore the code to handle the extra args/kwargs is
        # simpler than for the DoFn's of ParDo.
        fn, args, kwargs = pickler.loads(self.spec.combine_fn)[:3]
        phased_fn = PhasedCombineFnExecutor(self.spec.phase, fn, args, kwargs)
    self.phased_combine_fn = phased_fn
def start(self):
    """Unpickle the DoFn and start a runner wired to this op's receivers.

    Raises:
      ValueError: If an output tag is neither PropertyNames.OUT nor prefixed
        with PropertyNames.OUT followed by an underscore.
    """
    super(DoOperation, self).start()

    # See fn_data in dataflow_runner.py
    unpickled = pickler.loads(self.spec.serialized_fn)
    fn, args, kwargs, tags_and_types, window_fn = unpickled

    self.state.step_name = self.step_name

    # TODO(silviuc): What is the proper label here? PCollection being processed?
    self.context = ptransform.DoFnProcessContext('label', state=self.state)

    # Tag to output index map used to dispatch the side output values emitted
    # by the DoFn function to the appropriate receivers. The main output is
    # tagged with None and is associated with its corresponding index.
    receivers_by_tag = {}
    side_prefix = PropertyNames.OUT + '_'
    for position, output_name in enumerate(self.spec.output_tags):
        if output_name == PropertyNames.OUT:
            key = None
        elif output_name.startswith(side_prefix):
            key = output_name[len(side_prefix):]
        else:
            raise ValueError(
                'Unexpected output name for operation: %s' % output_name)
        receivers_by_tag[key] = self.receivers[position]

    side_inputs = self._read_side_inputs(tags_and_types)
    self.dofn_runner = common.DoFnRunner(
        fn, args, kwargs, side_inputs, window_fn, self.context,
        receivers_by_tag, logger, self.step_name)
    self.dofn_runner.start()
def test_get_coder_with_composite_custom_coder(self):
    """A KV coder over a registered custom coder still works after pickling."""
    typecoders.registry.register_coder(CustomClass, CustomCoder)
    kv_type = typehints.KV[CustomClass, str]
    coder = typecoders.registry.get_coder(kv_type)

    revived_coder = pickler.loads(pickler.dumps(coder))

    expected = (CustomClass(123), 'abc')
    encoded = revived_coder.encode((CustomClass(123), 'abc'))
    self.assertEqual(expected, revived_coder.decode(encoded))
def test_get_coder_with_composite_custom_coder(self):
    """Round-trips a value through a pickled-and-revived KV coder."""
    registry = typecoders.registry
    registry.register_coder(CustomClass, CustomCoder)
    pickled = pickler.dumps(registry.get_coder(typehints.KV[CustomClass, str]))
    revived_coder = pickler.loads(pickled)

    want = (CustomClass(123), 'abc')
    got = revived_coder.decode(
        revived_coder.encode((CustomClass(123), 'abc')))
    self.assertEqual(want, got)
def start(self):
    """Start the operation: load the pickled DoFn and launch its runner.

    Raises:
      ValueError: If the spec lists an output tag that is not
        PropertyNames.OUT and does not start with PropertyNames.OUT + '_'.
    """
    super(DoOperation, self).start()

    # See fn_data in dataflow_runner.py
    fn, args, kwargs, tags_and_types, window_fn = pickler.loads(
        self.spec.serialized_fn)

    self.state.step_name = self.step_name

    # TODO(silviuc): What is the proper label here? PCollection being processed?
    self.context = ptransform.DoFnProcessContext('label', state=self.state)

    # Tag to output index map used to dispatch the side output values emitted
    # by the DoFn function to the appropriate receivers. The main output is
    # tagged with None and is associated with its corresponding index.
    tagged_receivers = {}
    output_tag_prefix = PropertyNames.OUT + '_'
    for index, tag in enumerate(self.spec.output_tags):
        is_main_output = tag == PropertyNames.OUT
        if not is_main_output and not tag.startswith(output_tag_prefix):
            raise ValueError('Unexpected output name for operation: %s' % tag)
        if is_main_output:
            original_tag = None
        else:
            original_tag = tag[len(output_tag_prefix):]
        tagged_receivers[original_tag] = self.receivers[index]

    self.dofn_runner = common.DoFnRunner(
        fn,
        args,
        kwargs,
        self._read_side_inputs(tags_and_types),
        window_fn,
        self.context,
        tagged_receivers,
        logger,
        self.step_name)
    self.dofn_runner.start()
def test_create_do_avro_write(self):
    """Runs a read -> DoFn -> text sink map task and checks the written value."""
    output_path = self.create_temp_file('n/a')
    elements = ['abc', 'def', 'ghi']
    work_item = workitem.BatchWorkItem(None)

    read_op = maptask.WorkerRead(
        inmemory.InMemorySource(
            elements=[pickler.dumps(e) for e in elements],
            start_index=2,  # Start at the last element.
            end_index=3),
        output_coders=[self.OUTPUT_CODER])
    do_op = maptask.WorkerDoFn(
        serialized_fn=pickle_with_side_inputs(
            ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
        output_tags=['out'],
        input=(0, 0),
        side_inputs=None,
        output_coders=[self.OUTPUT_CODER])
    sink_op = make_text_sink(
        output_path, input=(1, 0), coder=coders.Base64PickleCoder())

    work_item.map_task = make_map_task([read_op, do_op, sink_op])
    executor.MapTaskExecutor(work_item.map_task).execute()

    with open(output_path) as f:
        written = f.read()
    self.assertEqual('XYZ: ghi', pickler.loads(written.strip()))
def test_lambda_with_globals(self):
    """Tests that the globals of a function are preserved."""
    # The point of the test is that the lambda being called after unpickling
    # relies on having the re module being loaded.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        ['abc', 'def'],
        loads(dumps(module_test.get_lambda_with_globals()))('abc def'))
def __init__(self, spec, counter_factory):
    """Unpickle the combine fn and wrap it in a phased executor.

    Args:
      spec: Operation spec; its serialized_fn and phase attributes are read.
      counter_factory: Passed through unchanged to the super-class.
    """
    super(CombineOperation, self).__init__(spec, counter_factory)
    # Combiners do not accept deferred side-inputs (the ignored fourth argument)
    # and therefore the code to handle the extra args/kwargs is simpler than for
    # the DoFn's of ParDo.
    unpickled = pickler.loads(self.spec.serialized_fn)
    fn, args, kwargs = unpickled[:3]
    self.phased_combine_fn = PhasedCombineFnExecutor(
        self.spec.phase, fn, args, kwargs)
def __init__(self, spec, counter_factory):
    """Build the phased combine executor from the pickled combine fn.

    Args:
      spec: Operation spec providing serialized_fn and phase.
      counter_factory: Forwarded to the super-class initializer.
    """
    super(CombineOperation, self).__init__(spec, counter_factory)
    # Only the first three unpickled items are used: combiners do not accept
    # deferred side-inputs (the ignored fourth argument), so the extra
    # args/kwargs handling is simpler than for the DoFn's of ParDo.
    fn_parts = pickler.loads(self.spec.serialized_fn)[:3]
    fn, args, kwargs = fn_parts
    phase = self.spec.phase
    self.phased_combine_fn = PhasedCombineFnExecutor(phase, fn, args, kwargs)
def get_custom_source_read_spec(source_spec):
    """Build a WorkerRead for a pickled custom bounded source.

    Args:
      source_spec: Mapping whose ['serialized_source']['value'] entry holds
        the pickled source info, expected to be a 3-tuple whose first item
        is an iobase.BoundedSource.

    Returns:
      A WorkerRead wrapping a NativeBoundedSource built from the three tuple
      items, with the source's default output coder as its only coder.
    """
    serialized = source_spec['serialized_source']['value']
    source_info = pickler.loads(serialized)

    assert isinstance(source_info, tuple)
    assert len(source_info) == 3
    bounded_source = source_info[0]
    assert isinstance(bounded_source, iobase.BoundedSource)

    native_source = workercustomsources.NativeBoundedSource(
        bounded_source, source_info[1], source_info[2])
    return WorkerRead(native_source, [bounded_source.default_output_coder()])
def __init__(self, spec, counter_factory):
    """Unpickle and curry the combine fn, and set up the empty key table.

    Args:
      spec: Operation spec; its combine_fn attribute is read here.
      counter_factory: Passed through unchanged to the super-class.
    """
    super(PGBKCVOperation, self).__init__(spec, counter_factory)
    # Combiners do not accept deferred side-inputs (the ignored fourth
    # argument) and therefore the code to handle the extra args/kwargs is
    # simpler than for the DoFn's of ParDo.
    unpickled = pickler.loads(self.spec.combine_fn)
    fn, args, kwargs = unpickled[:3]
    self.combine_fn = curry_combine_fn(fn, args, kwargs)
    # Optimization for the (known tiny accumulator, often wide keyspace)
    # count function.
    # TODO(robertwb): Bound by in-memory size rather than key count.
    if isinstance(fn, combiners.CountCombineFn):
        self.max_keys = 1000000
    else:
        self.max_keys = 10000
    self.key_count = 0
    self.table = {}
def __init__(self, spec, counter_factory):
    """Prepare the curried combine fn and the key-count limit.

    Args:
      spec: Operation spec providing the pickled combine_fn.
      counter_factory: Forwarded to the super-class initializer.
    """
    super(PGBKCVOperation, self).__init__(spec, counter_factory)
    # Only the first three unpickled items are used: combiners do not accept
    # deferred side-inputs (the ignored fourth argument), so the extra
    # args/kwargs handling is simpler than for the DoFn's of ParDo.
    fn, args, kwargs = pickler.loads(self.spec.combine_fn)[:3]
    self.combine_fn = curry_combine_fn(fn, args, kwargs)

    # Optimization for the (known tiny accumulator, often wide keyspace)
    # count function.
    # TODO(robertwb): Bound by in-memory size rather than key count.
    key_limit = 10000
    if isinstance(fn, combiners.CountCombineFn):
        key_limit = 1000000
    self.max_keys = key_limit

    self.key_count = 0
    self.table = {}
def __init__(self, spec):
    """Unpickle the combine fn and pick the apply method for the spec's phase.

    Args:
      spec: Operation spec; its serialized_fn and phase attributes are read.

    Raises:
      ValueError: If spec.phase is not 'all', 'add', 'merge' or 'extract'.
    """
    super(CombineOperation, self).__init__(spec)
    # Combiners do not accept deferred side-inputs (the ignored fourth argument)
    # and therefore the code to handle the extra args/kwargs is simpler than for
    # the DoFn's of ParDo.
    fn, args, kwargs = pickler.loads(self.spec.serialized_fn)[:3]

    if args or kwargs:
        # Wrap fn so every CombineFn call also receives the extra
        # args/kwargs that were pickled along with it.
        class CurriedFn(ptransform.CombineFn):

            def create_accumulator(self):
                return fn.create_accumulator(*args, **kwargs)

            def add_input(self, accumulator, element):
                return fn.add_input(accumulator, element, *args, **kwargs)

            def add_inputs(self, accumulator, elements):
                return fn.add_inputs(accumulator, elements, *args, **kwargs)

            def merge_accumulators(self, accumulators):
                return fn.merge_accumulators(accumulators, *args, **kwargs)

            def extract_output(self, accumulator):
                return fn.extract_output(accumulator, *args, **kwargs)

            def apply(self, elements):
                return fn.apply(elements, *args, **kwargs)

        self.combine_fn = CurriedFn()
    else:
        # Nothing to curry: use the unpickled fn directly.
        self.combine_fn = fn

    phase = self.spec.phase
    if phase == 'all':
        self.apply = self.full_combine
    elif phase == 'add':
        self.apply = self.add_only
    elif phase == 'merge':
        self.apply = self.merge_only
    elif phase == 'extract':
        self.apply = self.extract_only
    else:
        raise ValueError('Unexpected phase: %s' % phase)
def test_create_do_avro_write(self):
    """Executes a read -> DoFn -> write map task and checks the written value."""
    output_path = self.create_temp_file('n/a')
    elements = ['abc', 'def', 'ghi']
    task_executor = executor.MapTaskExecutor()

    read_op = maptask.WorkerRead(
        inmemory.InMemorySource(
            elements=[pickler.dumps(e) for e in elements],
            start_index=2,  # Start at the last element.
            end_index=3),
        tag=None)
    do_op = maptask.WorkerDoFn(
        serialized_fn=pickle_with_side_inputs(
            ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
        output_tags=['out'],
        input=(0, 0),
        side_inputs=None)
    write_op = maptask.WorkerWrite(
        fileio.TextFileSink(
            file_path_prefix=output_path,
            append_trailing_newlines=True,
            coder=coders.Base64PickleCoder()),
        input=(1, 0))

    task_executor.execute(make_map_task([read_op, do_op, write_op]))

    with open(output_path) as f:
        written = f.read()
    self.assertEqual('XYZ: ghi', pickler.loads(written.strip()))
def __init__(self, source_operation_split_proto):
    """Extract the pickled source and desired bundle size from a split proto.

    Args:
      source_operation_split_proto: Split request; its
        source.spec.additionalProperties and options.desiredBundleSizeBytes
        fields are read here.

    Raises:
      ValueError: If the source spec has no names.SERIALIZED_SOURCE_KEY entry.
    """
    source_spec = {
        p.key: from_json_value(p.value)
        for p in source_operation_split_proto.source.spec.additionalProperties
    }
    # `in` replaces the deprecated dict.has_key().
    if names.SERIALIZED_SOURCE_KEY not in source_spec:
        # The proto used to be passed as a second exception argument
        # (logging style), so the %r placeholder was never filled in.
        raise ValueError(
            'Source split spec must contain a serialized source. Received: %r'
            % (source_operation_split_proto,))
    # NOTE(review): unpickling can execute arbitrary code; this presumably
    # relies on the split request coming from a trusted service - confirm.
    self.source = pickler.loads(
        source_spec[names.SERIALIZED_SOURCE_KEY]['value'])
    assert self.source is not None
    assert isinstance(self.source, iobase.BoundedSource)

    desired_bundle_size_bytes = (
        source_operation_split_proto.options.desiredBundleSizeBytes)
    # A missing or zero size falls back to the default.
    if not desired_bundle_size_bytes:
        self.desired_bundle_size_bytes = DEFAULT_DESIRED_BUNDLE_SIZE
    else:
        self.desired_bundle_size_bytes = long(desired_bundle_size_bytes)
def start(self):
    """Load the pickled DoFn and start a runner with receivers and counters.

    Raises:
      ValueError: If an output tag is neither PropertyNames.OUT nor prefixed
        with PropertyNames.OUT followed by an underscore.
    """
    super(DoOperation, self).start()

    # See fn_data in dataflow_runner.py
    unpickled = pickler.loads(self.spec.serialized_fn)
    fn, args, kwargs, tags_and_types, window_fn = unpickled

    self.state.step_name = self.step_name

    # TODO(silviuc): What is the proper label here? PCollection being processed?
    self.context = ptransform.DoFnProcessContext('label', state=self.state)

    # Tag to output index map used to dispatch the side output values emitted
    # by the DoFn function to the appropriate receivers. The main output is
    # tagged with None and is associated with its corresponding index.
    tagged_receivers = {}
    tagged_counters = {}
    self._tag_map = {}
    output_tag_prefix = PropertyNames.OUT + '_'

    def to_original_tag(tag):
        # Map a spec output name back to the tag used by the DoFn.
        if tag == PropertyNames.OUT:
            return None
        if tag.startswith(output_tag_prefix):
            return tag[len(output_tag_prefix):]
        raise ValueError('Unexpected output name for operation: %s' % tag)

    for index, tag in enumerate(self.spec.output_tags):
        original_tag = to_original_tag(tag)
        # There may be no receiver for this output, in which case the
        # lookup will create one, and this value will be processed
        # for any side effect. This is desirable. There are two (known)
        # cases where there is no receiver for an output:
        # 1. ParDo without anything following it, executed for side effect.
        # 2. Partition (shows up here in the worker as Flatten), which
        #    generates a default output that isn't used.
        tagged_receivers[original_tag] = self.receivers[index]
        tagged_counters[original_tag] = self.counters[index]

    side_inputs = self._read_side_inputs(tags_and_types)
    self.dofn_runner = common.DoFnRunner(
        fn, args, kwargs, side_inputs, window_fn, self.context,
        tagged_receivers, tagged_counters, logger, self.step_name)
    self.dofn_runner.start()
def __init__(self, source_operation_split_proto):
    """Read the pickled source and the desired bundle size from a split proto.

    Args:
      source_operation_split_proto: Split request whose
        source.spec.additionalProperties and options.desiredBundleSizeBytes
        fields are consumed.

    Raises:
      ValueError: If names.SERIALIZED_SOURCE_KEY is missing from the source
        spec.
    """
    source_spec = {
        p.key: from_json_value(p.value)
        for p in source_operation_split_proto.source.spec.additionalProperties
    }
    # Membership test instead of the deprecated dict.has_key().
    if names.SERIALIZED_SOURCE_KEY not in source_spec:
        # Format the message explicitly: exceptions do not interpolate extra
        # arguments the way logging calls do.
        raise ValueError(
            'Source split spec must contain a serialized source. Received: %r'
            % (source_operation_split_proto,))
    # NOTE(review): pickler.loads on request data can run arbitrary code;
    # presumably the request is trusted - confirm.
    serialized_source = source_spec[names.SERIALIZED_SOURCE_KEY]['value']
    self.source = pickler.loads(serialized_source)
    assert self.source is not None
    assert isinstance(self.source, iobase.BoundedSource)

    desired_bundle_size_bytes = (
        source_operation_split_proto.options.desiredBundleSizeBytes)
    # Fall back to the default when the option is unset or zero.
    if not desired_bundle_size_bytes:
        self.desired_bundle_size_bytes = DEFAULT_DESIRED_BUNDLE_SIZE
    else:
        self.desired_bundle_size_bytes = long(desired_bundle_size_bytes)
def __init__(self, spec):
    """Initialize the operation and unpickle the spec's window fn.

    Args:
      spec: Operation spec; its window_fn attribute holds the pickled value.
    """
    super(BatchGroupAlsoByWindowsOperation, self).__init__(spec)
    pickled_window_fn = self.spec.window_fn
    self.windowing = pickler.loads(pickled_window_fn)
def test_dynamic_class(self):
    """Tests that an object built by module_test.create_class is pickled correctly."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        'Z:abc', loads(dumps(module_test.create_class('abc'))).get())
def test_class(self):
    """Tests that a class object is pickled correctly."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        ['abc', 'def'], loads(dumps(module_test.Xyz))().foo('abc def'))
def test_object(self):
    """Tests that a class instance is pickled correctly."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        ['abc', 'def'], loads(dumps(module_test.XYZ_OBJECT)).foo('abc def'))
def test_get_coder_can_be_pickled(self):
    """A registry coder still encodes and decodes after a pickle round trip."""
    tuple_type = typehints.Tuple[str, int]
    coder = typecoders.registry.get_coder(tuple_type)

    revived_coder = pickler.loads(pickler.dumps(coder))

    encoded = revived_coder.encode(('abc', 123))
    self.assertEqual(('abc', 123), revived_coder.decode(encoded))
def test_lambda_with_closure(self):
    """Tests that the closure of a function is preserved."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        'closure: abc',
        loads(dumps(module_test.get_lambda_with_closure('abc')))())
def test_basics(self):
    """Tests that plain data and a simple lambda survive a pickle round trip."""
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual([1, 'a', (u'z',)], loads(dumps([1, 'a', (u'z',)])))
    # Kept as a lambda on purpose: pickling a lambda is what is being tested.
    fun = lambda x: 'xyz-%s' % x
    self.assertEqual('xyz-abc', loads(dumps(fun))('abc'))
def __init__(self, spec, counter_factory):
    """Initialize the operation and unpickle the spec's window fn.

    Args:
      spec: Operation spec; its window_fn attribute holds the pickled value.
      counter_factory: Passed through unchanged to the super-class.
    """
    base = super(StreamingGroupAlsoByWindowsOperation, self)
    base.__init__(spec, counter_factory)
    self.windowing = pickler.loads(self.spec.window_fn)
def test_get_coder_can_be_pickled(self):
    """Checks that a pickled-and-revived coder round-trips a tuple value."""
    pickled = pickler.dumps(
        typecoders.registry.get_coder(typehints.Tuple[str, int]))
    revived_coder = pickler.loads(pickled)

    want = ('abc', 123)
    got = revived_coder.decode(revived_coder.encode(('abc', 123)))
    self.assertEqual(want, got)
def __init__(self, spec, counter_factory):
    """Set up the streaming operation and load its windowing from the spec.

    Args:
      spec: Operation spec providing the pickled window_fn.
      counter_factory: Forwarded to the super-class initializer.
    """
    super(StreamingGroupAlsoByWindowsOperation, self).__init__(
        spec, counter_factory)
    pickled_window_fn = self.spec.window_fn
    self.windowing = pickler.loads(pickled_window_fn)
def deserialize_coder(serialized):
    """Unpickle the part of ``serialized`` that follows its first '$'.

    Args:
      serialized: String of the form '<prefix>$<pickled payload>'.

    Returns:
      Whatever pickler.loads produces for the payload.

    Raises:
      IndexError: If ``serialized`` contains no '$' separator.
    """
    # Imported at call time rather than at module level.
    from google.cloud.dataflow.internal import pickler
    pieces = serialized.split('$', 1)
    payload = pieces[1]
    return pickler.loads(payload)