def __init__(self, fn_or_label, *args, **kwargs):
  if fn_or_label is None or isinstance(fn_or_label, basestring):
    label = fn_or_label
    fn, args = args[0], args[1:]
  else:
    label = None
    fn = fn_or_label
  if isinstance(fn, type) and issubclass(fn, typehints.WithTypeHints):
    # Don't treat Fn class objects as callables.
    raise ValueError('Use %s() not %s.' % (fn.__name__, fn.__name__))
  self.fn = self.make_fn(fn)
  # Now that we have figured out the label, initialize the super-class.
  super(PTransformWithSideInputs, self).__init__(label=label)

  if (any([isinstance(v, pvalue.PCollection) for v in args]) or
      any([isinstance(v, pvalue.PCollection) for v in kwargs.itervalues()])):
    raise error.SideInputError(
        'PCollection used directly as side input argument. Specify '
        'AsIter(pcollection) or AsSingleton(pcollection) to indicate how the '
        'PCollection is to be used.')
  self.args, self.kwargs, self.side_inputs = util.remove_objects_from_args(
      args, kwargs, pvalue.PCollectionView)
  self.raw_side_inputs = args, kwargs

  # Prevent name collisions with fns of the form '<function <lambda> at ...>'.
  self._cached_fn = self.fn

  # Ensure fn and side inputs are picklable for remote execution.
  self.fn = pickler.loads(pickler.dumps(self.fn))
  self.args = pickler.loads(pickler.dumps(self.args))
  self.kwargs = pickler.loads(pickler.dumps(self.kwargs))

  # For type hints, because loads(dumps(class)) != class.
  self.fn = self._cached_fn

def test_create_do_with_side_in_memory_write(self):
  elements = ['abc', 'def', 'ghi']
  side_elements = ['x', 'y', 'z']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=3),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, side)]),
              tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                            (False, None))),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=[
              maptask.WorkerSideInputSource(
                  inmemory.InMemorySource(
                      elements=[pickler.dumps(e) for e in side_elements],
                      start_index=None,
                      end_index=None),
                  tag='inmemory')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as singleton therefore we should see
  # only the first element appended.
  self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)

def test_nested_class(self):
  """Tests that a nested class object is pickled correctly."""
  self.assertEquals(
      'X:abc',
      loads(dumps(module_test.TopClass.NestedClass('abc'))).datum)
  self.assertEquals(
      'Y:abc',
      loads(dumps(module_test.TopClass.MiddleClass.NestedClass('abc'))).datum)

def test_get_coder_with_composite_custom_coder(self):
  typecoders.registry.register_coder(CustomClass, CustomCoder)
  coder = typecoders.registry.get_coder(typehints.KV[CustomClass, str])
  revived_coder = pickler.loads(pickler.dumps(coder))
  self.assertEqual(
      (CustomClass(123), 'abc'),
      revived_coder.decode(revived_coder.encode((CustomClass(123), 'abc'))))

def run_GroupByKey(self, transform_node):
  input_tag = transform_node.inputs[0].tag
  input_step = self._cache.get_pvalue(transform_node.inputs[0])
  step = self._add_step(
      TransformNames.GROUP, transform_node.full_label, transform_node)
  step.add_property(
      PropertyNames.PARALLEL_INPUT,
      {'@type': 'OutputReference',
       PropertyNames.STEP_NAME: input_step.proto.name,
       PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
  step.encoding = self._get_typehint_based_encoding(
      self._get_transform_type_hint(transform_node))
  step.add_property(
      PropertyNames.OUTPUT_INFO,
      [{PropertyNames.USER_NAME: (
          '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
        PropertyNames.ENCODING: step.encoding,
        PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])
  windowing = transform_node.transform.get_windowing(
      transform_node.inputs)
  step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(windowing))

def test_create_do_avro_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=2,  # Start at the last element.
              end_index=3),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=None,
          output_coders=[self.OUTPUT_CODER]),
      make_text_sink(output_path,
                     input=(1, 0),
                     coder=coders.Base64PickleCoder())])
  executor.MapTaskExecutor(work_item.map_task).execute()
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))

def test_create_do_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              # Start at the last element.
              start_index=2,
              # Go beyond the end to test that case is handled.
              end_index=15),
          output_coders=[coders.ToStringCoder()]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'],
          output_coders=[self.OUTPUT_CODER],
          input=(0, 0),
          side_inputs=None),
      maptask.WorkerWrite(
          fileio.TextFileSink(
              file_path_prefix=output_path,
              append_trailing_newlines=True,
              coder=coders.ToStringCoder()),
          input=(1, 0),
          output_coders=(coders.ToStringCoder(),))]))
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi\n', f.read())

def test_create_do_with_side_text_file_write(self):
  input_path = self.create_temp_file('x\ny\n')
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path,
                      start_offset=None,
                      end_offset=None,
                      strip_trailing_newlines=True,
                      coder=coders.StrUtf8Coder()),
                  tag='textfile')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as collection therefore we should see
  # all elements of the side source.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def run_CombineValues(self, transform_node):
  transform = transform_node.transform
  input_tag = transform_node.inputs[0].tag
  input_step = self._cache.get_pvalue(transform_node.inputs[0])
  step = self._add_step(
      TransformNames.COMBINE, transform_node.full_label, transform_node)
  # Combiner functions do not take deferred side inputs (i.e. PValues), so
  # the code to handle extra args/kwargs is simpler than for the DoFns of the
  # ParDo transform. The last, empty argument is where side input
  # information would go.
  fn_data = (transform.fn, transform.args, transform.kwargs, ())
  step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(fn_data))
  step.add_property(
      PropertyNames.PARALLEL_INPUT,
      {'@type': 'OutputReference',
       PropertyNames.STEP_NAME: input_step.proto.name,
       PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
  # Note that the accumulator must not have a WindowedValue encoding, while
  # the output of this step does in fact have a WindowedValue encoding.
  accumulator_encoding = self._get_encoded_output_coder(transform_node,
                                                        window_value=False)
  output_encoding = self._get_encoded_output_coder(transform_node)

  step.encoding = output_encoding
  step.add_property(PropertyNames.ENCODING, accumulator_encoding)
  # Generate description for main output 'out'.
  outputs = []
  # Add the main output to the description.
  outputs.append(
      {PropertyNames.USER_NAME: (
          '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
       PropertyNames.ENCODING: step.encoding,
       PropertyNames.OUTPUT_NAME: PropertyNames.OUT})
  step.add_property(PropertyNames.OUTPUT_INFO, outputs)

def test_lambda_with_globals(self):
  """Tests that the globals of a function are preserved."""
  # The point of the test is that the lambda being called after unpickling
  # relies on the re module being loaded.
  self.assertEquals(
      ['abc', 'def'],
      loads(dumps(module_test.get_lambda_with_globals()))('abc def'))

def pickle_with_side_inputs(fn, tag_and_type=None):
  tags_and_types = []
  args = []
  if tag_and_type is not None:
    args.append(util.ArgumentPlaceholder())
    tags_and_types.append(tag_and_type)
  return pickler.dumps(
      (fn, args, {}, tags_and_types,
       core.Windowing(window.GlobalWindows())))

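The helper above just pickles the (fn, args, kwargs, tags_and_types, windowing) tuple that the worker-side DoFn steps unpickle. A minimal round-trip sketch of that contract, with illustrative placeholder values (the 'sometag' tag and the identity DoFn are not from the tests above):

# Sketch only: round-trip the payload produced by pickle_with_side_inputs()
# and unpack it into the tuple shape the worker-side code expects.
serialized = pickle_with_side_inputs(
    ptransform.CallableWrapperDoFn(lambda x: [x]),
    tag_and_type=('sometag', pvalue.IterablePCollectionView, ()))
fn, args, kwargs, tags_and_types, windowing = pickler.loads(serialized)
assert isinstance(args[0], util.ArgumentPlaceholder)
assert tags_and_types[0][0] == 'sometag'
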
def test_create_do_with_side_avro_file_write(self):
  input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
  input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('sometag', pvalue.IterablePCollectionView, ())),
          output_tags=['out'],
          input=(0, 0),
          # Note that the two side inputs have the same tag. This is quite
          # common for intermediary PCollections used as side inputs that
          # are saved as AVRO files. The files will contain the sharded
          # PCollection.
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path1,
                      coder=coders.Base64PickleCoder()),
                  tag='sometag'),
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path2,
                      coder=coders.Base64PickleCoder()),
                  tag='sometag')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as collection therefore we should see
  # all elements of the side source.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def test_create_do_with_side_avro_file_write(self):
  input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
  input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
  elements = ['aa', 'bb']
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=2),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(
                  lambda x, side: ['%s:%s' % (x, s) for s in side]),
              tag_and_type=('sometag', False)),  # False => type is collection.
          output_tags=['out'],
          input=(0, 0),
          # Note that the two side inputs have the same tag. This is quite
          # common for intermediary PCollections used as side inputs that
          # are saved as AVRO files. The files will contain the sharded
          # PCollection.
          side_inputs=[
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path1,
                      coder=coders.Base64PickleCoder()),
                  tag='sometag'),
              maptask.WorkerSideInputSource(
                  fileio.TextFileSource(
                      file_path=input_path2,
                      coder=coders.Base64PickleCoder()),
                  tag='sometag')],
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))]))
  # The side source was specified as collection therefore we should see
  # all elements of the side source.
  self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                   sorted(output_buffer))

def test_pgbk(self):
  elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=100),
          tag=None),
      maptask.WorkerPartialGroupByKey(input=(0, 0)),
      maptask.WorkerInMemoryWrite(output_buffer=output_buffer, input=(1, 0))
  ]))
  self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))

def test_create_do_with_collection_side_bigquery_write(self):
  elements = ['aa', 'bb']
  side_elements = ['x', 'y']
  output_buffer = []
  patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
  with mock.patch(target=patch_target) as mock_class:
    # Setup the reader so it will yield the values in 'side_elements'.
    reader_mock = mock_class.return_value
    reader_mock.__enter__.return_value = reader_mock
    # Use a lambda so that multiple readers can be created, each reading the
    # entirety of the side elements.
    reader_mock.__iter__.side_effect = lambda: (x for x in side_elements)

    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0,
                end_index=3),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, s) for s in side]),
                tag_and_type=('bigquery', pvalue.IterablePCollectionView,
                              ())),
            output_tags=['out'],
            input=(0, 0),
            side_inputs=[
                maptask.WorkerSideInputSource(
                    bigquery.BigQuerySource(
                        project='project',
                        dataset='dataset',
                        table='table',
                        coder=get_bigquery_source_coder()),
                    tag='bigquery')],
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            input=(1, 0),
            output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as collection therefore we should see
    # all elements of the side source.
    self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'], sorted(output_buffer))

def test_in_memory_source_progress_reporting(self):
  elements = [101, 201, 301, 401, 501, 601, 701]
  output_buffer = []
  source = ProgressRequestRecordingInMemorySource(
      elements=[pickler.dumps(e) for e in elements])
  work_item = workitem.BatchWorkItem(None)
  work_item.map_task = make_map_task([
      maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(0, 0),
          output_coders=(self.OUTPUT_CODER,))
  ])
  executor.MapTaskExecutor(work_item.map_task).execute()
  self.assertEqual(elements, output_buffer)

  expected_progress_record = range(len(elements))
  self.assertEqual(expected_progress_record,
                   source.last_reader.progress_record)

def test_combine(self):
  elements = [('a', [1, 2, 3]), ('b', [10])]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=100),
          tag=None),
      maptask.WorkerCombineFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CombineFn.from_callable(sum)),
          phase='all',
          input=(0, 0)),
      maptask.WorkerInMemoryWrite(output_buffer=output_buffer, input=(1, 0))
  ]))
  self.assertEqual([('a', 6), ('b', 10)], output_buffer)

def test_pgbk(self):
  elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=100),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerPartialGroupByKey(
          combine_fn=None,
          input=(0, 0),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))

def test_create_do_avro_write(self):
  output_path = self.create_temp_file('n/a')
  elements = ['abc', 'def', 'ghi']
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=2,  # Start at the last element.
              end_index=3),
          tag=None),
      maptask.WorkerDoFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
          output_tags=['out'],
          input=(0, 0),
          side_inputs=None),
      maptask.WorkerWrite(
          fileio.TextFileSink(
              file_path_prefix=output_path,
              append_trailing_newlines=True,
              coder=coders.Base64PickleCoder()),
          input=(1, 0))]))
  with open(output_path) as f:
    self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))

def test_combine(self):
  elements = [('a', [1, 2, 3]), ('b', [10])]
  output_buffer = []
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(
          inmemory.InMemorySource(
              elements=[pickler.dumps(e) for e in elements],
              start_index=0,
              end_index=100),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerCombineFn(
          serialized_fn=pickle_with_side_inputs(
              ptransform.CombineFn.from_callable(sum)),
          phase='all',
          input=(0, 0),
          output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(1, 0),
          output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual([('a', 6), ('b', 10)], output_buffer)

def test_in_memory_source_progress_reporting(self):
  elements = [101, 201, 301, 401, 501, 601, 701]
  output_buffer = []
  source = ProgressRequestRecordingInMemorySource(
      elements=[pickler.dumps(e) for e in elements])
  executor.MapTaskExecutor().execute(make_map_task([
      maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
      maptask.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          input=(0, 0),
          output_coders=(self.OUTPUT_CODER,))
  ]))
  self.assertEqual(elements, output_buffer)

  expected_progress_record = []
  len_elements = len(elements)
  for i in range(len_elements):
    expected_progress_record.append(float(i + 1) / len_elements)
  self.assertEqual(expected_progress_record,
                   source.last_reader.progress_record)

def run_GroupByKey(self, transform_node):
  input_tag = transform_node.inputs[0].tag
  input_step = self._cache.get_pvalue(transform_node.inputs[0])
  step = self._add_step(
      TransformNames.GROUP, transform_node.full_label, transform_node)
  step.add_property(
      PropertyNames.PARALLEL_INPUT,
      {'@type': 'OutputReference',
       PropertyNames.STEP_NAME: input_step.proto.name,
       PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
  step.encoding = self._get_encoded_output_coder(transform_node)
  step.add_property(
      PropertyNames.OUTPUT_INFO,
      [{PropertyNames.USER_NAME: (
          '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
        PropertyNames.ENCODING: step.encoding,
        PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])
  windowing = transform_node.transform.get_windowing(
      transform_node.inputs)
  step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(windowing))

def test_create_do_with_singleton_side_bigquery_write(self):
  elements = ['abc', 'def', 'ghi']
  side_elements = ['x', 'y', 'z']
  output_buffer = []
  patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
  with mock.patch(target=patch_target) as mock_class:
    # Setup the reader so it will yield the values in 'side_elements'.
    reader_mock = mock_class.return_value
    reader_mock.__enter__.return_value = reader_mock
    reader_mock.__iter__.return_value = (x for x in side_elements)

    pickled_elements = [pickler.dumps(e) for e in elements]
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(
            inmemory.InMemorySource(
                elements=pickled_elements,
                start_index=0,
                end_index=3),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, side)]),
                tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                              (False, None))),
            output_tags=['out'],
            input=(0, 0),
            side_inputs=[
                maptask.WorkerSideInputSource(
                    bigquery.BigQuerySource(
                        project='project',
                        dataset='dataset',
                        table='table',
                        coder=get_bigquery_source_coder()),
                    tag='bigquery')],
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            input=(1, 0),
            output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as singleton therefore we should see
    # only the first element appended.
    self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)

def splits_to_split_response(bundles):
  """Generates a response to a custom source split request.

  Args:
    bundles: a set of bundles generated by a BoundedSource.split() invocation.

  Returns:
    a SourceOperationResponse object.
  """
  derived_sources = []
  for bundle in bundles:
    derived_source = dataflow.DerivedSource()
    derived_source.derivationMode = (
        dataflow.DerivedSource.DerivationModeValueValuesEnum.
        SOURCE_DERIVATION_MODE_INDEPENDENT)
    derived_source.source = dataflow.Source()
    derived_source.source.doesNotNeedSplitting = True
    derived_source.source.spec = dataflow.Source.SpecValue()
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=names.SERIALIZED_SOURCE_KEY,
            value=to_json_value(
                pickler.dumps(
                    (bundle.source, bundle.start_position,
                     bundle.stop_position)),
                with_type=True)))
    derived_source.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key='@type', value=to_json_value(names.SOURCE_TYPE)))
    derived_sources.append(derived_source)

  split_response = dataflow.SourceSplitResponse()
  split_response.bundles = derived_sources
  split_response.outcome = (
      dataflow.SourceSplitResponse.OutcomeValueValuesEnum.
      SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED)
  response = dataflow.SourceOperationResponse()
  response.split = split_response
  return response

def run_CombineValues(self, transform_node):
  transform = transform_node.transform
  input_tag = transform_node.inputs[0].tag
  input_step = self._cache.get_pvalue(transform_node.inputs[0])
  step = self._add_step(
      TransformNames.COMBINE, transform_node.full_label, transform_node)
  # Combiner functions do not take deferred side inputs (i.e. PValues), so
  # the code to handle extra args/kwargs is simpler than for the DoFns of the
  # ParDo transform. The last, empty argument is where side input
  # information would go.
  fn_data = (transform.fn, transform.args, transform.kwargs, ())
  step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(fn_data))
  step.add_property(
      PropertyNames.PARALLEL_INPUT,
      {'@type': 'OutputReference',
       PropertyNames.STEP_NAME: input_step.proto.name,
       PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
  # Note that the accumulator must not have a WindowedValue encoding, while
  # the output of this step does in fact have a WindowedValue encoding.
  accumulator_encoding = self._get_typehint_based_encoding(
      self._get_transform_type_hint(transform_node), window_value=False)
  output_encoding = self._get_typehint_based_encoding(
      self._get_transform_type_hint(transform_node))

  step.encoding = output_encoding
  step.add_property(PropertyNames.ENCODING, accumulator_encoding)
  # Generate description for main output 'out'.
  outputs = []
  # Add the main output to the description.
  outputs.append(
      {PropertyNames.USER_NAME: (
          '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
       PropertyNames.ENCODING: step.encoding,
       PropertyNames.OUTPUT_NAME: PropertyNames.OUT})
  step.add_property(PropertyNames.OUTPUT_INFO, outputs)

def build_split_proto(self, bounded_source, desired_bundle_size):
  split_proto = dataflow.SourceSplitRequest()
  split_proto.options = dataflow.SourceSplitOptions()
  split_proto.options.desiredBundleSizeBytes = desired_bundle_size
  source = dataflow.Source()
  spec = dataflow.Source.SpecValue()
  if bounded_source:
    spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=names.SERIALIZED_SOURCE_KEY,
            value=to_json_value({'value': pickler.dumps(bounded_source),
                                 '@type': 'http://schema.org/Text'})))
  spec.additionalProperties.append(
      dataflow.Source.SpecValue.AdditionalProperty(
          key='@type', value=to_json_value('CustomSourcesType')))
  source.spec = spec
  split_proto.source = source
  return split_proto

def test_dynamic_class(self):
  """Tests that a dynamically created class object is pickled correctly."""
  self.assertEquals(
      'Z:abc',
      loads(dumps(module_test.create_class('abc'))).get())

def test_generators(self):
  with self.assertRaises(TypeError):
    dumps((_ for _ in range(10)))

def test_object(self):
  """Tests that a class instance is pickled correctly."""
  self.assertEquals(
      ['abc', 'def'],
      loads(dumps(module_test.XYZ_OBJECT)).foo('abc def'))

def test_get_coder_can_be_pickled(self):
  coder = typecoders.registry.get_coder(typehints.Tuple[str, int])
  revived_coder = pickler.loads(pickler.dumps(coder))
  self.assertEqual(
      ('abc', 123),
      revived_coder.decode(revived_coder.encode(('abc', 123))))

def test_lambda_with_closure(self):
  """Tests that the closure of a function is preserved."""
  self.assertEquals(
      'closure: abc',
      loads(dumps(module_test.get_lambda_with_closure('abc')))())

def test_class(self):
  """Tests that a class object is pickled correctly."""
  self.assertEquals(
      ['abc', 'def'],
      loads(dumps(module_test.Xyz))().foo('abc def'))

def test_basics(self):
  self.assertEquals([1, 'a', (u'z',)], loads(dumps([1, 'a', (u'z',)])))
  fun = lambda x: 'xyz-%s' % x
  self.assertEquals('xyz-abc', loads(dumps(fun))('abc'))

def run_Read(self, transform_node):
  transform = transform_node.transform
  step = self._add_step(
      TransformNames.READ, transform_node.full_label, transform_node)
  # TODO(mairbek): refactor if-else tree to use registerable functions.
  # Initialize the source specific properties.

  if not hasattr(transform.source, 'format'):
    # If a format is not set, we assume the source to be a custom source.
    source_dict = dict()
    spec_dict = dict()

    spec_dict[names.SERIALIZED_SOURCE_KEY] = pickler.dumps(transform.source)
    spec_dict['@type'] = names.SOURCE_TYPE
    source_dict['spec'] = spec_dict
    step.add_property(PropertyNames.SOURCE_STEP_INPUT, source_dict)
  elif transform.source.format == 'text':
    step.add_property(PropertyNames.FILE_PATTERN, transform.source.path)
  elif transform.source.format == 'bigquery':
    # TODO(silviuc): Add table validation if transform.source.validate.
    if transform.source.table_reference is not None:
      step.add_property(PropertyNames.BIGQUERY_DATASET,
                        transform.source.table_reference.datasetId)
      step.add_property(PropertyNames.BIGQUERY_TABLE,
                        transform.source.table_reference.tableId)
      # If the project owning the table was not specified then the project
      # owning the workflow (current project) will be used.
      if transform.source.table_reference.projectId is not None:
        step.add_property(PropertyNames.BIGQUERY_PROJECT,
                          transform.source.table_reference.projectId)
    elif transform.source.query is not None:
      step.add_property(PropertyNames.BIGQUERY_QUERY, transform.source.query)
    else:
      raise ValueError('BigQuery source %r must specify either a table or'
                       ' a query' % transform.source)
  elif transform.source.format == 'pubsub':
    standard_options = (
        transform_node.inputs[0].pipeline.options.view_as(StandardOptions))
    if not standard_options.streaming:
      raise ValueError('PubSubSource is currently available for use only in '
                       'streaming pipelines.')
    step.add_property(PropertyNames.PUBSUB_TOPIC, transform.source.topic)
    if transform.source.subscription:
      step.add_property(PropertyNames.PUBSUB_SUBSCRIPTION,
                        transform.source.subscription)
    if transform.source.id_label:
      step.add_property(PropertyNames.PUBSUB_ID_LABEL,
                        transform.source.id_label)
  else:
    raise ValueError(
        'Source %r has unexpected format %s.' % (
            transform.source, transform.source.format))

  if not hasattr(transform.source, 'format'):
    step.add_property(PropertyNames.FORMAT, names.SOURCE_FORMAT)
  else:
    step.add_property(PropertyNames.FORMAT, transform.source.format)

  if isinstance(transform.source, iobase.BoundedSource):
    coder = transform.source.default_output_coder()
  else:
    coder = transform.source.coder

  step.encoding = self._get_cloud_encoding(coder)
  step.add_property(
      PropertyNames.OUTPUT_INFO,
      [{PropertyNames.USER_NAME: (
          '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
        PropertyNames.ENCODING: step.encoding,
        PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])

def run_ParDo(self, transform_node):
  transform = transform_node.transform
  input_tag = transform_node.inputs[0].tag
  input_step = self._cache.get_pvalue(transform_node.inputs[0])

  # Attach side inputs.
  si_dict = {}
  si_tags_and_types = []
  for side_pval in transform_node.side_inputs:
    assert isinstance(side_pval, PCollectionView)
    side_input_step = self._cache.get_pvalue(side_pval)
    si_label = side_input_step.step_name
    si_dict[si_label] = {
        '@type': 'OutputReference',
        PropertyNames.STEP_NAME: si_label,
        PropertyNames.OUTPUT_NAME: PropertyNames.OUT}
    # The label for the side input step will appear as a 'tag' property for
    # the side input source specification. Its type (singleton or iterator)
    # will also be used to read the entire source or just first element.
    si_tags_and_types.append((si_label, side_pval.__class__,
                              side_pval._view_options()))  # pylint: disable=protected-access

  # Now create the step for the ParDo transform being handled.
  step = self._add_step(
      TransformNames.DO, transform_node.full_label, transform_node,
      transform_node.transform.side_output_tags)
  fn_data = (transform.fn, transform.args, transform.kwargs,
             si_tags_and_types, transform_node.inputs[0].windowing)
  step.add_property(PropertyNames.SERIALIZED_FN, pickler.dumps(fn_data))
  step.add_property(
      PropertyNames.PARALLEL_INPUT,
      {'@type': 'OutputReference',
       PropertyNames.STEP_NAME: input_step.proto.name,
       PropertyNames.OUTPUT_NAME: input_step.get_output(input_tag)})
  # Add side inputs if any.
  step.add_property(PropertyNames.NON_PARALLEL_INPUTS, si_dict)

  # Generate description for main output and side outputs. The output names
  # will be 'out' for main output and 'out_<tag>' for a tagged output.
  # Using 'out' as a tag will not clash with the name for main since it will
  # be transformed into 'out_out' internally.
  outputs = []
  step.encoding = self._get_encoded_output_coder(transform_node)

  # Add the main output to the description.
  outputs.append(
      {PropertyNames.USER_NAME: (
          '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
       PropertyNames.ENCODING: step.encoding,
       PropertyNames.OUTPUT_NAME: PropertyNames.OUT})
  for side_tag in transform.side_output_tags:
    # The assumption here is that side outputs will have the same typehint
    # and coder as the main output. This is certainly the case right now
    # but conceivably it could change in the future.
    outputs.append(
        {PropertyNames.USER_NAME: (
            '%s.%s' % (transform_node.full_label, side_tag)),
         PropertyNames.ENCODING: step.encoding,
         PropertyNames.OUTPUT_NAME: (
             '%s_%s' % (PropertyNames.OUT, side_tag))})
  step.add_property(PropertyNames.OUTPUT_INFO, outputs)

def serialize_coder(coder):
  from google.cloud.dataflow.internal import pickler
  return '%s$%s' % (coder.__class__.__name__, pickler.dumps(coder))

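serialize_coder() prefixes the pickled coder with its class name and a '$' separator, so a matching deserializer only has to strip that prefix before unpickling. A minimal sketch, assuming nothing beyond that separator convention (the name deserialize_coder is illustrative):

def deserialize_coder(serialized):
  # Sketch only: drop the '<ClassName>$' prefix written by serialize_coder
  # and unpickle the remainder.
  from google.cloud.dataflow.internal import pickler
  return pickler.loads(serialized.split('$', 1)[1])
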