def test_no_group_by_key_directly_after_bigquery(self):
  """GroupByKey applied directly to BigQuery rows must be rejected.

  BigQuery rows are plain dicts encoded with RowAsDictJsonCoder, which is
  not a key-value coder, so the runner should raise a ValueError.
  """
  runner = DataflowRunner()
  pipeline = Pipeline(
      runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--no_auth=True'
      ]))
  rows = pipeline | beam.io.Read(
      beam.io.BigQuerySource('dataset.faketable'))
  expected_msg = (
      'Coder for the GroupByKey operation'
      '"GroupByKey" is not a key-value coder: '
      'RowAsDictJsonCoder')
  with self.assertRaises(ValueError, msg=expected_msg):
    unused_invalid = rows | beam.GroupByKey()
def test_streaming_create_translation(self):
  """A streaming Create translates to a pubsub read plus two ParDos."""
  runner = DataflowRunner()
  self.default_properties.append("--streaming")
  with Pipeline(runner, PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  job_dict = json.loads(str(runner.job))
  # Expect exactly: pubsub read of the starting signal, then two ParDos.
  self.assertEqual(len(job_dict[u'steps']), 3)
  first, second, third = job_dict[u'steps']
  self.assertEqual(first[u'kind'], u'ParallelRead')
  self.assertEqual(
      first[u'properties'][u'pubsub_subscription'], '_starting_signal/')
  self.assertEqual(second[u'kind'], u'ParallelDo')
  self.assertEqual(third[u'kind'], u'ParallelDo')
def test_streaming_engine_flag_adds_windmill_experiments(self):
  """--enable_streaming_engine adds windmill experiments, keeping others."""
  runner = DataflowRunner()
  for flag in ('--streaming',
               '--enable_streaming_engine',
               '--experiment=some_other_experiment'):
    self.default_properties.append(flag)
  with Pipeline(runner, PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  experiments = runner.job.options.view_as(DebugOptions).experiments
  # Both streaming-engine experiments must be present, and the
  # user-supplied experiment must not have been dropped.
  for expected in ('enable_streaming_engine',
                   'enable_windmill_service',
                   'some_other_experiment'):
    self.assertIn(expected, experiments)
def test_environment_override_translation(self):
  """--worker_harness_container_image overrides the docker environment."""
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  runner = DataflowRunner()
  p = Pipeline(runner, options=PipelineOptions(self.default_properties))
  # pylint: disable=expression-not-assigned
  (p
   | ptransform.Create([1, 2, 3])
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
  expected_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=beam_runner_api_pb2.DockerPayload(
          container_image='FOO').SerializeToString())
  self.assertEqual(
      list(runner.proto_pipeline.components.environments.values()),
      [expected_env])
def test_unsupported_fnapi_features(self):
  """Custom merging window fns and window types are rejected under FnAPI."""
  runner = DataflowRunner()
  self.default_properties.append('--experiment=beam_fn_api')
  self.default_properties.append('--experiment=use_runner_v2')

  # pylint: disable=expression-not-assigned
  with self.assertRaisesRegex(RuntimeError, 'Unsupported merging'):
    with Pipeline(runner,
                  options=PipelineOptions(self.default_properties)) as p:
      p | beam.Create([]) | beam.WindowInto(CustomMergingWindowFn())

  with self.assertRaisesRegex(RuntimeError, 'Unsupported window coder'):
    with Pipeline(runner,
                  options=PipelineOptions(self.default_properties)) as p:
      p | beam.Create([]) | beam.WindowInto(CustomWindowTypeWindowFn())
def test_streaming_create_translation(self):
  """Streaming Create becomes a pubsub read of the starting-signal topic.

  This variant drives job translation manually instead of Pipeline.run().
  NOTE(review): a method with this name also exists elsewhere in this file;
  confirm they belong to different test classes.
  """
  runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  runner.job = apiclient.Job(p._options)
  # Apply the runner's configured PTransform overrides before translating.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, runner).run(p)
  steps = json.loads(str(runner.job))[u'steps']
  self.assertEqual(len(steps), 2)
  self.assertEqual(steps[0][u'kind'], u'ParallelRead')
  self.assertEqual(
      steps[0][u'properties'][u'pubsub_subscription'], '_starting_signal/')
  self.assertEqual(steps[1][u'kind'], u'ParallelDo')
def test_unsupported_combinefn_fail(self):
  """CombineFns that override setup/teardown must be rejected."""

  class OverridingCombineFn(combiners.CountCombineFn):
    # Non-default setup/teardown are unsupported by this runner.
    def setup(self, *args, **kwargs):
      pass

    def teardown(self, *args, **kwargs):
      pass

  runner = DataflowRunner()
  with self.assertRaisesRegex(
      ValueError,
      'CombineFn.setup and CombineFn.teardown are not supported'):
    with beam.Pipeline(
        runner=runner,
        options=PipelineOptions(self.default_properties)) as p:
      _ = (
          p
          | beam.Create([1])
          | beam.CombineGlobally(OverridingCombineFn()))
def test_remote_runner_display_data(self):
  """Display data declared by a ParDo transform and its DoFn must appear in
  the translated Dataflow job's step properties.

  Relies on module-level SpecialParDo / SpecialDoFn defined elsewhere in
  this file.
  """
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))
  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))
  # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
  # this test.
  p.run(test_runner_api=False)
  job_dict = json.loads(str(remote_runner.job))
  # Keep only steps that actually carry display data.
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  # steps[1] is the 'Do' ParDo step (steps[0] is the Create).
  step = steps[1]
  disp_data = step['properties']['display_data']
  nspace = SpecialParDo.__module__ + '.'
  # Expected items: the transform's timestamp and class, plus the DoFn's
  # integer value; compared order-insensitively below.
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  self.assertUnhashableCountEqual(disp_data, expected_data)
def test_gbk_then_flatten_input_visitor(self):
  """The GBK and Flatten input visitors must agree on element types.

  The Dataflow runner needs GroupByKey inputs to be key-value tuples and
  Flatten inputs to match the Flatten output's type; this checks both
  visitors cooperate on a Flatten feeding a GroupByKey.
  """
  p = TestPipeline(
      runner=DataflowRunner(),
      options=PipelineOptions(self.default_properties))
  str_valued = p | 'c1' >> beam.Create({None: 'a'})
  int_valued = p | 'c2' >> beam.Create({None: 3})
  flattened = (str_valued, int_valued) | beam.Flatten()
  _ = flattened | beam.GroupByKey()
  # This may change if type inference changes, but we assert it here
  # to make sure the check below is not vacuous.
  self.assertNotIsInstance(flattened.element_type, typehints.TupleConstraint)
  p.visit(DataflowRunner.group_by_key_input_visitor())
  p.visit(DataflowRunner.flatten_input_visitor())
  # After both visitors: GBK input is a tuple type, and the Flatten inputs
  # were rewritten to match the Flatten output.
  self.assertIsInstance(flattened.element_type, typehints.TupleConstraint)
  self.assertEqual(flattened.element_type, str_valued.element_type)
  self.assertEqual(flattened.element_type, int_valued.element_type)
def _run_group_into_batches_and_get_step_properties(
    self, with_sharded_key, additional_properties):
  """Run a streaming GroupIntoBatches pipeline and return its step props.

  Args:
    with_sharded_key: use GroupIntoBatches.WithShardedKey when True.
    additional_properties: extra pipeline option strings to append.
  """
  self.default_properties.append('--streaming')
  # Avoid shadowing the builtin `property` while appending extras.
  self.default_properties.extend(additional_properties)
  runner = DataflowRunner()
  with beam.Pipeline(
      runner=runner,
      options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    keyed = p | beam.Create([('a', 1), ('a', 1), ('b', 3), ('b', 4)])
    if with_sharded_key:
      (keyed
       | beam.GroupIntoBatches.WithShardedKey(2)
       | beam.Map(lambda key_values: (key_values[0].key, key_values[1])))
      step_name = (
          u'WithShardedKey/GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)')
    else:
      keyed | beam.GroupIntoBatches(2)
      step_name = u'GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'
  return self._find_step(runner.job, step_name)['properties']
def test_resource_hints_translation(self, memory_hint):
  """Pipeline-level resource hints override transform-level hints."""
  runner = DataflowRunner()
  self.default_properties.append('--resource_hint=accelerator=some_gpu')
  self.default_properties.append(f'--resource_hint={memory_hint}=20GB')
  with beam.Pipeline(
      runner=runner,
      options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    (p
     | beam.Create([1])
     | 'MapWithHints' >> beam.Map(lambda x: x + 1).with_resource_hints(
         min_ram='10GB',
         accelerator=
         'type:nvidia-tesla-k80;count:1;install-nvidia-drivers'))
  hints = self._find_step(
      runner.job, 'MapWithHints')['properties']['resource_hints']
  # min_ram comes from the pipeline option (20GB), the accelerator from
  # the transform hint (URL-encoded by the translation).
  self.assertEqual(
      hints,
      {
          'beam:resources:min_ram_bytes:v1': '20000000000',
          'beam:resources:accelerator:v1':
              'type%3Anvidia-tesla-k80%3Bcount%3A1%3Binstall-nvidia-drivers'
      })
def _test_pack_combiners(self, pipeline_options, expect_packed):
  """Assert whether two sibling CombineGlobally steps were packed into one.

  Args:
    pipeline_options: PipelineOptions controlling combiner packing.
    expect_packed: whether the packed step (and not the unpacked pair)
      should appear in the translated job.
  """
  runner = DataflowRunner()
  with beam.Pipeline(runner=runner, options=pipeline_options) as p:
    data = p | beam.Create([10, 20, 30])
    _ = data | 'PackableMin' >> beam.CombineGlobally(min)
    _ = data | 'PackableMax' >> beam.CombineGlobally(max)

  min_step = 'PackableMin/CombinePerKey/Combine'
  max_step = 'PackableMax/CombinePerKey/Combine'
  packed_step = (
      'Packed[PackableMin/CombinePerKey, PackableMax/CombinePerKey]/Pack/'
      'CombinePerKey(SingleInputTupleCombineFn)/Combine')
  job_dict = json.loads(str(runner.job))
  step_names = {
      s[u'properties'][u'user_name'] for s in job_dict[u'steps']}
  if expect_packed:
    self.assertNotIn(min_step, step_names)
    self.assertNotIn(max_step, step_names)
    self.assertIn(packed_step, step_names)
  else:
    self.assertIn(min_step, step_names)
    self.assertIn(max_step, step_names)
    self.assertNotIn(packed_step, step_names)
def test_gbk_translation(self):
  """A GroupByKey of (int, int) pairs translates to a GroupByKey step whose
  output is a windowed (varint, stream-of-varint) pair in the global window.
  """
  runner = DataflowRunner()
  with beam.Pipeline(
      runner=runner,
      options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    p | beam.Create([(1, 2)]) | beam.GroupByKey()

  # Exact coder tree expected in the step's output_info.
  expected_output_info = [{
      "encoding": {
          "@type": "kind:windowed_value",
          "component_encodings": [{
              "@type": "kind:pair",
              "component_encodings": [{
                  "@type": "kind:varint"
              }, {
                  "@type": "kind:stream",
                  "component_encodings": [{
                      "@type": "kind:varint"
                  }],
                  "is_stream_like": True
              }],
              "is_pair_like": True
          }, {
              "@type": "kind:global_window"
          }],
          "is_wrapper": True
      },
      "output_name": "out",
      "user_name": "GroupByKey.out"
  }]  # yapf: disable
  gbk_step = self._find_step(runner.job, u'GroupByKey')
  self.assertEqual(gbk_step[u'kind'], u'GroupByKey')
  self.assertEqual(
      gbk_step[u'properties']['output_info'], expected_output_info)
def test_get_default_gcp_region_ignores_error(
    self, patched_environ, patched_processes):
  """Errors while probing for a default region yield None, not a raise."""
  self.assertIsNone(DataflowRunner().get_default_gcp_region())
def test_get_default_gcp_region_from_gcloud(
    self, patched_environ, patched_processes):
  """The region reported by gcloud is returned when available."""
  self.assertEqual(
      DataflowRunner().get_default_gcp_region(), 'some-region2')
def test_get_default_gcp_region_no_default_returns_none(
    self, patched_environ, patched_processes):
  """When neither environment nor gcloud supply a region, return None."""
  self.assertIsNone(DataflowRunner().get_default_gcp_region())
# Per-window mean of values per key.
# NOTE(review): `windowed_data` and `windowed_sum` are defined earlier in the
# file (outside this fragment) — presumably keyed (id, steps) PCollections;
# confirm against the upstream transforms.
windowed_avg = (windowed_data
    | "avg1" >> beam.CombinePerKey(beam.combiners.MeanCombineFn())
)


class PrintWindowResults(beam.DoFn):
    """Pass-through DoFn: despite the name, it re-emits each element
    unchanged and prints nothing."""
    def process(self, element, window=beam.DoFn.WindowParam):
        new_element = element
        yield new_element


# Serialize per-window sums as JSON byte strings and publish to Pub/Sub.
(
    windowed_sum
    | "sum4" >> beam.ParDo(PrintWindowResults())
    | "sum5" >> beam.Map(lambda st: '{{"id": {}, "total_steps": {}}}'.format(st[0],st[1]))
    | "sum6" >> beam.Map(lambda z: bytes(z, "utf-8"))
    | "sum7" >> beam.io.WriteToPubSub(topic="projects/data228/topics/data228-hw8-out")
)

# Serialize per-window averages the same way, to the same topic.
(
    windowed_avg
    | "avg4" >> beam.ParDo(PrintWindowResults())
    | "avg5" >> beam.Map(lambda av: '{{"id": {}, "average_steps": {}}}'.format(av[0],av[1]))
    | "avg6" >> beam.Map(lambda po: bytes(po, "utf-8"))
    | "avg7" >> beam.io.WriteToPubSub(topic="projects/data228/topics/data228-hw8-out")
)

# Launch on Dataflow; `pipeline` and `options` come from earlier in the file.
DataflowRunner().run_pipeline(pipeline, options=options)
def test_remote_runner_display_data(self):
  """Display data declared by a locally-defined ParDo subclass and its DoFn
  must survive translation into the Dataflow job's step properties."""
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    # Make this a list to be accessible within closure
    def display_data(self):
      return {
          'asubcomponent': self.fn,
          'a_class': SpecialParDo,
          'a_time': self.now
      }

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))
  p.run()
  job_dict = json.loads(str(remote_runner.job))
  # Keep only steps that actually carry display data.
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  # steps[1] is the 'Do' ParDo step (steps[0] is the Create).
  step = steps[1]
  disp_data = step['properties']['display_data']
  # Sort both sides by namespace+key so the comparison ignores ordering.
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  expected_data = sorted(
      expected_data, key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)