def test_full_completion(self):
  # Create dummy file and close it. Note that we need to do this because
  # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
  # before the temporary file is closed.
  dummy_file = tempfile.NamedTemporaryFile(delete=False)
  dummy_file_name = dummy_file.name
  dummy_file.close()

  dummy_dir = tempfile.mkdtemp()

  remote_runner = DataflowRunner()
  pipeline = Pipeline(remote_runner,
                      options=PipelineOptions([
                          '--dataflow_endpoint=ignored',
                          '--sdk_location=' + dummy_file_name,
                          '--job_name=test-job',
                          '--project=test-project',
                          '--staging_location=' + dummy_dir,
                          '--temp_location=/dev/null',
                          '--template_location=' + dummy_file_name,
                          '--no_auth=True']))

  pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  pipeline.run().wait_until_finish()
  with open(dummy_file_name) as template_file:
    saved_job_dict = json.load(template_file)
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']
        ['options']['project'], 'test-project')
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']
        ['options']['job_name'], 'test-job')
def test_direct_runner_metrics(self):
  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      gauge = Metrics.gauge(self.__class__, 'latest_element')
      gauge.set(element)
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  runner = DirectRunner()
  p = Pipeline(runner,
               options=PipelineOptions(self.default_properties))
  pcoll = (p | ptransform.Create([1, 2, 3, 4, 5])
           | 'Do' >> beam.ParDo(MyDoFn()))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

  gauge_result = metrics['gauges'][0]
  hc.assert_that(
      gauge_result.key,
      hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
  hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
  hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
def test_biqquery_read_streaming_fail(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
  with self.assertRaisesRegexp(ValueError,
                               r'source is not currently available'):
    p.run()
def test_remote_runner_translation(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
def run(self, transform, options=None):
  """Run the given transform with this runner."""
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  p | transform
  return p.run()
def test_parent_pointer(self):
  class MyPTransform(beam.PTransform):
    def expand(self, p):
      self.p = p
      return p | beam.Create([None])

  p = beam.Pipeline()
  p | MyPTransform()  # pylint: disable=expression-not-assigned
  p = Pipeline.from_runner_api(Pipeline.to_runner_api(p), None, None)
  self.assertIsNotNone(p.transforms_stack[0].parts[0].parent)
  self.assertEquals(p.transforms_stack[0].parts[0].parent,
                    p.transforms_stack[0])
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now  # Make this a list to be accessible within closure

    def display_data(self):
      return {'asubcomponent': self.fn,
              'a_class': SpecialParDo,
              'a_time': self.now}

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  p.run()
  job_dict = json.loads(str(remote_runner.job))
  steps = [step
           for step in job_dict['steps']
           if len(step['properties'].get('display_data', [])) > 0]
  step = steps[1]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{'type': 'TIMESTAMP',
                    'namespace': nspace + 'SpecialParDo',
                    'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                    'key': 'a_time'},
                   {'type': 'STRING',
                    'namespace': nspace + 'SpecialParDo',
                    'value': nspace + 'SpecialParDo',
                    'key': 'a_class',
                    'shortValue': 'SpecialParDo'},
                   {'type': 'INTEGER',
                    'namespace': nspace + 'SpecialDoFn',
                    'value': 42,
                    'key': 'dofn_value'}]
  expected_data = sorted(expected_data,
                         key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)
def run(self, transform, options=None):
  """Run the given transform or callable with this runner."""
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
def test_reuse_custom_transform_instance(self):
  pipeline = Pipeline()
  pcoll1 = pipeline | 'pcoll1' >> Create([1, 2, 3])
  pcoll2 = pipeline | 'pcoll2' >> Create([4, 5, 6])
  transform = PipelineTest.CustomTransform()
  pcoll1 | transform
  with self.assertRaises(RuntimeError) as cm:
    pipeline.apply(transform, pcoll2)
  self.assertEqual(
      cm.exception.args[0],
      'Transform "CustomTransform" does not have a stable unique label. '
      'This will prevent updating of pipelines. '
      'To apply a transform with a specified label write '
      'pvalue | "label" >> transform')
def setUp(self):
  self.pipeline = Pipeline(DirectRunner())
  self.visitor = ConsumerTrackingPipelineVisitor()
  try:  # Python 2
    self.assertCountEqual = self.assertItemsEqual
  except AttributeError:  # Python 3
    pass
def test_bad_path(self):
  dummy_sdk_file = tempfile.NamedTemporaryFile()
  remote_runner = DataflowRunner()
  pipeline = Pipeline(remote_runner,
                      options=PipelineOptions([
                          '--dataflow_endpoint=ignored',
                          '--sdk_location=' + dummy_sdk_file.name,
                          '--job_name=test-job',
                          '--project=test-project',
                          '--staging_location=ignored',
                          '--temp_location=/dev/null',
                          '--template_location=/bad/path',
                          '--no_auth=True']))
  remote_runner.job = apiclient.Job(pipeline._options)
  with self.assertRaises(IOError):
    pipeline.run().wait_until_finish()
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  remote_runner.job = apiclient.Job(p._options)
  # Performing configured PTransform overrides here.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
def run(self, request, context):
  job_id = uuid.uuid4().get_hex()
  pipeline_result = Pipeline.from_runner_api(
      request.pipeline, 'DirectRunner', PipelineOptions()).run()
  self.jobs[job_id] = pipeline_result
  return beam_job_api_pb2.SubmitJobResponse(jobId=job_id)
def test_visit_entire_graph(self):
  pipeline = Pipeline()
  pcoll1 = pipeline | 'pcoll' >> Create([1, 2, 3])
  pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
  pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
  pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
  transform = PipelineTest.CustomTransform()
  pcoll5 = pcoll4 | transform

  visitor = PipelineTest.Visitor(visited=[])
  pipeline.visit(visitor)
  self.assertEqual(set([pcoll1, pcoll2, pcoll3, pcoll4, pcoll5]),
                   set(visitor.visited))
  self.assertEqual(set(visitor.enter_composite),
                   set(visitor.leave_composite))
  self.assertEqual(3, len(visitor.enter_composite))
  self.assertEqual(visitor.enter_composite[2].transform, transform)
  self.assertEqual(visitor.leave_composite[1].transform, transform)
def run_async(self, transform, options=None):
  """Run the given transform or callable with this runner.

  May return immediately, executing the pipeline in the background.
  The returned result object can be queried for progress, and
  `wait_until_finish` may be called to block until completion.
  """
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()
def test_use_fastavro_experiment_is_added_on_py3_and_onwards(self):
  remote_runner = DataflowRunner()

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  self.assertEqual(
      sys.version_info[0] > 2,
      remote_runner.job.options.view_as(DebugOptions).lookup_experiment(
          'use_fastavro', False))
def test_biqquery_read_fn_api_fail(self):
  remote_runner = DataflowRunner()
  for flag in ['beam_fn_api', 'use_unified_worker', 'use_runner_v2']:
    self.default_properties.append("--experiments=%s" % flag)
  with self.assertRaisesRegex(
      ValueError,
      'The Read.BigQuerySource.*is not supported.*'
      'apache_beam.io.gcp.bigquery.ReadFromBigQuery.*'):
    with Pipeline(remote_runner,
                  PipelineOptions(self.default_properties)) as p:
      _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
def test_min_cpu_platform_flag_is_propagated_to_experiments(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--min_cpu_platform=Intel Haswell')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  self.assertIn(
      'min_cpu_platform=Intel Haswell',
      remote_runner.job.options.view_as(DebugOptions).experiments)
def test_pipeline_sdk_not_overridden(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(
          beam_runner_api_pb2.DockerPayload(
              container_image='dummy_prefix/dummy_name:dummy_tag')
      ).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, dict(), pipeline_options)
  self.assertIsNotNone(2, len(proto_pipeline.components.environments))

  from apache_beam.utils import proto_utils
  found_override = False
  for env in proto_pipeline.components.environments.values():
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)
    if docker_payload.container_image.startswith(
        names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
      found_override = True

  self.assertFalse(found_override)
def test_environment_override_translation(self):
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))
  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
  self.assertEqual(
      list(remote_runner.proto_pipeline.components.environments.values()),
      [
          beam_runner_api_pb2.Environment(
              urn=common_urns.environments.DOCKER.urn,
              payload=beam_runner_api_pb2.DockerPayload(
                  container_image='FOO').SerializeToString())
      ])
def test_remote_runner_translation(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)
def test_upload_graph_experiment(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=upload_graph')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('upload_graph', experiments_for_job)
def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=use_avro')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  debug_options = remote_runner.job.options.view_as(DebugOptions)
  self.assertFalse(debug_options.lookup_experiment('use_fastavro', False))
def test_unsupported_fnapi_features(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=beam_fn_api')
  self.default_properties.append('--experiment=use_runner_v2')

  with self.assertRaisesRegex(RuntimeError, 'Unsupported merging'):
    with Pipeline(remote_runner,
                  options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create([]) | beam.WindowInto(CustomMergingWindowFn())

  with self.assertRaisesRegex(RuntimeError, 'Unsupported window coder'):
    with Pipeline(remote_runner,
                  options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create([]) | beam.WindowInto(CustomWindowTypeWindowFn())
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))
  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  # TODO(BEAM-366) Enable runner API on this test.
  p.run(test_runner_api=False)
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  self.assertUnhashableCountEqual(disp_data, expected_data)
def test_simple(self):
  """Tests serializing, deserializing, and running a simple pipeline.

  More extensive tests are done at pipeline.run for each suitable test.
  """
  p = beam.Pipeline()
  p | beam.Create([None]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  proto = p.to_runner_api()

  p2 = Pipeline.from_runner_api(proto, p.runner, p.options)
  p2.run()
def test_simple(self):
  """Tests serializing, deserializing, and running a simple pipeline.

  More extensive tests are done at pipeline.run for each suitable test.
  """
  p = beam.Pipeline()
  p | beam.Create([None]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  proto = p.to_runner_api()

  p2 = Pipeline.from_runner_api(proto, p.runner, p._options)
  p2.run()
def test_transform_ids(self):
  class MyPTransform(beam.PTransform):
    def expand(self, p):
      self.p = p
      return p | beam.Create([None])

  p = beam.Pipeline()
  p | MyPTransform()  # pylint: disable=expression-not-assigned
  runner_api_proto = Pipeline.to_runner_api(p)

  for transform_id in runner_api_proto.components.transforms:
    self.assertRegex(transform_id, r'[a-zA-Z0-9-_]+')
def test_dataflow_worker_jar_flag_adds_use_staged_worker_jar_experiment(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=beam_fn_api')
  self.default_properties.append('--dataflow_worker_jar=test.jar')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('beam_fn_api', experiments_for_job)
  self.assertIn('use_staged_dataflow_worker_jar', experiments_for_job)
def test_ptransform_override_replacement_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return (isinstance(applied_ptransform.transform, ParDo) and
              isinstance(applied_ptransform.transform.fn,
                         AddWithProductDoFn))

    def get_replacement_transform(self, transform):
      return AddThenMultiply()

    def get_replacement_inputs(self, applied_ptransform):
      assert len(applied_ptransform.inputs) == 1
      assert len(applied_ptransform.side_inputs) == 2
      # Swap the order of the two side inputs
      return (applied_ptransform.inputs[0],
              applied_ptransform.side_inputs[1].pvalue,
              applied_ptransform.side_inputs[0].pvalue)

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([2])
  pcoll2 = p | 'pc2' >> beam.Create([3])
  pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
  result = pcoll3 | 'Operate' >> beam.ParDo(
      AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
  assert_that(result, equal_to([14, 16, 18]))

  p.replace_all([MyParDoOverride()])
  p.run()
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now  # Make this a list to be accessible within closure

    def display_data(self):
      return {'asubcomponent': self.fn,
              'a_class': SpecialParDo,
              'a_time': self.now}

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  steps = [step
           for step in job_dict['steps']
           if len(step['properties'].get('display_data', [])) > 0]
  step = steps[0]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{'type': 'TIMESTAMP',
                    'namespace': nspace + 'SpecialParDo',
                    'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                    'key': 'a_time'},
                   {'type': 'STRING',
                    'namespace': nspace + 'SpecialParDo',
                    'value': nspace + 'SpecialParDo',
                    'key': 'a_class',
                    'shortValue': 'SpecialParDo'},
                   {'type': 'INTEGER',
                    'namespace': nspace + 'SpecialDoFn',
                    'value': 42,
                    'key': 'dofn_value'}]
  expected_data = sorted(expected_data,
                         key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)
def test_sdk_harness_container_image_overrides(self):
  if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
    _LOGGER.warning(
        'Skipping test \'test_sdk_harness_container_image_overrides\' since '
        'Dataflow API WorkerPool does not have attribute '
        '\'sdkHarnessContainerImages\'')
    return
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--project',
      'dummy_project',
      '--sdk_harness_container_image_overrides',
      '.*dummy.*,new_dummy_container_image',
  ])

  pipeline = Pipeline(options=pipeline_options)
  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

  # Accessing non-public method for testing.
  dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

  self.assertIsNotNone(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)
  # Container image should be overridden by the given override.
  self.assertEqual(
      docker_payload.container_image, 'new_dummy_container_image')
def run_async(
    self,
    transform,  # type: PTransform
    options=None  # type: Optional[PipelineOptions]
):
  # type: (...) -> PipelineResult

  """Run the given transform or callable with this runner.

  May return immediately, executing the pipeline in the background.
  The returned result object can be queried for progress, and
  `wait_until_finish` may be called to block until completion.
  """
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 3)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
  self.assertEqual(job_dict[u'steps'][2][u'kind'], u'ParallelDo')
def test_streaming_engine_flag_adds_windmill_experiments(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--streaming')
  self.default_properties.append('--enable_streaming_engine')
  self.default_properties.append('--experiment=some_other_experiment')

  with Pipeline(remote_runner,
                PipelineOptions(self.default_properties)) as p:
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('enable_streaming_engine', experiments_for_job)
  self.assertIn('enable_windmill_service', experiments_for_job)
  self.assertIn('some_other_experiment', experiments_for_job)
def test_no_group_by_key_directly_after_bigquery(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions([
                   '--dataflow_endpoint=ignored',
                   '--job_name=test-job',
                   '--project=test-project',
                   '--staging_location=ignored',
                   '--temp_location=/dev/null',
                   '--no_auth'
               ]))
  rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
  with self.assertRaises(ValueError,
                         msg=('Coder for the GroupByKey operation'
                              '"GroupByKey" is not a key-value coder: '
                              'RowAsDictJsonCoder')):
    unused_invalid = rows | beam.GroupByKey()
def test_after_count(self):
  p = Pipeline('DirectRunner')
  result = (p
            | beam.Create([1, 2, 3, 4, 5, 10, 11])
            | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
            | beam.Map(lambda (k, t): TimestampedValue((k, t), t))
            | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                              accumulation_mode=AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.Map(lambda (k, v): ('%s-%s' % (k, len(v)), set(v))))
  assert_that(result, equal_to(
      {
          'A-5': {1, 2, 3, 4, 5},
          # A-10, A-11 never emitted due to AfterCount(3) never firing.
          'B-4': {6, 7, 8, 9},
          'B-3': {10, 15, 16},
      }.iteritems()))
def test_pcollectionview_not_recreated(self):
  pipeline = Pipeline('DirectRunner')
  value = pipeline | 'create1' >> Create([1, 2, 3])
  value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
  value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
  self.assertEqual(AsSingleton(value), AsSingleton(value))
  self.assertEqual(AsSingleton('new', value, default_value=1),
                   AsSingleton('new', value, default_value=1))
  self.assertNotEqual(AsSingleton(value),
                      AsSingleton('new', value, default_value=1))
  self.assertEqual(AsIter(value), AsIter(value))
  self.assertEqual(AsList(value), AsList(value))
  self.assertEqual(AsDict(value2), AsDict(value2))

  self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
  self.assertNotEqual(AsIter(value), AsIter(value2))
  self.assertNotEqual(AsList(value), AsList(value2))
  self.assertNotEqual(AsDict(value2), AsDict(value3))
def test_ptransform_overrides(self, file_system_override_mock):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  def get_overrides(unused_pipeline_options):
    return [MyParDoOverride()]

  file_system_override_mock.side_effect = get_overrides

  # Specify DirectRunner as it's the one patched above.
  with Pipeline(runner='BundleBasedDirectRunner') as p:
    pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
    assert_that(pcoll, equal_to([3, 6, 9]))
def test_ptransform_overrides(self):
  def my_par_do_matcher(applied_ptransform):
    return isinstance(applied_ptransform.transform, DoubleParDo)

  class MyParDoOverride(PTransformOverride):
    def get_matcher(self):
      return my_par_do_matcher

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r', ptransform)

  # Using following private variable for testing.
  DirectRunner._PTRANSFORM_OVERRIDES.append(MyParDoOverride())
  with Pipeline() as p:
    pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
    assert_that(pcoll, equal_to([3, 6, 9]))
def test_sdk_harness_container_image_overrides(self):
  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, {'.*dummy.*': 'new_dummy_container_image'})

  self.assertIsNotNone(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)
  # Container image should be overridden by the given override.
  self.assertEqual(
      docker_payload.container_image, 'new_dummy_container_image')
def test_dataflow_container_image_override(self):
  test_environment = DockerEnvironment(
      container_image='apache/beam_java11_sdk:x.yz.0')
  proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, dict())

  self.assertIsNotNone(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)
  # Container image should be overridden by the given override.
  self.assertTrue(
      docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY))
def test_visitor_not_sorted(self):
  p = Pipeline()
  # pylint: disable=expression-not-assigned
  from apache_beam.testing.test_stream import TestStream
  p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

  original_graph = p.to_runner_api(return_context=False)
  out_of_order_graph = p.to_runner_api(return_context=False)

  root_id = out_of_order_graph.root_transform_ids[0]
  root = out_of_order_graph.components.transforms[root_id]
  tmp = root.subtransforms[0]
  root.subtransforms[0] = root.subtransforms[1]
  root.subtransforms[1] = tmp

  p = beam.Pipeline().from_runner_api(
      out_of_order_graph, runner='BundleBasedDirectRunner', options=None)
  v_out_of_order = ConsumerTrackingPipelineVisitor()
  p.visit(v_out_of_order)

  p = beam.Pipeline().from_runner_api(
      original_graph, runner='BundleBasedDirectRunner', options=None)
  v_original = ConsumerTrackingPipelineVisitor()
  p.visit(v_original)

  # Convert to string to assert they are equal.
  out_of_order_labels = {
      str(k): [str(t) for t in v_out_of_order.value_to_consumers[k]]
      for k in v_out_of_order.value_to_consumers
  }

  original_labels = {
      str(k): [str(t) for t in v_original.value_to_consumers[k]]
      for k in v_original.value_to_consumers
  }
  self.assertDictEqual(out_of_order_labels, original_labels)
def test_ptransform_override_multiple_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, FlattenAndDouble)

    def get_replacement_transform(self, applied_ptransform):
      return FlattenAndTriple()

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([1, 2, 3])
  pcoll2 = p | 'pc2' >> beam.Create([4, 5, 6])
  pcoll3 = (pcoll1, pcoll2) | 'FlattenAndMultiply' >> FlattenAndDouble()
  assert_that(pcoll3, equal_to([3, 6, 9, 12, 15, 18]))

  p.replace_all([MyParDoOverride()])
  p.run()
def test_ptransform_overrides(self, file_system_override_mock):
  def my_par_do_matcher(applied_ptransform):
    return isinstance(applied_ptransform.transform, DoubleParDo)

  class MyParDoOverride(PTransformOverride):
    def get_matcher(self):
      return my_par_do_matcher

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r', ptransform)

  def get_overrides():
    return [MyParDoOverride()]

  file_system_override_mock.side_effect = get_overrides

  with Pipeline() as p:
    pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
    assert_that(pcoll, equal_to([3, 6, 9]))
def test_ptransform_overrides(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  p = Pipeline()
  pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
  assert_that(pcoll, equal_to([3, 6, 9]))

  p.replace_all([MyParDoOverride()])
  p.run()
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):

  def setUp(self):
    self.pipeline = Pipeline(DirectRunner())
    self.visitor = ConsumerTrackingPipelineVisitor()

  def test_root_transforms(self):
    class DummySource(iobase.BoundedSource):
      pass

    root_read = Read(DummySource())
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])

    self.assertEqual(root_transforms, sorted([root_read, root_flatten]))

    pbegin_consumers = sorted(
        [c.transform for c in self.visitor.value_to_consumers[pbegin]])
    self.assertEqual(pbegin_consumers, sorted([root_read]))
    self.assertEqual(len(self.visitor.step_names), 3)

  def test_side_inputs(self):
    class SplitNumbersFn(DoFn):
      def process(self, element):
        if element < 0:
          yield pvalue.OutputValue('tag_negative', element)
        else:
          yield element

    class ProcessNumbersFn(DoFn):
      def process(self, element, negatives):
        yield element

    class DummySource(iobase.BoundedSource):
      pass

    root_read = Read(DummySource())

    result = (self.pipeline
              | 'read' >> root_read
              | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                     main='positive'))
    positive, negative = result
    positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])
    self.assertEqual(root_transforms, sorted([root_read]))
    self.assertEqual(len(self.visitor.step_names), 3)
    self.assertEqual(len(self.visitor.views), 1)
    self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))

  def test_co_group_by_key(self):
    emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
    phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
    {'emails': emails, 'phones': phones} | CoGroupByKey()

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])
    self.assertEqual(len(root_transforms), 2)
    self.assertGreater(
        len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
    self.assertEqual(len(self.visitor.views), 0)
def setUp(self):
  self.pipeline = Pipeline(DirectRunner())
  self.visitor = ConsumerTrackingPipelineVisitor()