def test_labels(self):
  pipeline_options = PipelineOptions([
      '--project', 'test_project', '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp'
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  self.assertIsNone(job.proto.labels)

  pipeline_options = PipelineOptions([
      '--project', 'test_project', '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--label', 'key1=value1',
      '--label', 'key2',
      '--label', 'key3=value3',
      '--labels', 'key4=value4',
      '--labels', 'key5'
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  self.assertEqual(5, len(job.proto.labels.additionalProperties))
  self.assertEqual('key1', job.proto.labels.additionalProperties[0].key)
  self.assertEqual('value1', job.proto.labels.additionalProperties[0].value)
  self.assertEqual('key2', job.proto.labels.additionalProperties[1].key)
  self.assertEqual('', job.proto.labels.additionalProperties[1].value)
  self.assertEqual('key3', job.proto.labels.additionalProperties[2].key)
  self.assertEqual('value3', job.proto.labels.additionalProperties[2].value)
  self.assertEqual('key4', job.proto.labels.additionalProperties[3].key)
  self.assertEqual('value4', job.proto.labels.additionalProperties[3].value)
  self.assertEqual('key5', job.proto.labels.additionalProperties[4].key)
  self.assertEqual('', job.proto.labels.additionalProperties[4].value)
def run(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient self.job = apiclient.Job(pipeline.options) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run(pipeline) standard_options = pipeline.options.view_as(StandardOptions) if standard_options.streaming: job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION else: job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline.options, job_version) # Create the job result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result) result.metric_results = self._metrics return result
def test_graph_is_uploaded(self):
  pipeline_options = PipelineOptions([
      '--project', 'test_project', '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--experiments', 'beam_fn_api',
      '--experiments', 'upload_graph'
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  client = apiclient.DataflowApplicationClient(pipeline_options)
  with mock.patch.object(client, 'stage_file', side_effect=None):
    with mock.patch.object(client, 'create_job_description',
                           side_effect=None):
      with mock.patch.object(client, 'submit_job_description',
                             side_effect=None):
        client.create_job(job)
        client.stage_file.assert_called_once_with(
            mock.ANY, "dataflow_graph.json", mock.ANY)
        client.create_job_description.assert_called_once()
def test_translate_portable_job_step_name(self):
  mock_client, mock_job_result = self.setup_mock_client_result(
      self.ONLY_COUNTERS_LIST)

  pipeline_options = PipelineOptions([
      '--experiments=use_runner_v2',
      '--experiments=use_portable_job_submission',
      '--temp_location=gs://any-location/temp',
      '--project=dummy_project',
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  job = apiclient.Job(pipeline_options, proto_pipeline)
  dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result, job)
  self.assertEqual(
      'MyTestParDo',
      dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
def test_create_job_returns_existing_job(self):
  # Submitting a job whose name collides with an already-running job should
  # surface the existing job via DataflowJobAlreadyExistsError.
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string

  pipeline_options.view_as(GoogleCloudOptions).no_auth = True
  client = apiclient.DataflowApplicationClient(pipeline_options)

  response = dataflow.Job()
  # different clientRequestId from `job`
  response.clientRequestId = "20210821081910123456-1234"
  response.name = 'test_job_name'
  response.id = '2021-08-19_21_18_43-9756917246311111021'

  with mock.patch.object(client._client.projects_locations_jobs, 'Create',
                         side_effect=[response]):
    with mock.patch.object(client, 'create_job_description',
                           side_effect=None):
      with self.assertRaises(
          apiclient.DataflowJobAlreadyExistsError) as context:
        client.create_job(job)

  self.assertEqual(
      str(context.exception),
      'There is already active job named %s with id: %s. If you want to '
      'submit a second job, try again by setting a different name using '
      '--job_name.' % ('test_job_name', response.id))
def test_transform_name_mapping(self, mock_job):
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--update',
      '--transform_name_mapping', '{\"from\":\"to\"}'
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  self.assertIsNotNone(job.proto.transformNameMapping)
def test_created_from_snapshot_id(self):
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--create_from_snapshot', 'test_snapshot_id'
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  self.assertEqual('test_snapshot_id', job.proto.createdFromSnapshotId)
def run_pipeline(self, pipeline):
  """Remotely executes entire pipeline or parts reachable from node."""
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
  except ImportError:
    raise ImportError(
        'Google Cloud Dataflow runner not available, '
        'please install apache_beam[gcp]')

  # Snapshot the pipeline in a portable proto before mutating it
  proto_pipeline, self.proto_context = pipeline.to_runner_api(
      return_context=True)

  # Performing configured PTransform overrides.
  pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

  # Add setup_options for all the BeamPlugin imports
  setup_options = pipeline._options.view_as(SetupOptions)
  plugins = BeamPlugin.get_all_plugin_paths()
  if setup_options.beam_plugins is not None:
    plugins = list(set(plugins + setup_options.beam_plugins))
  setup_options.beam_plugins = plugins

  self.job = apiclient.Job(pipeline._options, proto_pipeline)

  # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
  # here.
  pipeline.visit(self.group_by_key_input_visitor())

  # Dataflow runner requires output type of the Flatten to be the same as the
  # inputs, hence we enforce that here.
  pipeline.visit(self.flatten_input_visitor())

  # The superclass's run will trigger a traversal of all reachable nodes.
  super(DataflowRunner, self).run_pipeline(pipeline)

  test_options = pipeline._options.view_as(TestOptions)
  # If it is a dry run, return without submitting the job.
  if test_options.dry_run:
    return None

  # Get a Dataflow API client and set its options
  self.dataflow_client = apiclient.DataflowApplicationClient(
      pipeline._options)

  # Create the job description and send a request to the service. The result
  # can be None if there is no need to send a request to the service (e.g.
  # template creation). If a request was sent and failed then the call will
  # raise an exception.
  result = DataflowPipelineResult(
      self.dataflow_client.create_job(self.job), self)

  self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
  result.metric_results = self._metrics
  return result
def run(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError( 'Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') # Performing configured PTransform overrides. pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES) # Add setup_options for all the BeamPlugin imports setup_options = pipeline._options.view_as(SetupOptions) plugins = BeamPlugin.get_all_plugin_paths() if setup_options.beam_plugins is not None: plugins = list(set(plugins + setup_options.beam_plugins)) setup_options.beam_plugins = plugins self.job = apiclient.Job(pipeline._options) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run(pipeline) test_options = pipeline._options.view_as(TestOptions) # If it is a dry run, return without submitting the job. if test_options.dry_run: return None standard_options = pipeline._options.view_as(StandardOptions) if standard_options.streaming: job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION else: job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options, job_version) # Create the job result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result
def test_remote_runner_translation(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    # Make this a list to be accessible within closure
    def display_data(self):
      return {'asubcomponent': self.fn,
              'a_class': SpecialParDo,
              'a_time': self.now}

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  remote_runner.job = apiclient.Job(p.options)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  steps = [step
           for step in job_dict['steps']
           if len(step['properties'].get('display_data', [])) > 0]
  step = steps[1]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{'type': 'TIMESTAMP',
                    'namespace': nspace + 'SpecialParDo',
                    'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                    'key': 'a_time'},
                   {'type': 'STRING',
                    'namespace': nspace + 'SpecialParDo',
                    'value': nspace + 'SpecialParDo',
                    'key': 'a_class',
                    'shortValue': 'SpecialParDo'},
                   {'type': 'INTEGER',
                    'namespace': nspace + 'SpecialDoFn',
                    'value': 42,
                    'key': 'dofn_value'}]
  expected_data = sorted(expected_data,
                         key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)
def test_bad_path(self):
  dummy_sdk_file = tempfile.NamedTemporaryFile()
  remote_runner = DataflowRunner()
  pipeline = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--sdk_location=' + dummy_sdk_file.name,
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--template_location=/bad/path',
          '--no_auth=True'
      ]))
  remote_runner.job = apiclient.Job(pipeline._options)

  with self.assertRaises(IOError):
    pipeline.run().wait_until_finish()
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  remote_runner.job = apiclient.Job(p._options)
  # Performing configured PTransform overrides here.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
def test_update_job_returns_existing_job(self):
  # Updating a job that the service reports as already replaced should raise
  # DataflowJobAlreadyExistsError naming the replacement job id.
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--region', 'us-central1',
      '--update',
  ])
  replace_job_id = '2021-08-21_00_00_01-6081497447916622336'
  with mock.patch('apache_beam.runners.dataflow.internal.apiclient.Job.'
                  'job_id_for_name',
                  return_value=replace_job_id) as job_id_for_name_mock:
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  job_id_for_name_mock.assert_called_once()
  self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string

  pipeline_options.view_as(GoogleCloudOptions).no_auth = True
  client = apiclient.DataflowApplicationClient(pipeline_options)

  response = dataflow.Job()
  # different clientRequestId from `job`
  response.clientRequestId = "20210821083254123456-1234"
  response.name = 'test_job_name'
  response.id = '2021-08-19_21_29_07-5725551945600207770'

  with mock.patch.object(client, 'create_job_description', side_effect=None):
    with mock.patch.object(client._client.projects_locations_jobs, 'Create',
                           side_effect=[response]):
      with self.assertRaises(
          apiclient.DataflowJobAlreadyExistsError) as context:
        client.create_job(job)

  self.assertEqual(
      str(context.exception),
      'The job named %s with id: %s has already been updated into job '
      'id: %s and cannot be updated again.' %
      ('test_job_name', replace_job_id, response.id))
def test_template_file_generation_with_upload_graph(self):
  # With the upload_graph experiment and a template_location set, the graph
  # is staged separately and referenced via stepsLocation; the template omits
  # inline steps and no job is submitted to the service.
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--experiments', 'upload_graph',
      '--template_location', 'gs://test-location/template'
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  job.proto.steps.append(dataflow.Step(name='test_step_name'))

  pipeline_options.view_as(GoogleCloudOptions).no_auth = True
  client = apiclient.DataflowApplicationClient(pipeline_options)
  with mock.patch.object(client, 'stage_file', side_effect=None):
    with mock.patch.object(client, 'create_job_description',
                           side_effect=None):
      with mock.patch.object(client, 'submit_job_description',
                             side_effect=None):
        client.create_job(job)
        client.stage_file.assert_has_calls([
            mock.call(mock.ANY, 'dataflow_graph.json', mock.ANY),
            mock.call(mock.ANY, 'template', mock.ANY)
        ])
        client.create_job_description.assert_called_once()
        # template is generated, but job should not be submitted to the
        # service.
        client.submit_job_description.assert_not_called()

        template_filename = client.stage_file.call_args_list[-1][0][1]
        self.assertTrue('template' in template_filename)
        template_content = (
            client.stage_file.call_args_list[-1][0][2].read().decode('utf-8'))

        template_obj = json.loads(template_content)
        self.assertFalse(template_obj.get('steps'))
        self.assertTrue(template_obj['stepsLocation'])
def run(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError( 'Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') self.job = apiclient.Job(pipeline._options) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run(pipeline) standard_options = pipeline._options.view_as(StandardOptions) if standard_options.streaming: job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION else: job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options, job_version) # Create the job result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result