def test_create_application_client(self):
  pipeline_options = PipelineOptions()
  apiclient.DataflowApplicationClient(pipeline_options)
def test_stage_resources(self):
  pipeline_options = PipelineOptions([
      '--temp_location',
      'gs://test-location/temp',
      '--staging_location',
      'gs://test-location/staging',
      '--no_auth'
  ])
  pipeline = beam_runner_api_pb2.Pipeline(
      components=beam_runner_api_pb2.Components(
          environments={
              'env1': beam_runner_api_pb2.Environment(
                  dependencies=[
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.FILE.urn,
                          type_payload=beam_runner_api_pb2.ArtifactFilePayload(
                              path='/tmp/foo1').SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='foo1').SerializeToString()),
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.FILE.urn,
                          type_payload=beam_runner_api_pb2.ArtifactFilePayload(
                              path='/tmp/bar1').SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='bar1').SerializeToString())
                  ]),
              'env2': beam_runner_api_pb2.Environment(
                  dependencies=[
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.FILE.urn,
                          type_payload=beam_runner_api_pb2.ArtifactFilePayload(
                              path='/tmp/foo2').SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='foo2').SerializeToString()),
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.FILE.urn,
                          type_payload=beam_runner_api_pb2.ArtifactFilePayload(
                              path='/tmp/bar2').SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='bar2').SerializeToString())
                  ])
          }))
  client = apiclient.DataflowApplicationClient(pipeline_options)
  with mock.patch.object(apiclient._LegacyDataflowStager,
                         'stage_job_resources') as mock_stager:
    client._stage_resources(pipeline, pipeline_options)
    mock_stager.assert_called_once_with(
        [('/tmp/foo1', 'foo1'), ('/tmp/bar1', 'bar1'), ('/tmp/foo2', 'foo2'),
         ('/tmp/bar2', 'bar2')],
        staging_location='gs://test-location/staging')

  pipeline_expected = beam_runner_api_pb2.Pipeline(
      components=beam_runner_api_pb2.Components(
          environments={
              'env1': beam_runner_api_pb2.Environment(
                  dependencies=[
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.URL.urn,
                          type_payload=beam_runner_api_pb2.ArtifactUrlPayload(
                              url='gs://test-location/staging/foo1'
                          ).SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='foo1').SerializeToString()),
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.URL.urn,
                          type_payload=beam_runner_api_pb2.ArtifactUrlPayload(
                              url='gs://test-location/staging/bar1'
                          ).SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='bar1').SerializeToString())
                  ]),
              'env2': beam_runner_api_pb2.Environment(
                  dependencies=[
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.URL.urn,
                          type_payload=beam_runner_api_pb2.ArtifactUrlPayload(
                              url='gs://test-location/staging/foo2'
                          ).SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='foo2').SerializeToString()),
                      beam_runner_api_pb2.ArtifactInformation(
                          type_urn=common_urns.artifact_types.URL.urn,
                          type_payload=beam_runner_api_pb2.ArtifactUrlPayload(
                              url='gs://test-location/staging/bar2'
                          ).SerializeToString(),
                          role_urn=common_urns.artifact_roles.STAGING_TO.urn,
                          role_payload=beam_runner_api_pb2.
                          ArtifactStagingToRolePayload(
                              staged_name='bar2').SerializeToString())
                  ])
          }))
  self.assertEqual(pipeline, pipeline_expected)
def run_pipeline(self, pipeline):
  """Remotely executes entire pipeline or parts reachable from node."""
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
  except ImportError:
    raise ImportError(
        'Google Cloud Dataflow runner not available, '
        'please install apache_beam[gcp]')

  # Convert all side inputs into a form acceptable to Dataflow.
  pipeline.visit(self.side_input_visitor())

  # Snapshot the pipeline in a portable proto before mutating it.
  proto_pipeline, self.proto_context = pipeline.to_runner_api(
      return_context=True)

  # TODO(BEAM-2717): Remove once Coders are already in proto.
  for pcoll in proto_pipeline.components.pcollections.values():
    if pcoll.coder_id not in self.proto_context.coders:
      coder = coders.registry.get_coder(pickler.loads(pcoll.coder_id))
      pcoll.coder_id = self.proto_context.coders.get_id(coder)
  self.proto_context.coders.populate_map(proto_pipeline.components.coders)

  # Performing configured PTransform overrides.
  pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

  # Add setup_options for all the BeamPlugin imports.
  setup_options = pipeline._options.view_as(SetupOptions)
  plugins = BeamPlugin.get_all_plugin_paths()
  if setup_options.beam_plugins is not None:
    plugins = list(set(plugins + setup_options.beam_plugins))
  setup_options.beam_plugins = plugins

  self.job = apiclient.Job(pipeline._options, proto_pipeline)

  # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
  # here.
  pipeline.visit(self.group_by_key_input_visitor())

  # Dataflow runner requires output type of the Flatten to be the same as the
  # inputs, hence we enforce that here.
  pipeline.visit(self.flatten_input_visitor())

  # The superclass's run will trigger a traversal of all reachable nodes.
  super(DataflowRunner, self).run_pipeline(pipeline)

  test_options = pipeline._options.view_as(TestOptions)
  # If it is a dry run, return without submitting the job.
  if test_options.dry_run:
    return None

  # Get a Dataflow API client and set its options.
  self.dataflow_client = apiclient.DataflowApplicationClient(pipeline._options)

  # Create the job description and send a request to the service. The result
  # can be None if there is no need to send a request to the service (e.g.
  # template creation). If a request was sent and failed then the call will
  # raise an exception.
  result = DataflowPipelineResult(
      self.dataflow_client.create_job(self.job), self)

  self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
  result.metric_results = self._metrics
  return result
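# Usage sketch (illustrative only, not part of this module): run_pipeline is
# normally reached by calling Pipeline.run() with this runner selected. The
# project id, region, and bucket below are placeholder values, not defaults
# defined anywhere in this codebase.
#
#   import apache_beam as beam
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   options = PipelineOptions(
#       runner='DataflowRunner',
#       project='my-gcp-project',               # placeholder project id
#       region='us-central1',
#       temp_location='gs://my-bucket/temp')    # placeholder GCS bucket
#
#   p = beam.Pipeline(options=options)
#   (p
#    | beam.Create([1, 2, 3])
#    | beam.Map(lambda x: x * x))
#   result = p.run()            # dispatches to DataflowRunner.run_pipeline
#   result.wait_until_finish()  # blocks until the remote job completes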