def __get_workflow_client(self, region):
    """Based on the region, returns the dataproc workflow client."""
    if region == 'global':
        # use the global configuration
        self.__print("Using global region configuration")
        return dataproc_v1.WorkflowTemplateServiceClient()
    client_transport = wtsgt.WorkflowTemplateServiceGrpcTransport(
        address="{}-dataproc.googleapis.com:443".format(region))
    return dataproc_v1.WorkflowTemplateServiceClient(client_transport)
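# The `wtsgt` alias used above is not defined in this excerpt. Under the
# pre-2.0 google-cloud-dataproc GAPIC layout it most plausibly corresponds
# to the import below -- an assumption, inferred from the transport module
# path spelled out elsewhere in this file:
from google.cloud.dataproc_v1.gapic.transports import (
    workflow_template_service_grpc_transport as wtsgt)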
def test_create_workflow_template(self):
    # Setup Expected Response
    id_ = "id3355"
    name = "name3373707"
    version = 351608024
    expected_response = {"id": id_, "name": name, "version": version}
    expected_response = workflow_templates_pb2.WorkflowTemplate(
        **expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup Request
    parent = client.region_path("[PROJECT]", "[REGION]")
    template = {}

    response = client.create_workflow_template(parent, template)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = workflow_templates_pb2.CreateWorkflowTemplateRequest(
        parent=parent, template=template)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
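# `ChannelStub` and `CustomException` are used throughout these tests but not
# defined in this excerpt. A minimal sketch consistent with the helpers that
# GAPIC-generated test suites usually ship -- an approximation, not the exact
# upstream code:
class MultiCallableStub(object):
    """Stub for the grpc.UnaryUnaryMultiCallable interface."""

    def __init__(self, method, channel_stub):
        self.method = method
        self.channel_stub = channel_stub

    def __call__(self, request, timeout=None, metadata=None, credentials=None):
        # Record the request so tests can assert on it later.
        self.channel_stub.requests.append((self.method, request))
        response = None
        if self.channel_stub.responses:
            response = self.channel_stub.responses.pop(0)
        if isinstance(response, Exception):
            raise response
        return response


class ChannelStub(object):
    """Stub for the grpc.Channel interface; replays canned responses."""

    def __init__(self, responses=()):
        self.responses = list(responses)
        self.requests = []

    def unary_unary(self, method, request_serializer=None,
                    response_deserializer=None):
        return MultiCallableStub(method, self)


class CustomException(Exception):
    pass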
def test_list_workflow_templates(self):
    # Setup Expected Response
    next_page_token = ""
    templates_element = {}
    templates = [templates_element]
    expected_response = {
        "next_page_token": next_page_token,
        "templates": templates
    }
    expected_response = workflow_templates_pb2.ListWorkflowTemplatesResponse(
        **expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup Request
    parent = client.region_path("[PROJECT]", "[REGION]")

    paged_list_response = client.list_workflow_templates(parent)
    resources = list(paged_list_response)
    assert len(resources) == 1
    assert expected_response.templates[0] == resources[0]

    assert len(channel.requests) == 1
    expected_request = workflow_templates_pb2.ListWorkflowTemplatesRequest(
        parent=parent)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_instantiate_inline_workflow_template_exception(self):
    # Setup Response
    error = status_pb2.Status()
    operation = operations_pb2.Operation(
        name="operations/test_instantiate_inline_workflow_template_exception",
        done=True,
    )
    operation.error.CopyFrom(error)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup Request
    parent = client.region_path("[PROJECT]", "[REGION]")
    template = {}

    response = client.instantiate_inline_workflow_template(parent, template)
    exception = response.exception()
    assert exception.errors[0] == error
def test_instantiate_inline_workflow_template(self):
    # Setup Expected Response
    expected_response = {}
    expected_response = empty_pb2.Empty(**expected_response)
    operation = operations_pb2.Operation(
        name="operations/test_instantiate_inline_workflow_template",
        done=True)
    operation.response.Pack(expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[operation])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup Request
    parent = client.region_path("[PROJECT]", "[REGION]")
    template = {}

    response = client.instantiate_inline_workflow_template(parent, template)
    result = response.result()
    assert expected_response == result

    assert len(channel.requests) == 1
    expected_request = workflow_templates_pb2.InstantiateInlineWorkflowTemplateRequest(
        parent=parent, template=template)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_get_workflow_template(self):
    # Setup Expected Response
    id_ = "id3355"
    name_2 = "name2-1052831874"
    version = 351608024
    expected_response = {"id": id_, "name": name_2, "version": version}
    expected_response = workflow_templates_pb2.WorkflowTemplate(
        **expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup Request
    name = client.workflow_template_path("[PROJECT]", "[REGION]",
                                         "[WORKFLOW_TEMPLATE]")

    response = client.get_workflow_template(name)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = workflow_templates_pb2.GetWorkflowTemplateRequest(
        name=name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def main(
    project_id,
    zone,
    cluster_name,
    bucket_name,
    pyspark_file=None,
    create_new_cluster=True,
    global_region=True,
):
    # [START dataproc_get_workflow_template_client]
    if global_region:
        region = "global"
        # Use the default gRPC global endpoints.
        dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = workflow_template_service_grpc_transport.WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region))
        dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient(
            client_transport)
    # [END dataproc_get_workflow_template_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)
        run_workflow(
            dataproc_workflow_client,
            project_id,
            region,
            zone,
            bucket_name,
            spark_filename,
            cluster_name,
        )
        wait_for_workflow_end()
    finally:
        spark_file.close()
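# `get_region_from_zone` is called above but not defined in this excerpt.
# A minimal sketch, assuming zones follow the standard
# '<region>-<zone-letter>' pattern (e.g. 'us-central1-a' -> 'us-central1'):
def get_region_from_zone(zone):
    try:
        # Drop the trailing zone letter to recover the region name.
        region_as_list = zone.split("-")[:-1]
        return "-".join(region_as_list)
    except (AttributeError, IndexError, ValueError):
        raise ValueError("Invalid zone provided, please check your input.")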
def instantiate_inline_workflow_template(project_id, region):
    """This sample walks a user through submitting a workflow
    to Cloud Dataproc using the Python client library.

    Args:
        project_id (string): Project to use for running the workflow.
        region (string): Region where the workflow resources should live.
    """
    # Create a client with the endpoint set to the desired region.
    workflow_template_client = dataproc.WorkflowTemplateServiceClient(
        client_options={
            'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
        })

    parent = workflow_template_client.region_path(project_id, region)

    template = {
        'jobs': [{
            'hadoop_job': {
                'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/'
                                     'hadoop-mapreduce-examples.jar',
                'args': ['teragen', '1000', 'hdfs:///gen/']
            },
            'step_id': 'teragen'
        }, {
            'hadoop_job': {
                'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/'
                                     'hadoop-mapreduce-examples.jar',
                'args': ['terasort', 'hdfs:///gen/', 'hdfs:///sort/']
            },
            'step_id': 'terasort',
            'prerequisite_step_ids': ['teragen']
        }],
        'placement': {
            'managed_cluster': {
                'cluster_name': 'my-managed-cluster',
                'config': {
                    'gce_cluster_config': {
                        # Leave 'zone_uri' empty for 'Auto Zone Placement'
                        # 'zone_uri': ''
                        'zone_uri': 'us-central1-a'
                    }
                }
            }
        }
    }

    # Submit the request to instantiate the workflow from an inline template.
    operation = workflow_template_client.instantiate_inline_workflow_template(
        parent, template)
    operation.result()

    # Output a success message.
    print('Workflow ran successfully.')
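# A minimal command-line driver for the sample above; the CLI shape is an
# assumption modeled on how the Dataproc Python samples are typically run,
# not part of the original snippet.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        sys.exit('Usage: python instantiate_inline_workflow_template.py '
                 '<project_id> <region>')
    instantiate_inline_workflow_template(sys.argv[1], sys.argv[2])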
def test_list_workflow_templates_exception(self):
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup request
    parent = client.region_path("[PROJECT]", "[REGION]")

    paged_list_response = client.list_workflow_templates(parent)
    with pytest.raises(CustomException):
        list(paged_list_response)
def test_update_workflow_template_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup request
    template = {}

    with pytest.raises(CustomException):
        client.update_workflow_template(template)
def test_delete_workflow_template_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup request
    name = client.workflow_template_path("[PROJECT]", "[REGION]",
                                         "[WORKFLOW_TEMPLATE]")

    with pytest.raises(CustomException):
        client.delete_workflow_template(name)
def test_delete_workflow_template(self):
    channel = ChannelStub()
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = dataproc_v1.WorkflowTemplateServiceClient()

    # Setup Request
    name = client.workflow_template_path("[PROJECT]", "[REGION]",
                                         "[WORKFLOW_TEMPLATE]")

    client.delete_workflow_template(name)

    assert len(channel.requests) == 1
    expected_request = workflow_templates_pb2.DeleteWorkflowTemplateRequest(
        name=name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def instantiate_inline_workflow_template(project_id, region):
    """This sample walks a user through submitting a workflow
    to Cloud Dataproc using the Python client library.

    Args:
        project_id (string): Project to use for running the workflow.
        region (string): Region where the workflow resources should live.
    """
    # Create a client with the endpoint set to the desired region.
    workflow_template_client = dataproc.WorkflowTemplateServiceClient(
        client_options={
            "api_endpoint": f"{region}-dataproc.googleapis.com:443"
        })

    parent = "projects/{}/regions/{}".format(project_id, region)

    template = {
        "jobs": [
            {
                "hadoop_job": {
                    "main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
                                         "hadoop-mapreduce-examples.jar",
                    "args": ["teragen", "1000", "hdfs:///gen/"],
                },
                "step_id": "teragen",
            },
            {
                "hadoop_job": {
                    "main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
                                         "hadoop-mapreduce-examples.jar",
                    "args": ["terasort", "hdfs:///gen/", "hdfs:///sort/"],
                },
                "step_id": "terasort",
                "prerequisite_step_ids": ["teragen"],
            },
        ],
        "placement": {
            "managed_cluster": {
                "cluster_name": "my-managed-cluster",
                "config": {
                    "gce_cluster_config": {
                        # Leave 'zone_uri' empty for 'Auto Zone Placement'
                        # 'zone_uri': ''
                        "zone_uri": "us-central1-a"
                    }
                },
            }
        },
    }

    # Submit the request to instantiate the workflow from an inline template.
    operation = workflow_template_client.instantiate_inline_workflow_template(
        request={"parent": parent, "template": template})
    operation.result()

    # Output a success message.
    print("Workflow ran successfully.")
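# This variant is the same sample as the earlier one, ported to the
# google-cloud-dataproc >= 2.0 surface: the regional endpoint moves into
# `client_options`, the parent path is built by hand (the old `region_path`
# helper was removed), and arguments are passed as a request dict. Note that
# `operation.result()` blocks until the workflow finishes; google.api_core
# operation futures accept a timeout in seconds if a bounded wait is
# preferred, e.g. (illustrative value, not from the sample):
#
#     operation.result(timeout=600)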
def trigger_dataproc_jobs(message, context):
    """Entry point for the Cloud Function.

    Captures a Pub/Sub message from the configured source topic and
    constructs a Dataproc inline workflow request to run the jobs specified
    in the message request.

    message: the Pub/Sub message
    context: the Cloud Function context information
    """
    if 'data' not in message:
        print("no data in the Pubsub message, nothing to do...")
        return

    event = json.loads(base64.b64decode(message['data']).decode('utf-8'))

    if 'jobs' not in event.keys():
        print("jobs property not present in the event, no work to be done...")
        return

    # initialize needed GCP clients
    wf_client_transport = (
        workflow_template_service_grpc_transport.
        WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region_id)))
    dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient(
        wf_client_transport)
    dp_client_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(region_id)))
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        dp_client_transport)
    storage_client = storage.Client()

    # retrieve the cloud function configuration from storage
    config = retrieve_configuration(storage_client)

    # build the parent region path for dataproc api requests
    parent = dataproc_workflow_client.region_path(project_id, region_id)

    # extract the event's parameters
    zone = event.get('zone', zone_id)
    job_name = event.get('job_name', 'dataproc-workflow-test')
    template_name = "projects/{}/regions/{}/workflowTemplates/{}".format(
        project_id, region_id, job_name)
    cluster_name = 'cluster-' + job_name
    cluster_init_actions = event.get('cluster_init_actions', [])
    request_id = event.get('request_id', template_name.replace('/', '_'))
    job_labels = event.get('labels', {})
    job_labels['job_name'] = job_name
    job_labels['request_id'] = request_id
    req_metadata = event.get('metadata', {})

    # Check whether another cluster with the same labels is already running;
    # randomizing the wait time improves the chances of catching duplicated
    # requests.
    time.sleep(random.randint(1, 5))
    for cluster in dataproc_cluster_client.list_clusters(
            project_id, region_id,
            'labels.job_name = {} AND labels.request_id = {}'.format(
                job_name, request_id)):
        print("workflow instance already running for same pair job_name and "
              "request_id ({},{}), exiting".format(job_name, request_id))
        return

    if not isinstance(cluster_init_actions, list):
        print("cluster initialization actions should be a list")
        return

    # Check the function's configuration for an entry matching the job name
    # in the execution request.
    cluster_config = None
    if job_name not in config.keys():
        # if no specific configuration exists, use the default one
        cluster_config = config['default_cluster_config']
    else:
        cluster_config = config[job_name]

    cluster_config['labels'] = {**cluster_config['labels'], **job_labels}
    cluster_config['cluster_name'] = cluster_name
    cluster_config['config']['gce_cluster_config']['metadata'] = {
        **cluster_config['config']['gce_cluster_config']['metadata'],
        **req_metadata
    }
    cluster_config['config']['gce_cluster_config']['zone_uri'] = zone
    cluster_config['config']['initialization_actions'] = cluster_config[
        'config']['initialization_actions'] + cluster_init_actions

    for action in cluster_config['config']['initialization_actions']:
        if 'execution_timeout' in action:
            timeout = Duration(seconds=action['execution_timeout'])
            action['execution_timeout'] = timeout

    # create the inline template request
    inline_template = {
        'name': template_name,
        'placement': {
            'managed_cluster': cluster_config
        },
        'jobs': event['jobs']
    }

    # send the request to instantiate the inlined workflow template
    response = dataproc_workflow_client.instantiate_inline_workflow_template(
        parent,
        inline_template,
        request_id=request_id,
        metadata=[('job_name', job_name)])

    # Capture the operation name for the execution's metadata, along with
    # other request parameters.
    metadata = {
        'operation_name': response.operation.name,
        'template_name': template_name,
        'cluster_name': cluster_name
    }
    print("workflow instance created, request id {}, operation's name: {}".
          format(request_id, metadata['operation_name']))

    # Set the future to be called when the workflow execution completes.
    # This partial function gets populated with local information, normally
    # not present at the callback execution time, to enrich logging and event
    # results propagation.
    response.add_done_callback(partial(execution_callback, metadata=metadata))
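# `execution_callback` is referenced above but not defined in this excerpt.
# A minimal sketch of a compatible done-callback; the logging body is
# illustrative only. Because the callback is registered via
# `partial(execution_callback, metadata=metadata)`, it is invoked as
# execution_callback(future, metadata=metadata).
def execution_callback(future, metadata):
    try:
        # Raises if the workflow's underlying operation failed.
        future.result()
        print("workflow {} completed, operation's name: {}".format(
            metadata['template_name'], metadata['operation_name']))
    except Exception as err:
        print("workflow {} failed: {}".format(
            metadata['template_name'], err))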