Example #1
    def __get_workflow_client(self, region):
        """Based on the region, returns the dataproc workflow client."""

        if region == 'global':  # use the global configuration
            self.__print("Using global region configuration")
            return dataproc_v1.WorkflowTemplateServiceClient()

        client_transport = wtsgt.WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region))
        return dataproc_v1.WorkflowTemplateServiceClient(client_transport)
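
The transport-based construction above predates client_options support. With a client version that accepts client_options (as Example #13 below does), the same regional routing can be written without building a transport by hand; a minimal standalone sketch of that alternative:

from google.cloud import dataproc_v1

def get_workflow_client(region):
    # The global endpoint needs no special configuration.
    if region == "global":
        return dataproc_v1.WorkflowTemplateServiceClient()
    # Regional endpoints follow the {region}-dataproc.googleapis.com pattern.
    return dataproc_v1.WorkflowTemplateServiceClient(
        client_options={
            "api_endpoint": "{}-dataproc.googleapis.com:443".format(region)
        })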
    def test_create_workflow_template(self):
        # Setup Expected Response
        id_ = "id3355"
        name = "name3373707"
        version = 351608024
        expected_response = {"id": id_, "name": name, "version": version}
        expected_response = workflow_templates_pb2.WorkflowTemplate(
            **expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup Request
        parent = client.region_path("[PROJECT]", "[REGION]")
        template = {}

        response = client.create_workflow_template(parent, template)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = workflow_templates_pb2.CreateWorkflowTemplateRequest(
            parent=parent, template=template)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
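
ChannelStub and CustomException are test doubles defined in the generated test module, not part of the public library. A sketch close to the stub these tests rely on, which also shows why each request proto is read back as channel.requests[0][1]:

class CustomException(Exception):
    pass

class MultiCallableStub(object):
    """Stub for the grpc.UnaryUnaryMultiCallable interface."""

    def __init__(self, method, channel_stub):
        self.method = method
        self.channel_stub = channel_stub

    def __call__(self, request, timeout=None, metadata=None, credentials=None):
        # Record (method, request); tests read the request at index 1.
        self.channel_stub.requests.append((self.method, request))
        response = None
        if self.channel_stub.responses:
            response = self.channel_stub.responses.pop(0)
        # Queued exceptions are raised instead of returned.
        if isinstance(response, Exception):
            raise response
        if response:
            return response

class ChannelStub(object):
    """Stub for the grpc.Channel interface."""

    def __init__(self, responses=[]):
        self.responses = responses
        self.requests = []

    def unary_unary(self, method, request_serializer=None,
                    response_deserializer=None):
        return MultiCallableStub(method, self)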
    def test_list_workflow_templates(self):
        # Setup Expected Response
        next_page_token = ""
        templates_element = {}
        templates = [templates_element]
        expected_response = {
            "next_page_token": next_page_token,
            "templates": templates
        }
        expected_response = workflow_templates_pb2.ListWorkflowTemplatesResponse(
            **expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup Request
        parent = client.region_path("[PROJECT]", "[REGION]")

        paged_list_response = client.list_workflow_templates(parent)
        resources = list(paged_list_response)
        assert len(resources) == 1

        assert expected_response.templates[0] == resources[0]

        assert len(channel.requests) == 1
        expected_request = workflow_templates_pb2.ListWorkflowTemplatesRequest(
            parent=parent)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
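
list_workflow_templates returns a page iterator from google.api_core: iterating it directly, as above, yields individual templates across all pages, while its pages attribute yields one response page at a time. A minimal sketch of page-wise consumption:

# Consume the pager page by page instead of item by item.
paged_list_response = client.list_workflow_templates(parent)
for page in paged_list_response.pages:
    for template in page:
        print(template.name)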
    def test_instantiate_inline_workflow_template_exception(self):
        # Setup Response
        error = status_pb2.Status()
        operation = operations_pb2.Operation(
            name=
            "operations/test_instantiate_inline_workflow_template_exception",
            done=True,
        )
        operation.error.CopyFrom(error)

        # Mock the API response
        channel = ChannelStub(responses=[operation])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup Request
        parent = client.region_path("[PROJECT]", "[REGION]")
        template = {}

        response = client.instantiate_inline_workflow_template(
            parent, template)
        exception = response.exception()
        assert exception.errors[0] == error

    def test_instantiate_inline_workflow_template(self):
        # Setup Expected Response
        expected_response = {}
        expected_response = empty_pb2.Empty(**expected_response)
        operation = operations_pb2.Operation(
            name="operations/test_instantiate_inline_workflow_template",
            done=True)
        operation.response.Pack(expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[operation])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup Request
        parent = client.region_path("[PROJECT]", "[REGION]")
        template = {}

        response = client.instantiate_inline_workflow_template(
            parent, template)
        result = response.result()
        assert expected_response == result

        assert len(channel.requests) == 1
        expected_request = workflow_templates_pb2.InstantiateInlineWorkflowTemplateRequest(
            parent=parent, template=template)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request

    def test_get_workflow_template(self):
        # Setup Expected Response
        id_ = "id3355"
        name_2 = "name2-1052831874"
        version = 351608024
        expected_response = {"id": id_, "name": name_2, "version": version}
        expected_response = workflow_templates_pb2.WorkflowTemplate(
            **expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup Request
        name = client.workflow_template_path("[PROJECT]", "[REGION]",
                                             "[WORKFLOW_TEMPLATE]")

        response = client.get_workflow_template(name)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = workflow_templates_pb2.GetWorkflowTemplateRequest(
            name=name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request

def main(
    project_id,
    zone,
    cluster_name,
    bucket_name,
    pyspark_file=None,
    create_new_cluster=True,
    global_region=True,
):

    # [START dataproc_get_workflow_template_client]
    if global_region:
        region = "global"
        # Use the default gRPC global endpoints.
        dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = workflow_template_service_grpc_transport.WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region))
        dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient(
            client_transport)
    # [END dataproc_get_workflow_template_client]

    # Open the PySpark file before the try-block so `spark_file` is always
    # defined when the finally-block closes it.
    spark_file, spark_filename = get_pyspark_file(pyspark_file)
    try:
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        run_workflow(
            dataproc_workflow_client,
            project_id,
            region,
            zone,
            bucket_name,
            spark_filename,
            cluster_name,
        )
        wait_for_workflow_end()

    finally:
        spark_file.close()
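
A hedged invocation sketch for main; every value below is a placeholder, and the helpers it calls (get_region_from_zone, get_pyspark_file, upload_pyspark_file, run_workflow, wait_for_workflow_end) are defined elsewhere in the sample:

# Placeholder values; substitute a real project, zone, and bucket.
main(
    project_id="my-project-id",
    zone="us-central1-a",
    cluster_name="my-workflow-cluster",
    bucket_name="my-staging-bucket",
    pyspark_file="job.py",      # passed through to get_pyspark_file
    create_new_cluster=True,
    global_region=False,        # route to the regional endpoint instead
)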
from google.cloud import dataproc_v1 as dataproc


def instantiate_inline_workflow_template(project_id, region):
    """This sample walks a user through submitting a workflow
       to Cloud Dataproc using the Python client library.

       Args:
           project_id (string): Project to use for running the workflow.
           region (string): Region where the workflow resources should live.
    """

    # Create a client with the endpoint set to the desired region.
    workflow_template_client = dataproc.WorkflowTemplateServiceClient(
        client_options={
            'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
        })

    parent = workflow_template_client.region_path(project_id, region)

    template = {
        'jobs': [{
            'hadoop_job': {
                'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/'
                'hadoop-mapreduce-examples.jar',
                'args': ['teragen', '1000', 'hdfs:///gen/']
            },
            'step_id': 'teragen'
        }, {
            'hadoop_job': {
                'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/'
                'hadoop-mapreduce-examples.jar',
                'args': ['terasort', 'hdfs:///gen/', 'hdfs:///sort/']
            },
            'step_id': 'terasort',
            'prerequisite_step_ids': ['teragen']
        }],
        'placement': {
            'managed_cluster': {
                'cluster_name': 'my-managed-cluster',
                'config': {
                    'gce_cluster_config': {
                        # Leave 'zone_uri' empty for 'Auto Zone Placement'
                        # 'zone_uri': ''
                        'zone_uri': 'us-central1-a'
                    }
                }
            }
        }
    }

    # Submit the request to instantiate the workflow from an inline template.
    operation = workflow_template_client.instantiate_inline_workflow_template(
        parent, template)
    operation.result()

    # Output a success message.
    print('Workflow ran successfully.')
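
A minimal invocation with placeholder values; since operation.result() blocks, the call returns only after both the teragen and terasort steps have finished:

instantiate_inline_workflow_template("my-project-id", "us-central1")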
    def test_list_workflow_templates_exception(self):
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup request
        parent = client.region_path("[PROJECT]", "[REGION]")

        paged_list_response = client.list_workflow_templates(parent)
        with pytest.raises(CustomException):
            list(paged_list_response)

    def test_update_workflow_template_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup request
        template = {}

        with pytest.raises(CustomException):
            client.update_workflow_template(template)

    def test_delete_workflow_template_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup request
        name = client.workflow_template_path("[PROJECT]", "[REGION]",
                                             "[WORKFLOW_TEMPLATE]")

        with pytest.raises(CustomException):
            client.delete_workflow_template(name)

    def test_delete_workflow_template(self):
        channel = ChannelStub()
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.WorkflowTemplateServiceClient()

        # Setup Request
        name = client.workflow_template_path("[PROJECT]", "[REGION]",
                                             "[WORKFLOW_TEMPLATE]")

        client.delete_workflow_template(name)

        assert len(channel.requests) == 1
        expected_request = workflow_templates_pb2.DeleteWorkflowTemplateRequest(
            name=name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request

Example #13
from google.cloud import dataproc_v1 as dataproc


def instantiate_inline_workflow_template(project_id, region):
    """This sample walks a user through submitting a workflow
       to Cloud Dataproc using the Python client library.

       Args:
           project_id (string): Project to use for running the workflow.
           region (string): Region where the workflow resources should live.
    """

    # Create a client with the endpoint set to the desired region.
    workflow_template_client = dataproc.WorkflowTemplateServiceClient(
        client_options={
            "api_endpoint": f"{region}-dataproc.googleapis.com:443"
        })

    parent = "projects/{}/regions/{}".format(project_id, region)

    template = {
        "jobs": [
            {
                "hadoop_job": {
                    "main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
                    "hadoop-mapreduce-examples.jar",
                    "args": ["teragen", "1000", "hdfs:///gen/"],
                },
                "step_id": "teragen",
            },
            {
                "hadoop_job": {
                    "main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
                    "hadoop-mapreduce-examples.jar",
                    "args": ["terasort", "hdfs:///gen/", "hdfs:///sort/"],
                },
                "step_id": "terasort",
                "prerequisite_step_ids": ["teragen"],
            },
        ],
        "placement": {
            "managed_cluster": {
                "cluster_name": "my-managed-cluster",
                "config": {
                    "gce_cluster_config": {
                        # Leave 'zone_uri' empty for 'Auto Zone Placement'
                        # 'zone_uri': ''
                        "zone_uri": "us-central1-a"
                    }
                },
            }
        },
    }

    # Submit the request to instantiate the workflow from an inline template.
    operation = workflow_template_client.instantiate_inline_workflow_template(
        request={
            "parent": parent,
            "template": template
        })
    operation.result()

    # Output a success message.
    print("Workflow ran successfully.")
Example #14
import base64
import json
import random
import time
from functools import partial

from google.cloud import dataproc_v1, storage
# Transport modules as laid out in google-cloud-dataproc 1.x.
from google.cloud.dataproc_v1.gapic.transports import (
    cluster_controller_grpc_transport,
    workflow_template_service_grpc_transport,
)
from google.protobuf.duration_pb2 import Duration

# project_id, region_id and zone_id, as well as retrieve_configuration and
# execution_callback, are assumed to be defined elsewhere in this module.


def trigger_dataproc_jobs(message, context):
    """Entry point for the Cloud Function.

    Captures a Pub/Sub message from the configured source topic and constructs
    a Dataproc inline workflow request to run the jobs specified in the
    message.

    Args:
        message: the Pub/Sub message payload.
        context: the Cloud Function event context.
    """

    if 'data' not in message:
        print("no data in the Pub/Sub message, nothing to do...")
        return

    event = json.loads(base64.b64decode(message['data']).decode('utf-8'))

    if not "jobs" in event.keys():
        print("jobs property not present in the event, no work to be done...")
        return

    # initialize needed GCP clients
    wf_client_transport = (
        workflow_template_service_grpc_transport.
        WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region_id)))
    dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient(
        wf_client_transport)
    dp_client_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(region_id)))
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        dp_client_transport)
    storage_client = storage.Client()

    # retrieve the function's configuration from Cloud Storage
    config = retrieve_configuration(storage_client)

    # build parent region path for dataproc api requests
    parent = dataproc_workflow_client.region_path(project_id, region_id)

    # extract the event's parameters
    zone = event.get('zone', zone_id)
    job_name = event.get('job_name', 'dataproc-workflow-test')
    template_name = "projects/{}/regions/{}/workflowTemplates/{}".format(
        project_id, region_id, job_name)
    cluster_name = 'cluster-' + job_name
    cluster_init_actions = event.get('cluster_init_actions', [])
    request_id = event.get('request_id', template_name.replace('/', '_'))
    job_labels = event.get('labels', {})
    job_labels['job_name'] = job_name
    job_labels['request_id'] = request_id
    req_metadata = event.get('metadata', {})

    # Check whether another cluster with the same labels is already running;
    # randomizing the wait time improves the chances of catching duplicate
    # requests.
    time.sleep(random.randint(1, 5))
    for _ in dataproc_cluster_client.list_clusters(
            project_id, region_id,
            'labels.job_name = {} AND labels.request_id = {}'.format(
                job_name, request_id)):
        print("workflow instance already running for the same job_name and "
              "request_id pair ({},{}), exiting".format(job_name, request_id))
        return

    if not isinstance(cluster_init_actions, list):
        print("cluster initialization actions should be a list")
        return

    # Look up a cluster configuration for the requested job name in the
    # function's configuration; fall back to the default when none exists.
    cluster_config = config.get(job_name, config['default_cluster_config'])

    cluster_config['labels'] = {**cluster_config['labels'], **job_labels}
    cluster_config['cluster_name'] = cluster_name
    cluster_config['config']['gce_cluster_config']['metadata'] = {
        **cluster_config['config']['gce_cluster_config']['metadata'],
        **req_metadata
    }
    cluster_config['config']['gce_cluster_config']['zone_uri'] = zone
    cluster_config['config']['initialization_actions'] = cluster_config[
        'config']['initialization_actions'] + cluster_init_actions
    for action in cluster_config['config']['initialization_actions']:
        if 'execution_timeout' in action:
            timeout = Duration(seconds=action['execution_timeout'])
            action['execution_timeout'] = timeout

    # create the inline workflow template request
    inline_template = {
        'name': template_name,
        'placement': {
            'managed_cluster': cluster_config
        },
        'jobs': event['jobs']
    }

    # send the request to instantiate the inline workflow template
    response = dataproc_workflow_client.instantiate_inline_workflow_template(
        parent,
        inline_template,
        request_id=request_id,
        metadata=[('job_name', job_name)])

    # capture the operation name, along with other request parameters, for the
    # execution's metadata
    metadata = {
        'operation_name': response.operation.name,
        'template_name': template_name,
        'cluster_name': cluster_name
    }

    print('workflow instance created, request id {}, operation\'s name: {}'.
          format(request_id, metadata['operation_name']))

    # Register a callback to run when the workflow execution completes. The
    # partial bakes local information into the callback, normally not
    # available at execution time, to enrich logging and the propagation of
    # event results.
    response.add_done_callback(partial(execution_callback, metadata=metadata))
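
For local testing, the function can be invoked directly with a hand-built Pub/Sub-style payload; a hedged sketch in which the event contents are placeholders:

import base64
import json

# In production the Functions runtime supplies message and context.
event = {
    "job_name": "dataproc-workflow-test",
    "jobs": [{
        "step_id": "example-step",
        "hadoop_job": {
            "main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
                                 "hadoop-mapreduce-examples.jar",
            "args": ["teragen", "1000", "hdfs:///gen/"],
        },
    }],
}
message = {"data": base64.b64encode(json.dumps(event).encode("utf-8"))}
trigger_dataproc_jobs(message, context=None)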