def main(project_id,
         zone,
         cluster_name,
         bucket_name,
         pyspark_file=None,
         create_new_cluster=True,
         global_region=True):

    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone, region,
                           cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id, region)

        (cluster_id,
         output_bucket) = (get_cluster_id_by_name(dataproc_cluster_client,
                                                  project_id, region,
                                                  cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
        # Close the local PySpark file whether or not a new cluster was used.
        spark_file.close()
def main(project_id, region):

    if region == "global":
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
    else:
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region)
        )
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(client_transport)

    list_clusters(dataproc_cluster_client, project_id, region)
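
# A minimal sketch (not part of the original example) of the list_clusters
# helper called above; it assumes the pre-2.0 google-cloud-dataproc API, in
# which list_clusters accepts positional project_id and region arguments.
def list_clusters(dataproc_cluster_client, project_id, region):
    """Print the name and current state of every cluster in the region."""
    for cluster in dataproc_cluster_client.list_clusters(project_id, region):
        print('{} - {}'.format(
            cluster.cluster_name,
            cluster.status.State.Name(cluster.status.state)))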
    def test_delete_cluster(self):
        # Setup Expected Response
        expected_response = {}
        expected_response = empty_pb2.Empty(**expected_response)
        operation = operations_pb2.Operation(
            name='operations/test_delete_cluster', done=True)
        operation.response.Pack(expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[operation])
        client = dataproc_v1.ClusterControllerClient(channel=channel)

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster_name = 'clusterName-1018081872'

        response = client.delete_cluster(project_id, region, cluster_name)
        result = response.result()
        assert expected_response == result

        assert len(channel.requests) == 1
        expected_request = clusters_pb2.DeleteClusterRequest(
            project_id=project_id, region=region, cluster_name=cluster_name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #4
    def post(self, request):
        print("List of clusters initiated ......")
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\t\\keys.json"
        project_id = "deepak-cloud-trail"
        zone = request.POST["zone"]
        region = get_region_from_zone(zone)
        zone_uri = (
            'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
                project_id, zone))
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))

        dataproc_client = dataproc_v1.ClusterControllerClient(client_transport)
        cluster_name = request.POST["cluster_name"]
        cluster_data = {
            'project_id': project_id,
            'cluster_name': cluster_name,
            'config': {
                'gce_cluster_config': {
                    'zone_uri': zone_uri
                },
                'master_config': {
                    'num_instances': 1,
                    'machine_type_uri': 'n1-standard-1'
                },
                'worker_config': {
                    'num_instances': 2,
                    'machine_type_uri': 'n1-standard-1'
                }
            }
        }
        cluster = dataproc_client.create_cluster(project_id, region,
                                                 cluster_data)
Example #5
  def __init__(self, cluster_metadata: MasterURLIdentifier) -> None:
    """Initializes the DataprocClusterManager with properties required
    to interface with the Dataproc ClusterControllerClient.
    """
    self.cluster_metadata = cluster_metadata
    if self.cluster_metadata.region == 'global':
      # The global region is unsupported as it will be eventually deprecated.
      raise ValueError('Clusters in the global region are not supported.')
    elif not self.cluster_metadata.region:
      _LOGGER.warning(
          'No region information was detected, defaulting Dataproc cluster '
          'region to: us-central1.')
      self.cluster_metadata.region = 'us-central1'

    if not self.cluster_metadata.cluster_name:
      self.cluster_metadata.cluster_name = ie.current_env(
      ).clusters.default_cluster_name

    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint': \
            f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })

    if self.cluster_metadata in ie.current_env().clusters.master_urls.inverse:
      self.master_url = ie.current_env().clusters.master_urls.inverse[
          self.cluster_metadata]
      self.dashboard = ie.current_env().clusters.master_urls_to_dashboards[
          self.master_url]
    else:
      self.master_url = None
      self.dashboard = None

    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
def create_cluster(project_id, region, cluster_name, create_buckets=None):
    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            "api_endpoint": f"{region}-dataproc.googleapis.com:443"
        })

    staging_bucket = None
    tmp_bucket = None
    if create_buckets:
        staging_bucket = create_bucket_if_not_exists(
            project_id, region, f'{cluster_name}-staging')
        tmp_bucket = create_bucket_if_not_exists(project_id, region,
                                                 f'{cluster_name}-tmp')

    # Create the cluster config.
    cluster = create_dataproc_config(project_id, cluster_name, region,
                                     staging_bucket, tmp_bucket)

    # Create the cluster.
    operation = cluster_client.create_cluster(request={
        "project_id": project_id,
        "region": region,
        "cluster": cluster
    })
    result = operation.result()

    # Output a success message.
    print(f"Cluster created successfully: {result.cluster_name}")
Example #7
def post(request):
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': 'europe-west2-dataproc.googleapis.com:443'
        })
    create_cluster(cluster_client, "bootcamp-bdmlv", "europe-west2-a",
                   "europe-west2", "hive")
    def test_list_clusters(self):
        project_id = os.environ["PROJECT_ID"]

        client = dataproc_v1.ClusterControllerClient()
        project_id_2 = project_id
        region = "global"
        response = client.list_clusters(project_id_2, region)
def setup_teardown():
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(STAGING_BUCKET)
    blob = bucket.blob(JOB_FILE_NAME)
    blob.upload_from_string(SORT_CODE)

    yield

    cluster_client = dataproc.ClusterControllerClient(
        client_options={"api_endpoint": "{}-dataproc.googleapis.com:443".format(REGION)}
    )

    # The quickstart sample deletes the cluster, but if the test fails
    # before cluster deletion occurs, it can be manually deleted here.
    clusters = cluster_client.list_clusters(
        request={"project_id": PROJECT_ID, "region": REGION}
    )

    for cluster in clusters:
        if cluster.cluster_name == CLUSTER_NAME:
            cluster_client.delete_cluster(
                request={
                    "project_id": PROJECT_ID,
                    "region": REGION,
                    "cluster_name": CLUSTER_NAME,
                }
            )

    blob.delete()
    bucket.delete()
Example #10
def setup_and_teardown_cluster():
    try:
        # Create cluster using cluster client
        cluster_client = dataproc.ClusterControllerClient(
            client_options={
                "api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"
            })

        operation = cluster_client.create_cluster(project_id=PROJECT_ID,
                                                  region=CLUSTER_REGION,
                                                  cluster=CLUSTER_CONFIG)

        # Wait for cluster to provision
        operation.result()

        yield
    finally:
        try:
            # Delete cluster
            operation = cluster_client.delete_cluster(
                project_id=PROJECT_ID,
                region=CLUSTER_REGION,
                cluster_name=DATAPROC_CLUSTER)
            operation.result()
        except NotFound:
            print("Cluster already deleted")
    def test_create_cluster(self):
        # Setup Expected Response
        project_id_2 = 'projectId2939242356'
        cluster_name = 'clusterName-1018081872'
        cluster_uuid = 'clusterUuid-1017854240'
        expected_response = {
            'project_id': project_id_2,
            'cluster_name': cluster_name,
            'cluster_uuid': cluster_uuid
        }
        expected_response = clusters_pb2.Cluster(**expected_response)
        operation = operations_pb2.Operation(
            name='operations/test_create_cluster', done=True)
        operation.response.Pack(expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[operation])
        client = dataproc_v1.ClusterControllerClient(channel=channel)

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster = {}

        response = client.create_cluster(project_id, region, cluster)
        result = response.result()
        assert expected_response == result

        assert len(channel.requests) == 1
        expected_request = clusters_pb2.CreateClusterRequest(
            project_id=project_id, region=region, cluster=cluster)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #12
    def test_get_cluster(self):
        # Setup Expected Response
        project_id_2 = 'projectId2939242356'
        cluster_name_2 = 'clusterName2875867491'
        cluster_uuid = 'clusterUuid-1017854240'
        expected_response = {
            'project_id': project_id_2,
            'cluster_name': cluster_name_2,
            'cluster_uuid': cluster_uuid
        }
        expected_response = clusters_pb2.Cluster(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.ClusterControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster_name = 'clusterName-1018081872'

        response = client.get_cluster(project_id, region, cluster_name)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = clusters_pb2.GetClusterRequest(
            project_id=project_id, region=region, cluster_name=cluster_name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #13
    def test_diagnose_cluster(self):
        # Setup Expected Response
        expected_response = {}
        expected_response = empty_pb2.Empty(**expected_response)
        operation = operations_pb2.Operation(
            name='operations/test_diagnose_cluster', done=True)
        operation.response.Pack(expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[operation])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.ClusterControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster_name = 'clusterName-1018081872'

        response = client.diagnose_cluster(project_id, region, cluster_name)
        result = response.result()
        assert expected_response == result

        assert len(channel.requests) == 1
        expected_request = clusters_pb2.DiagnoseClusterRequest(
            project_id=project_id, region=region, cluster_name=cluster_name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #14
    def test_list_clusters(self):
        # Setup Expected Response
        next_page_token = ''
        clusters_element = {}
        clusters = [clusters_element]
        expected_response = {
            'next_page_token': next_page_token,
            'clusters': clusters
        }
        expected_response = clusters_pb2.ListClustersResponse(
            **expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.ClusterControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_clusters(project_id, region)
        resources = list(paged_list_response)
        assert len(resources) == 1

        assert expected_response.clusters[0] == resources[0]

        assert len(channel.requests) == 1
        expected_request = clusters_pb2.ListClustersRequest(
            project_id=project_id, region=region)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #15
    def test_update_cluster_exception(self):
        # Setup Response
        error = status_pb2.Status()
        operation = operations_pb2.Operation(
            name='operations/test_update_cluster_exception', done=True)
        operation.error.CopyFrom(error)

        # Mock the API response
        channel = ChannelStub(responses=[operation])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.ClusterControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster_name = 'clusterName-1018081872'
        cluster = {}
        update_mask = {}

        response = client.update_cluster(project_id, region, cluster_name,
                                         cluster, update_mask)
        exception = response.exception()
        assert exception.errors[0] == error
def create_cluster(project_id, region, cluster_name):
    """This sample walks a user through creating a Cloud Dataproc cluster
       using the Python client library.

       Args:
           project_id (string): Project to use for creating resources.
           region (string): Region where the resources should live.
           cluster_name (string): Name to use for creating a cluster.
    """

    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )

    # Create the cluster config.
    cluster = {
        "project_id": project_id,
        "cluster_name": cluster_name,
        "config": {
            "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-1"},
            "worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-1"},
        },
    }

    # Create the cluster.
    operation = cluster_client.create_cluster(
        request={"project_id": project_id, "region": region, "cluster": cluster}
    )
    result = operation.result()

    # Output a success message.
    print(f"Cluster created successfully: {result.cluster_name}")
def teardown():
    yield

    client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
        })
    # Client library function
    client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME)
def post(request):
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': 'europe-west2-dataproc.googleapis.com:443'
        })
    operation = cluster_client.delete_cluster("bootcamp-bdmlv", "europe-west2",
                                              "hive")

    return operation.result()
Example #19
def dataproc_cluster_client(zone):
    """
    Lazily create a Dataproc ClusterController client to set up or
    tear down Dataproc clusters.
    """
    region = get_region_from_zone(zone)

    client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address="{}-dataproc.googleapis.com:443".format(region))
    return dataproc_v1.ClusterControllerClient(client_transport)
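
# A minimal sketch (an assumption; the helper is not shown above) of
# get_region_from_zone: Compute Engine zone names such as 'us-central1-a'
# map to their region by dropping the trailing zone suffix.
def get_region_from_zone(zone):
    """Derive the region from a zone name, e.g. 'us-central1-a' -> 'us-central1'."""
    return '-'.join(zone.split('-')[:-1])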
    def test_list_clusters_exception(self):
        channel = ChannelStub(responses=[CustomException()])
        client = dataproc_v1.ClusterControllerClient(channel=channel)

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_clusters(project_id, region)
        with pytest.raises(CustomException):
            list(paged_list_response)
Example #21
def hello_pubsub(event, context):
    cluster_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address='us-central1-dataproc.googleapis.com:443')
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        cluster_transport)
    project_id = ''
    region = 'us-central1'
    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(pubsub_message)
    cluster_name = data['resource']['labels']['cluster_name']
    print(cluster_name + " dataproc cluster created")
Example #22
def teardown():
    yield

    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
        })
    # Client library function
    operation = cluster_client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME)
    # Wait for cluster to delete
    operation.result()
    def test_get_cluster_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        client = dataproc_v1.ClusterControllerClient(channel=channel)

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster_name = 'clusterName-1018081872'

        with pytest.raises(CustomException):
            client.get_cluster(project_id, region, cluster_name)
    def dataproc_cluster_client(self):
        """
        Lazily create a Dataproc ClusterController client to set up or
        tear down Dataproc clusters.
        """

        if self._dataproc_cluster_client is None:
            client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(self._region))
            self._dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
                client_transport)
        return self._dataproc_cluster_client
def main(event, context):
    '''
    Triggered by a change to a Cloud Storage bucket.
    :param event: Event payload describing the uploaded object.
    :param context: Metadata for the triggering event.
    :return: None
    '''

    # Variables
    PROJECT_ID = 'sas-ivnard'
    CLUSTER_NAME = 'score-spark-demo'
    BUCKET_NAME = 'network-spark-migrate'
    REGION = 'europe-west6'
    ZONE = 'europe-west6-b'
    PIP_PACKAGES = "PyYAML==5.3.1 numpy==1.19.4 pandas==1.1.4 pyspark==3.0.1"
    JOB_ID = 'Batch_Model_Score'

    logging.basicConfig(
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            "api_endpoint": f"{REGION}-dataproc.googleapis.com:443"
        })

    # Because several files are uploaded in a loop, the function will be
    # triggered n times, where n is the number of files uploaded, so we filter
    # on the object name. Note: this filter is sufficient for demo purposes but
    # deserves a closer look.
    if event[
            'name'] == 'data/ML-MATT-CompetitionQT1920_val_processed.parquet/_SUCCESS':
        logging.info("A new scoring process is starting...")
        logging.info(f"Creating cluster {CLUSTER_NAME}...")
        cluster = create_cluster(PROJECT_ID, CLUSTER_NAME, BUCKET_NAME, REGION,
                                 ZONE, PIP_PACKAGES)
        logging.info(f"Submitting job {JOB_ID}...")
        cluster.add_done_callback(lambda _: submit_score_job(
            PROJECT_ID, CLUSTER_NAME, REGION, JOB_ID))
        while check_job_state(PROJECT_ID, REGION, JOB_ID) != 'state.done':
            logging.info(f"Job {JOB_ID} is running...")
            time.sleep(5)
        logging.info(f"Job {JOB_ID} is done!")
        logging.info(f"Deleting cluster {CLUSTER_NAME}...")
        delete_cluster(PROJECT_ID, CLUSTER_NAME, REGION)
        while check_if_cluster(PROJECT_ID, CLUSTER_NAME, REGION) is not None:
            logging.info(f"Deleting {CLUSTER_NAME}...")
            time.sleep(2)
        logging.info(f"Cluster {CLUSTER_NAME} deleted!")
  def __init__(self, cluster_metadata: ClusterMetadata) -> None:
    """Initializes the DataprocClusterManager with properties required
    to interface with the Dataproc ClusterControllerClient.
    """
    self.cluster_metadata = cluster_metadata
    # Pipelines whose jobs are executed on the cluster.
    self.pipelines = set()
    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint': \
            f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })
    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
Example #27
    def test_list_clusters_exception(self):
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.ClusterControllerClient()

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_clusters(project_id, region)
        with pytest.raises(CustomException):
            list(paged_list_response)
Example #28
def delete_cluster():
    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(client_options={
        'api_endpoint':
        '{}-dataproc.googleapis.com:443'.format('europe-west1')
    })

    # Delete the cluster.
    operation = cluster_client.delete_cluster('big-data-architecture-ricardo',
                                              'europe-west1', 'dataproc-bda')
    result = operation.result()

    # Output a success message.
    return 'Cluster deleted successfully'
Example #29
def teardown():
    yield

    cluster_client = dataproc.ClusterControllerClient(
        client_options={
            "api_endpoint": f"{REGION}-dataproc.googleapis.com:443"
        })
    # Client library function
    operation = cluster_client.delete_cluster(request={
        "project_id": PROJECT_ID,
        "region": REGION,
        "cluster_name": CLUSTER_NAME,
    })
    # Wait for cluster to delete
    operation.result()
Example #30
    def test_get_cluster_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.ClusterControllerClient()

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        cluster_name = 'clusterName-1018081872'

        with pytest.raises(CustomException):
            client.get_cluster(project_id, region, cluster_name)