Example #1
    def post(self, request):
        print("List of clusters initiated ......")
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\t\\keys.json"
        project_id = "deepak-cloud-trail"
        zone = request.POST["zone"]
        region = get_region_from_zone(zone)
        zone_uri = (
            'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
                project_id, zone))
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))

        dataproc_client = dataproc_v1.ClusterControllerClient(client_transport)
        cluster_name = request.POST["cluster_name"]
        cluster_data = {
            'project_id': project_id,
            'cluster_name': cluster_name,
            'config': {
                'gce_cluster_config': {
                    'zone_uri': zone_uri
                },
                'master_config': {
                    'num_instances': 1,
                    'machine_type_uri': 'n1-standard-1'
                },
                'worker_config': {
                    'num_instances': 2,
                    'machine_type_uri': 'n1-standard-1'
                }
            }
        }
        cluster = dataproc_client.create_cluster(project_id, region,
                                                 cluster_data)
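
Note that in the pre-2.0 google-cloud-dataproc client used throughout these examples, create_cluster returns a long-running operation rather than a finished cluster, so the cluster variable above actually holds a google.api_core Operation. A minimal sketch of how the handler could wait for provisioning to finish (an illustration, not part of the original view):

        # The operation's result() blocks until the cluster is provisioned
        # (or raises if provisioning fails) and returns the Cluster proto.
        created_cluster = cluster.result()
        print("Cluster created: {}".format(created_cluster.cluster_name))
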
Example #2
def main(project_id,
         zone,
         cluster_name,
         bucket_name,
         pyspark_file=None,
         create_new_cluster=True,
         global_region=True):

    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone, region,
                           cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id, region)

        (cluster_id,
         output_bucket) = (get_cluster_id_by_name(dataproc_cluster_client,
                                                  project_id, region,
                                                  cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
        spark_file.close()
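
main() above depends on helper functions defined elsewhere in the same sample (get_pyspark_file, create_cluster, submit_pyspark_job, and so on). As a hedged sketch, the list_clusters_with_details helper it calls could look like this with the same client:

def list_clusters_with_details(dataproc_cluster_client, project_id, region):
    """Sketch of the helper called from main(): print each cluster and its status."""
    for cluster in dataproc_cluster_client.list_clusters(project_id, region):
        print('{} - {}'.format(
            cluster.cluster_name,
            cluster.status.State.Name(cluster.status.state)))
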
Example #3
def dataproc_cluster_client(zone):
    """
    Lazily create a Dataproc ClusterController client to setup or
    tear down dataproc clusters
    """
    region = get_region_from_zone(zone)

    client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address="{}-dataproc.googleapis.com:443".format(region))
    return dataproc_v1.ClusterControllerClient(client_transport)
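
Several examples here call get_region_from_zone() without showing it. A minimal sketch, assuming the usual Compute Engine zone naming scheme (region plus a one-letter suffix, e.g. us-central1-c):

def get_region_from_zone(zone):
    """Derive a region such as 'us-central1' from a zone such as 'us-central1-c'."""
    try:
        return '-'.join(zone.split('-')[:-1])
    except (AttributeError, IndexError, ValueError):
        raise ValueError('Invalid zone provided, please check your input.')
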
Example #4
def hello_pubsub(event, context):
    cluster_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address='us-central1-dataproc.googleapis.com:443')
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        cluster_transport)
    project_id = ''
    region = 'us-central1'
    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(pubsub_message)
    cluster_name = data['resource']['labels']['cluster_name']
    print(cluster_name + " dataproc cluster created")
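
hello_pubsub() reads resource.labels.cluster_name from the decoded message, which suggests a Cloud Logging sink exported to Pub/Sub as the trigger. A hypothetical payload showing only the fields the function actually reads:

import base64
import json

# Hypothetical decoded payload; real log entries carry many more fields.
sample_payload = {
    'resource': {
        'labels': {
            'cluster_name': 'example-cluster'
        }
    }
}
# The Cloud Function receives it base64-encoded under event['data'].
sample_event = {
    'data': base64.b64encode(json.dumps(sample_payload).encode('utf-8'))
}
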
Example #5
    def dataproc_cluster_client(self):
        """
        Lazily create a Dataproc ClusterController client to setup or
        tear down dataproc clusters
        """

        if self._dataproc_cluster_client is None:
            client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(self._region))
            self._dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
                client_transport)
        return self._dataproc_cluster_client
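
The lazy property above relies on self._region and self._dataproc_cluster_client being initialized by its (unshown) class. A hypothetical skeleton, with the class name invented for illustration:

class DataprocClusterManager(object):
    """Hypothetical owner of the lazily created Dataproc client shown above."""

    def __init__(self, region):
        self._region = region
        self._dataproc_cluster_client = None
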
Example #6
def main(project_id, region):

    if region == "global":
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
    else:
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region)
        )
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(client_transport)

    list_clusters(dataproc_cluster_client, project_id, region)
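
This main() closely resembles the official Dataproc list_clusters sample, which is normally driven from the command line. A hedged sketch of such a wrapper (flag names are assumptions):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='List Dataproc clusters.')
    parser.add_argument('--project_id', required=True)
    parser.add_argument('--region', default='global')
    args = parser.parse_args()
    main(args.project_id, args.region)
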
Example #7
    def post(self, request):
        print("List of clusters initiated ......")
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\t\\keys.json"
        region = request.POST["region"]
        project_id = "deepak-cloud-trail"
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))

        dataproc_client = dataproc_v1.ClusterControllerClient(client_transport)

        list_clusters = dataproc_client.list_clusters(project_id, region)
        print(list_clusters)
        for cluster in list_clusters:
            print("$$$$$$$$", cluster.cluster_name)
Example #8
def set_cluster_clients():
    global dataproc_cluster_client, dataproc_job_client

    if not dataproc_cluster_client or not dataproc_job_client:
        region = os.environ[GCP_REGION]
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    return dataproc_cluster_client, dataproc_job_client
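
set_cluster_clients() caches its clients in module-level globals and reads the region from an environment variable, so the surrounding module presumably declares something like the following (names inferred from the function body; the env var name is an assumption):

GCP_REGION = 'GCP_REGION'          # environment variable holding the Dataproc region
dataproc_cluster_client = None     # cached ClusterControllerClient
dataproc_job_client = None         # cached JobControllerClient
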
Example #9
def update_firewall_rule(event, context):
    import base64
    import json
    from google.cloud import dataproc_v1
    from google.cloud.dataproc_v1.gapic.transports import cluster_controller_grpc_transport
    from googleapiclient.discovery import build


    project_id = 'playground'
    firewall = 'test2'
    region = 'us-central1'
    zone = 'us-central1-c'
    ip = []
    cluster_name = ''
    network = 'global/networks/test'

    compute = build('compute', 'v1')
    cluster_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address='us-central1-dataproc.googleapis.com:443')
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        cluster_transport)

    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(pubsub_message)
    cluster_name = data['resource']['labels']['cluster_name']

    cluster = dataproc_cluster_client.get_cluster(project_id, region,
                                                  cluster_name)
    master_nodes = list(cluster.config.master_config.instance_names)
    worker_nodes = list(cluster.config.worker_config.instance_names)

    result = compute.instances().list(project=project_id, zone=zone).execute()
    for instance in result["items"]:
        if instance['name'] in master_nodes or instance['name'] in worker_nodes:
            ip.append(
                instance['networkInterfaces'][0]['accessConfigs'][0]['natIP'])

    firewall_body = {
        'sourceRanges': ip,
        'allowed': [{'IPProtocol': 'tcp', 'ports': ['22']}],
        'network': network
    }
    request = compute.firewalls().update(project=project_id, firewall=firewall,
                                         body=firewall_body)
    response = request.execute()
Example #10
    def __init__(
        self,
        transport=None,
        channel=None,
        credentials=None,
        client_config=None,
        client_info=None,
        client_options=None,
    ):
        """Constructor.

        Args:
            transport (Union[~.ClusterControllerGrpcTransport,
                    Callable[[~.Credentials, type], ~.ClusterControllerGrpcTransport]): A transport
                instance, responsible for actually making the API calls.
                The default transport uses the gRPC protocol.
                This argument may also be a callable which returns a
                transport instance. Callables will be sent the credentials
                as the first argument and the default transport class as
                the second argument.
            channel (grpc.Channel): DEPRECATED. A ``Channel`` instance
                through which to make calls. This argument is mutually exclusive
                with ``credentials``; providing both will raise an exception.
            credentials (google.auth.credentials.Credentials): The
                authorization credentials to attach to requests. These
                credentials identify this application to the service. If none
                are specified, the client will attempt to ascertain the
                credentials from the environment.
                This argument is mutually exclusive with providing a
                transport instance to ``transport``; doing so will raise
                an exception.
            client_config (dict): DEPRECATED. A dictionary of call options for
                each method. If not specified, the default configuration is used.
            client_info (google.api_core.gapic_v1.client_info.ClientInfo):
                The client info used to send a user-agent string along with
                API requests. If ``None``, then default info will be used.
                Generally, you only need to set this if you're developing
                your own client library.
            client_options (Union[dict, google.api_core.client_options.ClientOptions]):
                Client options used to set user options on the client. API Endpoint
                should be set through client_options.
        """
        # Raise deprecation warnings for things we want to go away.
        if client_config is not None:
            warnings.warn(
                "The `client_config` argument is deprecated.",
                PendingDeprecationWarning,
                stacklevel=2,
            )
        else:
            client_config = cluster_controller_client_config.config

        if channel:
            warnings.warn(
                "The `channel` argument is deprecated; use "
                "`transport` instead.",
                PendingDeprecationWarning,
                stacklevel=2,
            )

        api_endpoint = self.SERVICE_ADDRESS
        if client_options:
            if type(client_options) == dict:
                client_options = google.api_core.client_options.from_dict(
                    client_options)
            if client_options.api_endpoint:
                api_endpoint = client_options.api_endpoint

        # Instantiate the transport.
        # The transport is responsible for handling serialization and
        # deserialization and actually sending data to the service.
        if transport:
            if callable(transport):
                self.transport = transport(
                    credentials=credentials,
                    default_class=cluster_controller_grpc_transport.
                    ClusterControllerGrpcTransport,
                    address=api_endpoint,
                )
            else:
                if credentials:
                    raise ValueError(
                        "Received both a transport instance and "
                        "credentials; these are mutually exclusive.")
                self.transport = transport
        else:
            self.transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address=api_endpoint, channel=channel, credentials=credentials)

        if client_info is None:
            client_info = google.api_core.gapic_v1.client_info.ClientInfo(
                gapic_version=_GAPIC_LIBRARY_VERSION)
        else:
            client_info.gapic_version = _GAPIC_LIBRARY_VERSION
        self._client_info = client_info

        # Parse out the default settings for retry and timeout for each RPC
        # from the client configuration.
        # (Ordinarily, these are the defaults specified in the `*_config.py`
        # file next to this one.)
        self._method_configs = google.api_core.gapic_v1.config.parse_method_configs(
            client_config["interfaces"][self._INTERFACE_NAME])

        # Save a dictionary of cached API call functions.
        # These are the actual callables which invoke the proper
        # transport methods, wrapped with `wrap_method` to add retry,
        # timeout, and the like.
        self._inner_api_calls = {}
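
Per the docstring above, the API endpoint should be set through client_options rather than by building a gRPC transport by hand. A minimal usage sketch for a regional endpoint:

# Roughly equivalent to passing a regional ClusterControllerGrpcTransport explicitly.
client = dataproc_v1.ClusterControllerClient(
    client_options={'api_endpoint': 'us-central1-dataproc.googleapis.com:443'})
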
Example #11
def dataproc_get_client(region):

    transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
        address="{0}-dataproc.googleapis.com:443".format(region))
    return dataproc_v1.ClusterControllerClient(transport)
Example #12
import base64
import json
from google.cloud import dataproc_v1
from google.cloud.dataproc_v1.gapic.transports import cluster_controller_grpc_transport
from googleapiclient.discovery import build

project_id = 'playground-s-11-b2c3df'
firewall = 'test2'
region = 'us-central1'
zone = 'us-central1-c'
ip = []
cluster_name = 'cluster-ec21'
network = 'global/networks/test'

compute = build('compute', 'v1')
cluster_transport = cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
    address='us-central1-dataproc.googleapis.com:443')
dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
    cluster_transport)

cluster = dataproc_cluster_client.get_cluster(project_id, region, cluster_name)
master_nodes = list(cluster.config.master_config.instance_names)
worker_nodes = list(cluster.config.worker_config.instance_names)

result = compute.instances().list(project=project_id, zone=zone).execute()
for instance in result["items"]:
    if instance['name'] in master_nodes or instance['name'] in worker_nodes:
        ip.append(
            instance['networkInterfaces'][0]['accessConfigs'][0]['natIP'])

firewall_body = {
    'sourceRanges': ip,
    'allowed': [{'IPProtocol': 'tcp', 'ports': ['22']}],
    'network': network
}
request = compute.firewalls().update(project=project_id, firewall=firewall,
                                     body=firewall_body)
response = request.execute()
Example #13
def trigger_dataproc_jobs(message, context):
    """ Entry point for the CloudFunction
    Captures a Pubsub message from the configured source topic and constructs a
    Dataproc Inline Workflow request to run the jobs specified in the message request.

    message: the Pubsub message
    context: the Cloud Function context information
    """

    if 'data' not in message:
        print("no data in the Pubsub message, nothing to do...")
        return
    event = json.loads(base64.b64decode(message['data']).decode('utf-8'))

    if "jobs" not in event:
        print("jobs property not present in the event, no work to be done...")
        return

    # initialize needed GCP clients
    wf_client_transport = (
        workflow_template_service_grpc_transport.
        WorkflowTemplateServiceGrpcTransport(
            address="{}-dataproc.googleapis.com:443".format(region_id)))
    dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient(
        wf_client_transport)
    dp_client_transport = (
        cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(region_id)))
    dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
        dp_client_transport)
    storage_client = storage.Client()

    # retrieves the cloud function configuration for storage
    config = retrieve_configuration(storage_client)

    # build parent region path for dataproc api requests
    parent = dataproc_workflow_client.region_path(project_id, region_id)

    # extract events parameters
    zone = event.get('zone', zone_id)
    job_name = event.get('job_name', 'dataproc-workflow-test')
    template_name = "projects/{}/regions/{}/workflowTemplates/{}".format(
        project_id, region_id, job_name)
    cluster_name = 'cluster-' + job_name
    cluster_init_actions = event.get('cluster_init_actions', [])
    request_id = event.get('request_id', template_name.replace('/', '_'))
    job_labels = event.get('labels', {})
    job_labels['job_name'] = job_name
    job_labels['request_id'] = request_id
    req_metadata = event.get('metadata', {})

    # Check whether another cluster with the same labels is already running;
    # randomizing the wait time improves the chances of catching duplicated requests.
    time.sleep(random.randint(1, 5))
    for cluster in dataproc_cluster_client.list_clusters(
            project_id, region_id,
            'labels.job_name = {} AND labels.request_id = {}'.format(
                job_name, request_id)):
        print(
            "workflow instance already running for same pair job_name and request_id ({},{}), exiting"
            .format(job_name, request_id))
        return

    if not isinstance(cluster_init_actions, list):
        print("cluster initialization actions should be a list")
        return

    # Check the function's configuration for an entry matching the requested job name;
    # fall back to the default cluster configuration if none exists.
    if job_name not in config:
        cluster_config = config['default_cluster_config']
    else:
        cluster_config = config[job_name]

    cluster_config['labels'] = {**cluster_config['labels'], **job_labels}
    cluster_config['cluster_name'] = cluster_name
    cluster_config['config']['gce_cluster_config']['metadata'] = {
        **cluster_config['config']['gce_cluster_config']['metadata'],
        **req_metadata
    }
    cluster_config['config']['gce_cluster_config']['zone_uri'] = zone
    cluster_config['config']['initialization_actions'] = cluster_config[
        'config']['initialization_actions'] + cluster_init_actions
    for action in cluster_config['config']['initialization_actions']:
        if 'execution_timeout' in action:
            timeout = Duration(seconds=action['execution_timeout'])
            action['execution_timeout'] = timeout

    # creates inline template request
    inline_template = {
        'name': template_name,
        'placement': {
            'managed_cluster': cluster_config
        },
        'jobs': event['jobs']
    }

    # sends the request to instantiate the workflow inlined template
    response = dataproc_workflow_client.instantiate_inline_workflow_template(
        parent,
        inline_template,
        request_id=request_id,
        metadata=[('job_name', job_name)])

    # captures operation name for the execution's metadata along with other request parameters
    metadata = {
        'operation_name': response.operation.name,
        'template_name': template_name,
        'cluster_name': cluster_name
    }

    print('workflow instance created, request id {}, operation\'s name: {}'.
          format(request_id, metadata['operation_name']))

    # Sets the future to be called when the workflow execution completes.
    # This partial function gets populated with local information, normally
    # not present at the callback execution time, to enrich logging and event
    # results propagation.
    response.add_done_callback(partial(execution_callback, metadata=metadata))
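
For reference, a hypothetical request payload that trigger_dataproc_jobs() would accept (the decoded contents of message['data']), showing only the keys the function reads; all values are illustrative:

example_request = {
    'jobs': [{
        'step_id': 'step-1',
        'pyspark_job': {'main_python_file_uri': 'gs://some-bucket/job.py'}
    }],
    'zone': 'us-central1-c',
    'job_name': 'dataproc-workflow-test',
    'request_id': 'run-2020-01-01',
    'labels': {'team': 'data-eng'},
    'metadata': {'input_path': 'gs://some-bucket/input/'},
    'cluster_init_actions': []
}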