def http_request(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    region = "europe-west1"
    project = "big-data-keepcoding"
    cluster_name = "kc-airbnb-cluster"

    create_cluster(project, region, cluster_name)

    job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
        address='{}-dataproc.googleapis.com:443'.format(region)))
    dataproc_job_client = dataproc.JobControllerClient(job_transport)

    job_id = submit_job(dataproc_job_client, project, region, cluster_name,
                        "kc-airbnb", "sql/load_data.sql")

    wait_for_job(dataproc_job_client, project, region, job_id)

    job_id = submit_job(dataproc_job_client, project, region, cluster_name,
                        "kc-airbnb", "sql/compute_recommendations.sql")

    wait_for_job(dataproc_job_client, project, region, job_id)

    send_message()

    delete_cluster(project, region, cluster_name)

    return 'OK'
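The helper functions called above (create_cluster, submit_job, wait_for_job, send_message, delete_cluster) are not included in this snippet. A minimal sketch of submit_job, assuming it submits a Hive job whose SQL script lives in the given bucket (signature taken from the calls above; the job body is modeled on the Hive-job example further down this page, not on the original implementation):

def submit_job(dataproc_job_client, project, region, cluster_name,
               bucket_name, query_file):
    # Sketch only: build a Hive job pointing at gs://<bucket>/<query_file>
    # and submit it to the named cluster.
    job_details = {
        'placement': {'cluster_name': cluster_name},
        'hive_job': {
            'query_file_uri': 'gs://{}/{}'.format(bucket_name, query_file)
        }
    }
    result = dataproc_job_client.submit_job(
        project_id=project, region=region, job=job_details)
    print('Submitted job ID {}.'.format(result.reference.job_id))
    return result.reference.job_id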
def main(project_id,
         zone,
         cluster_name,
         bucket_name,
         pyspark_file=None,
         create_new_cluster=True,
         global_region=True):

    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone, region,
                           cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id, region)

        (cluster_id,
         output_bucket) = (get_cluster_id_by_name(dataproc_cluster_client,
                                                  project_id, region,
                                                  cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
            spark_file.close()
    def dataproc_job_client(self):
        """
        Lazily obtain a GCP Dataproc JobController client
        """
        if self._dataproc_job_client is None:
            job_transport = job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(self._region))
            self._dataproc_job_client = dataproc_v1.JobControllerClient(
                job_transport)
        return self._dataproc_job_client
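The accessor above reads like a lazily cached property on a wrapper class. A minimal sketch of how it might be embedded, assuming the pre-2.0 google-cloud-dataproc import paths used elsewhere on this page (the class name, constructor, and @property decorator are assumptions; the snippet omits them):

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1.gapic.transports import job_controller_grpc_transport


class DataprocJobHelper:  # hypothetical class name
    def __init__(self, region):
        self._region = region
        self._dataproc_job_client = None

    @property
    def dataproc_job_client(self):
        """Lazily obtain a GCP Dataproc JobController client."""
        if self._dataproc_job_client is None:
            job_transport = job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(self._region))
            self._dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
        return self._dataproc_job_client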
def submit_job():
    job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
        address='{}-dataproc.googleapis.com:443'.format('europe-west1')))
    job_details = {
        'placement': {
            'cluster_name': 'dataproc-bda'
        },
        'hive_job': {
            'query_file_uri': 'gs://{}/{}'.format(
                'bda5-keepcoding-ricardo1', 'scripts/query_lat.txt')
        }
    }
    dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)

    result = dataproc_job_client.submit_job(
        project_id='big-data-architecture-ricardo',
        region='europe-west1',
        job=job_details)
    job_id = result.reference.job_id
    print('Submitted job ID {}.'.format(job_id))
def set_cluster_clients():
    global dataproc_cluster_client, dataproc_job_client

    if not dataproc_cluster_client or not dataproc_job_client:
        region = os.environ[GCP_REGION]
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    return dataproc_cluster_client, dataproc_job_client
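For set_cluster_clients to work, both globals must already exist at module level before the first call (otherwise the `if not ...` check raises NameError). A minimal usage sketch under that assumption; GCP_REGION is assumed to hold the environment-variable name read above, and the project and region values are placeholders:

import os

GCP_REGION = "GCP_REGION"          # assumed env-var key
dataproc_cluster_client = None     # module-level defaults expected by the helper
dataproc_job_client = None

os.environ[GCP_REGION] = "europe-west1"
cluster_client, job_client = set_cluster_clients()
for cluster in cluster_client.list_clusters("my-project", "europe-west1"):
    print(cluster.cluster_name)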
def loaded():
    if request.method == 'POST':
        # Collect the uploaded file names; the first path component names the
        # input folder in the staging bucket.
        fileArr = []
        files = request.files.getlist('file')
        for f in files:
            fileArr.append(f.filename)
        folder = fileArr[0].split("/")

        # Dataproc API
        transport = job_controller_grpc_transport.JobControllerGrpcTransport(
            address='us-west1-dataproc.googleapis.com:443')
        project_id = 'imperial-sphere-273422'
        region = 'us-west1'
        # Define Job arguments:
        job_args = []
        job_args.append(
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/' +
            folder[0])
        job_args.append(
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/new')
        job_client = dataproc_v1.JobControllerClient(transport)
        # Create Hadoop Job
        hadoop_job = dataproc_v1.types.HadoopJob(
            jar_file_uris=[
                'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/JAR/invertedindex.jar'
            ],
            main_class='InvertedIndex',
            args=job_args)
        # Define Remote cluster to send Job
        job_placement = dataproc_v1.types.JobPlacement()
        job_placement.cluster_name = 'cluster-f010'
        # Define Job configuration
        main_job = dataproc_v1.types.Job(hadoop_job=hadoop_job,
                                         placement=job_placement)
        # Send job
        result = job_client.submit_job(project_id, region, main_job)
        job_id = result.reference.job_id
        """Wait for job to complete or error out."""
        while True:
            job = job_client.get_job(project_id, region, job_id)
            if job.status.State.Name(job.status.state) == 'DONE':
                return render_template("loaded.html")
    return render_template("loaded.html")
    def __init__(self, bucket, zone, cluster, project_id, platform, job_path='jobs-root',
                 use_cloud_engine_credentials=False):
        self.__bucket = bucket
        self.__jobs_path = job_path
        self.__zone = zone
        self.__cluster = cluster
        self.__project_id = project_id
        self.__region = None
        self.__cluster_uuid = None
        self.__platform = platform

        if self.__platform == 'GCP':
            if self.__zone == 'global':
                self.__region = self.__zone
            else:
                self.__region = self.get_region_from_zone(self.__zone)

            credentials = None
            if use_cloud_engine_credentials:
                credentials = compute_engine.Credentials()
            if cluster is None and job_path is None:
                self._cloudml = discovery.build('ml', 'v1', credentials=credentials)
            else:
                if self.__zone == 'global':
                    self._dataproc_job_client = dataproc_v1.JobControllerClient(credentials=credentials)
                else:
                    job_transport = (
                        job_controller_grpc_transport.JobControllerGrpcTransport(
                            address='{}-dataproc.googleapis.com:443'.format(self.__region),
                            credentials=credentials))
                    self._dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
        else:
            self._session = boto3.Session()
            self._sm_session = sagemaker.Session()
            if not use_cloud_engine_credentials:
                self._role = sagemaker.get_execution_role()
            else:
                self._role = use_cloud_engine_credentials
    def __init__(self,
                 transport=None,
                 channel=None,
                 credentials=None,
                 client_config=job_controller_client_config.config,
                 client_info=None):
        """Constructor.

        Args:
            transport (Union[~.JobControllerGrpcTransport,
                    Callable[[~.Credentials, type], ~.JobControllerGrpcTransport]): A transport
                instance, responsible for actually making the API calls.
                The default transport uses the gRPC protocol.
                This argument may also be a callable which returns a
                transport instance. Callables will be sent the credentials
                as the first argument and the default transport class as
                the second argument.
            channel (grpc.Channel): DEPRECATED. A ``Channel`` instance
                through which to make calls. This argument is mutually exclusive
                with ``credentials``; providing both will raise an exception.
            credentials (google.auth.credentials.Credentials): The
                authorization credentials to attach to requests. These
                credentials identify this application to the service. If none
                are specified, the client will attempt to ascertain the
                credentials from the environment.
                This argument is mutually exclusive with providing a
                transport instance to ``transport``; doing so will raise
                an exception.
            client_config (dict): DEPRECATED. A dictionary of call options for
                each method. If not specified, the default configuration is used.
            client_info (google.api_core.gapic_v1.client_info.ClientInfo):
                The client info used to send a user-agent string along with
                API requests. If ``None``, then default info will be used.
                Generally, you only need to set this if you're developing
                your own client library.
        """
        # Raise deprecation warnings for things we want to go away.
        if client_config:
            warnings.warn('The `client_config` argument is deprecated.',
                          PendingDeprecationWarning)
        if channel:
            warnings.warn(
                'The `channel` argument is deprecated; use '
                '`transport` instead.', PendingDeprecationWarning)

        # Instantiate the transport.
        # The transport is responsible for handling serialization and
        # deserialization and actually sending data to the service.
        if transport:
            if callable(transport):
                self.transport = transport(
                    credentials=credentials,
                    default_class=job_controller_grpc_transport.
                    JobControllerGrpcTransport,
                )
            else:
                if credentials:
                    raise ValueError(
                        'Received both a transport instance and '
                        'credentials; these are mutually exclusive.')
                self.transport = transport
        else:
            self.transport = job_controller_grpc_transport.JobControllerGrpcTransport(
                address=self.SERVICE_ADDRESS,
                channel=channel,
                credentials=credentials,
            )

        if client_info is None:
            client_info = google.api_core.gapic_v1.client_info.ClientInfo(
                gapic_version=_GAPIC_LIBRARY_VERSION, )
        else:
            client_info.gapic_version = _GAPIC_LIBRARY_VERSION
        self._client_info = client_info

        # Parse out the default settings for retry and timeout for each RPC
        # from the client configuration.
        # (Ordinarily, these are the defaults specified in the `*_config.py`
        # file next to this one.)
        self._method_configs = google.api_core.gapic_v1.config.parse_method_configs(
            client_config['interfaces'][self._INTERFACE_NAME], )

        # Save a dictionary of cached API call functions.
        # These are the actual callables which invoke the proper
        # transport methods, wrapped with `wrap_method` to add retry,
        # timeout, and the like.
        self._inner_api_calls = {}
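The docstring above describes mutually exclusive ways to configure the client. A brief sketch of each, using the pre-2.0 google-cloud-dataproc import paths that the rest of this page relies on (region and credentials values are placeholders):

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1.gapic.transports import job_controller_grpc_transport

# 1. Defaults: global endpoint, credentials discovered from the environment.
client = dataproc_v1.JobControllerClient()

# 2. Explicit credentials on the default endpoint (mutually exclusive with
#    passing a transport instance).
# client = dataproc_v1.JobControllerClient(credentials=my_credentials)

# 3. A preconfigured transport, e.g. a regional endpoint.
transport = job_controller_grpc_transport.JobControllerGrpcTransport(
    address='europe-west1-dataproc.googleapis.com:443')
client = dataproc_v1.JobControllerClient(transport)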
def wait_for_job(dataproc, project, region, job_id):
    """Wait for the job to complete or error out, then return it."""
    while True:
        job = dataproc.get_job(project, region, job_id)
        # Handle exceptions
        if job.status.State.Name(job.status.state) == 'ERROR':
            raise Exception(job.status.details)
        elif job.status.State.Name(job.status.state) == 'DONE':
            print('Job finished.')
            return job


# TODO: fill in the project, region, cluster, and bucket details below.
project = 'enter project id'
region = 'enter region of cluster'
cluster_name = 'enter cluster name'
bucket_name = 'enter bucket name'

job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
    address='{}-dataproc.googleapis.com:443'.format(region)))
dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)

##GUI Code

import PySimpleGUI as sg

sg.change_look_and_feel('Light Blue 2')
layout = [[sg.Text('Select Offline to train and Online to Predict')],
          [
              sg.Text('Mode', size=(15, 1)),
              sg.Drop(values=('Offline', 'Online'), auto_size_text=True)
          ], [sg.Text('Enter data path')],
          [sg.Text('File Path:', size=(8, 1)),
           sg.Input(),
           sg.FileBrowse()], [sg.Text('Enter data table name')],
def getDataProcClient(region):
    print("Connecting to DataProc")
    job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
        address='{}-dataproc.googleapis.com:443'.format(region)))
    dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    return dataproc_job_client
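A short usage sketch for getDataProcClient, following the submit-and-poll pattern used elsewhere on this page (the project, cluster, and GCS paths are placeholders):

client = getDataProcClient('europe-west1')
job = {
    'placement': {'cluster_name': 'my-cluster'},
    'pyspark_job': {'main_python_file_uri': 'gs://my-bucket/jobs/wordcount.py'},
}
result = client.submit_job(project_id='my-project', region='europe-west1', job=job)
print('Submitted job ID {}.'.format(result.reference.job_id))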