def http_request(request):
    """Run the full Airbnb batch pipeline in response to any HTTP trigger.

    Creates a Dataproc cluster, runs the data-load and recommendation Hive
    scripts in sequence, sends a notification, then tears the cluster down.

    Args:
        request (flask.Request): HTTP request object (used only as a trigger).

    Returns:
        str: 'OK' once the pipeline has completed.
    """
    region = "europe-west1"
    project = "big-data-keepcoding"
    cluster_name = "kc-airbnb-cluster"

    create_cluster(project, region, cluster_name)

    # Regional gRPC endpoint for the job controller.
    transport = job_controller_grpc_transport.JobControllerGrpcTransport(
        address='{}-dataproc.googleapis.com:443'.format(region))
    dataproc_job_client = dataproc.JobControllerClient(transport)

    # Run both Hive scripts sequentially, waiting for each to finish.
    for script in ("sql/load_data.sql", "sql/compute_recommendations.sql"):
        job_id = submit_job(dataproc_job_client, project, region,
                            cluster_name, "kc-airbnb", script)
        wait_for_job(dataproc_job_client, project, region, job_id)

    send_message()
    delete_cluster(project, region, cluster_name)
    return 'OK'
def main(project_id, zone, cluster_name, bucket_name,
         pyspark_file=None, create_new_cluster=True, global_region=True):
    """Run a PySpark job on Dataproc, optionally creating the cluster.

    Args:
        project_id: GCP project id.
        zone: Compute zone (used to derive the regional endpoint when
            ``global_region`` is False).
        cluster_name: Name of the Dataproc cluster to use or create.
        bucket_name: GCS bucket for staging the PySpark file and job output.
        pyspark_file: Optional path to the PySpark file; a default is used
            by ``get_pyspark_file`` when None.
        create_new_cluster: When True, create the cluster first and delete
            it afterwards.
        global_region: When True, use the default global gRPC endpoints.

    Returns:
        The downloaded job output text.
    """
    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    # BUG FIX: resolve the PySpark file *before* entering try/finally.
    # Previously this call sat inside the try; if it raised, the finally
    # block hit a NameError on `spark_file.close()` (and tried to delete a
    # cluster that was never created), masking the original exception.
    spark_file, spark_filename = get_pyspark_file(pyspark_file)
    try:
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone,
                           region, cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id,
                                   region)

        (cluster_id, output_bucket) = (
            get_cluster_id_by_name(dataproc_cluster_client, project_id,
                                   region, cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name,
                                    spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket,
                                 job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
        spark_file.close()
def dataproc_job_client(self):
    """Lazily construct and cache a GCP Dataproc JobController client.

    The client is built on first access against the regional gRPC endpoint
    derived from ``self._region`` and reused on subsequent calls.
    """
    if self._dataproc_job_client is None:
        endpoint = "{}-dataproc.googleapis.com:443".format(self._region)
        transport = job_controller_grpc_transport.JobControllerGrpcTransport(
            address=endpoint)
        self._dataproc_job_client = dataproc_v1.JobControllerClient(transport)
    return self._dataproc_job_client
def submit_job():
    """Submit the latitude Hive query to the 'dataproc-bda' cluster.

    Builds a regional job transport for europe-west1, submits the Hive job
    whose query file lives in GCS, and prints the resulting job id.
    """
    region = 'europe-west1'
    transport = job_controller_grpc_transport.JobControllerGrpcTransport(
        address='{}-dataproc.googleapis.com:443'.format(region))
    client = dataproc_v1.JobControllerClient(transport)

    job_details = {
        'placement': {
            'cluster_name': 'dataproc-bda',
        },
        'hive_job': {
            'query_file_uri': 'gs://{}/{}'.format(
                'bda5-keepcoding-ricardo1', 'scripts/query_lat.txt'),
        },
    }

    result = client.submit_job(
        project_id='big-data-architecture-ricardo',
        region='europe-west1',
        job=job_details)
    print('Submitted job ID {}.'.format(result.reference.job_id))
def set_cluster_clients():
    """Initialise the module-level Dataproc clients on first use.

    Reads the region from the ``GCP_REGION`` environment variable and builds
    both the cluster and job controller clients against the regional gRPC
    endpoint. Subsequent calls return the cached module-level clients.

    Returns:
        tuple: (dataproc_cluster_client, dataproc_job_client)
    """
    global dataproc_cluster_client, dataproc_job_client
    if not (dataproc_cluster_client and dataproc_job_client):
        region = os.environ[GCP_REGION]
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        address = "{}-dataproc.googleapis.com:443".format(region)
        cluster_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address=address))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address=address))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            cluster_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    return dataproc_cluster_client, dataproc_job_client
def loaded():
    """Handle the upload POST: submit an inverted-index Hadoop job and poll it.

    The first path component of the first uploaded file names the GCS input
    folder. Renders loaded.html when the job finishes (or on non-POST).

    Fixes: the polling loop previously busy-spun with no delay and looped
    forever if the job ended in the ERROR state (only DONE exited the loop).

    Raises:
        Exception: with the job's status details if the Dataproc job errors.
    """
    if request.method == 'POST':
        # Collect uploaded filenames; the leading path component is the
        # GCS input folder name.
        filenames = [f.filename for f in request.files.getlist('file')]
        folder = filenames[0].split("/")

        # Dataproc API
        transport = job_controller_grpc_transport.JobControllerGrpcTransport(
            address='us-west1-dataproc.googleapis.com:443')
        project_id = 'imperial-sphere-273422'
        region = 'us-west1'

        # Job arguments: input folder and output location in the staging
        # bucket.
        job_args = [
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/' + folder[0],
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/new',
        ]

        job_client = dataproc_v1.JobControllerClient(transport)

        # Hadoop job running the InvertedIndex main class from the staged JAR.
        hadoop_job = dataproc_v1.types.HadoopJob(
            jar_file_uris=[
                'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/JAR/invertedindex.jar'
            ],
            main_class='InvertedIndex',
            args=job_args)

        # Target cluster for the job.
        job_placement = dataproc_v1.types.JobPlacement()
        job_placement.cluster_name = 'cluster-f010'

        main_job = dataproc_v1.types.Job(hadoop_job=hadoop_job,
                                         placement=job_placement)

        result = job_client.submit_job(project_id, region, main_job)
        job_id = result.reference.job_id

        # Poll until the job reaches a terminal state. BUG FIX: handle
        # ERROR explicitly and sleep between polls instead of hammering
        # the API in a tight loop that never ends on failure.
        import time
        while True:
            job = job_client.get_job(project_id, region, job_id)
            state = job.status.State.Name(job.status.state)
            if state == 'ERROR':
                raise Exception(job.status.details)
            if state == 'DONE':
                return render_template("loaded.html")
            time.sleep(1)
    return render_template("loaded.html")
def __init__(self, bucket, zone, cluster, project_id, platform,
             job_path='jobs-root', use_cloud_engine_credentials=False):
    """Configure a job-submission wrapper for GCP (Dataproc/CloudML) or AWS.

    Args:
        bucket: Storage bucket for job artifacts.
        zone: GCP zone, or the literal 'global'.
        cluster: Cluster name; with ``job_path`` None selects CloudML mode.
        project_id: GCP project id.
        platform: 'GCP' selects Google clients; anything else selects
            boto3/SageMaker.
        job_path: Root path for job artifacts (default 'jobs-root').
        use_cloud_engine_credentials: When truthy on GCP, use compute-engine
            credentials; on AWS, used directly as the execution role.
    """
    self.__bucket = bucket
    self.__jobs_path = job_path
    self.__zone = zone
    self.__cluster = cluster
    self.__project_id = project_id
    self.__region = None
    self.__cluster_uuid = None
    self.__platform = platform
    if self.__platform == 'GCP':
        if self.__zone == 'global':
            self.__region = self.__zone
        else:
            self.__region = self.get_region_from_zone(self.__zone)
        credentials = None
        if use_cloud_engine_credentials:
            credentials = compute_engine.Credentials()
        if cluster is None and job_path is None:
            self._cloudml = discovery.build('ml', 'v1',
                                            credentials=credentials)
        else:
            # BUG FIX: this read `self.zone`, but the attribute set above is
            # the name-mangled `self.__zone` (as the region-resolution branch
            # reads); `self.zone` would raise AttributeError unless a `zone`
            # property exists elsewhere — NOTE(review): confirm no such
            # property is defined on this class.
            if self.__zone == 'global':
                self._dataproc_job_client = dataproc_v1.JobControllerClient(
                    credentials=credentials)
            else:
                job_transport = (
                    job_controller_grpc_transport.JobControllerGrpcTransport(
                        address='{}-dataproc.googleapis.com:443'.format(
                            self.__region),
                        credentials=credentials))
                self._dataproc_job_client = dataproc_v1.JobControllerClient(
                    job_transport)
    else:
        self._session = boto3.Session()
        self._sm_session = sagemaker.Session()
        if not use_cloud_engine_credentials:
            self._role = sagemaker.get_execution_role()
        else:
            self._role = use_cloud_engine_credentials
def __init__(self, transport=None, channel=None, credentials=None,
             client_config=job_controller_client_config.config,
             client_info=None):
    """Constructor.

    Args:
        transport (Union[~.JobControllerGrpcTransport,
                Callable[[~.Credentials, type], ~.JobControllerGrpcTransport]):
            A transport instance responsible for actually making the API
            calls (gRPC by default), or a callable that receives the
            credentials and the default transport class and returns one.
        channel (grpc.Channel): DEPRECATED. A ``Channel`` through which to
            make calls; mutually exclusive with ``credentials``.
        credentials (google.auth.credentials.Credentials): Authorization
            credentials to attach to requests. Ascertained from the
            environment when omitted; mutually exclusive with passing a
            transport instance.
        client_config (dict): DEPRECATED. Per-method call options; the
            default configuration is used when not given.
        client_info (google.api_core.gapic_v1.client_info.ClientInfo):
            Client info for the user-agent string; defaults are used when
            ``None``. Only needed when developing your own client library.
    """
    # Raise deprecation warnings for things we want to go away.
    if client_config:
        warnings.warn('The `client_config` argument is deprecated.',
                      PendingDeprecationWarning)
    if channel:
        warnings.warn(
            'The `channel` argument is deprecated; use '
            '`transport` instead.', PendingDeprecationWarning)

    # The transport handles (de)serialization and the actual sends.
    if transport:
        if callable(transport):
            # Callable factory: hand it the credentials and the default
            # transport class.
            self.transport = transport(
                credentials=credentials,
                default_class=(
                    job_controller_grpc_transport.JobControllerGrpcTransport),
            )
        else:
            if credentials:
                raise ValueError(
                    'Received both a transport instance and '
                    'credentials; these are mutually exclusive.')
            self.transport = transport
    else:
        # No transport supplied: build the default gRPC transport.
        self.transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address=self.SERVICE_ADDRESS,
                channel=channel,
                credentials=credentials,
            ))

    # Stamp the library version onto the client info for the user agent.
    if client_info is None:
        client_info = google.api_core.gapic_v1.client_info.ClientInfo(
            gapic_version=_GAPIC_LIBRARY_VERSION)
    else:
        client_info.gapic_version = _GAPIC_LIBRARY_VERSION
    self._client_info = client_info

    # Parse per-RPC retry/timeout defaults from the client configuration
    # (ordinarily the defaults from the adjacent `*_config.py`).
    self._method_configs = (
        google.api_core.gapic_v1.config.parse_method_configs(
            client_config['interfaces'][self._INTERFACE_NAME]))

    # Cache of wrapped transport callables (retry/timeout added lazily).
    self._inner_api_calls = {}
job = dataproc.get_job(project, region, job_id) # Handle exceptions if job.status.State.Name(job.status.state) == 'ERROR': raise Exception(job.status.details) elif job.status.State.Name(job.status.state) == 'DONE': print('Job finished.') return job #ToDo project = 'enter project id' region = 'enter region of cluster' cluster_name = 'enter cluster name' bucket_name = 'enter bucket name' job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport( address='{}-dataproc.googleapis.com:443'.format(region))) dataproc_job_client = dataproc_v1.JobControllerClient(job_transport) ##GUI Code import PySimpleGUI as sg sg.change_look_and_feel('Light Blue 2') layout = [[sg.Text('Select Offline to train and Online to Predict')], [ sg.Text('Mode', size=(15, 1)), sg.Drop(values=('Offline', 'Online'), auto_size_text=True) ], [sg.Text('Enter data path')], [sg.Text('File Path:', size=(8, 1)), sg.Input(), sg.FileBrowse()], [sg.Text('Enter data table name')],
def getDataProcClient(region):
    """Build a Dataproc JobController client for the given region.

    Args:
        region: GCP region used to form the regional gRPC endpoint.

    Returns:
        dataproc_v1.JobControllerClient bound to that endpoint.
    """
    print("Connecting to DataProc")
    endpoint = '{}-dataproc.googleapis.com:443'.format(region)
    transport = job_controller_grpc_transport.JobControllerGrpcTransport(
        address=endpoint)
    return dataproc_v1.JobControllerClient(transport)