def main(project_id,
         zone,
         cluster_name,
         bucket_name,
         pyspark_file=None,
         create_new_cluster=True,
         global_region=True):

    # [START dataproc_get_client]
    if global_region:
        region = 'global'
        # Use the default gRPC global endpoints.
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient()
        dataproc_job_client = dataproc_v1.JobControllerClient()
    else:
        region = get_region_from_zone(zone)
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address='{}-dataproc.googleapis.com:443'.format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    # [END dataproc_get_client]

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        if create_new_cluster:
            create_cluster(dataproc_cluster_client, project_id, zone, region,
                           cluster_name)
            wait_for_cluster_creation()
        upload_pyspark_file(project_id, bucket_name, spark_filename,
                            spark_file)

        list_clusters_with_details(dataproc_cluster_client, project_id, region)

        (cluster_id,
         output_bucket) = (get_cluster_id_by_name(dataproc_cluster_client,
                                                  project_id, region,
                                                  cluster_name))

        # [START dataproc_call_submit_pyspark_job]
        job_id = submit_pyspark_job(dataproc_job_client, project_id, region,
                                    cluster_name, bucket_name, spark_filename)
        # [END dataproc_call_submit_pyspark_job]

        wait_for_job(dataproc_job_client, project_id, region, job_id)
        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        if create_new_cluster:
            delete_cluster(dataproc_cluster_client, project_id, region,
                           cluster_name)
        spark_file.close()
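The helper get_region_from_zone used above is not shown on this page. A minimal sketch of what it might look like, assuming standard Compute Engine zone names of the form <region>-<letter> (e.g. us-central1-a); the exact implementation in the original sample may differ:

def get_region_from_zone(zone):
    """Derive the Dataproc region from a Compute Engine zone name.

    Assumes zone names such as 'us-central1-a', which map to the
    region 'us-central1'.
    """
    if '-' not in zone:
        raise ValueError('Unrecognized zone name: {}'.format(zone))
    return zone.rsplit('-', 1)[0]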
Example #2
    def test_submit_job(self):
        # Setup Expected Response
        driver_output_resource_uri = "driverOutputResourceUri-542229086"
        driver_control_files_uri = "driverControlFilesUri207057643"
        job_uuid = "jobUuid-1615012099"
        expected_response = {
            "driver_output_resource_uri": driver_output_resource_uri,
            "driver_control_files_uri": driver_control_files_uri,
            "job_uuid": job_uuid,
        }
        expected_response = jobs_pb2.Job(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup Request
        project_id = "projectId-1969970175"
        region = "region-934795532"
        job = {}

        response = client.submit_job(project_id, region, job)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.SubmitJobRequest(project_id=project_id,
                                                     region=region,
                                                     job=job)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
    def test_cancel_job(self):
        # Setup Expected Response
        driver_output_resource_uri = 'driverOutputResourceUri-542229086'
        driver_control_files_uri = 'driverControlFilesUri207057643'
        expected_response = {
            'driver_output_resource_uri': driver_output_resource_uri,
            'driver_control_files_uri': driver_control_files_uri
        }
        expected_response = jobs_pb2.Job(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        job_id = 'jobId-1154752291'

        response = client.cancel_job(project_id, region, job_id)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.CancelJobRequest(project_id=project_id,
                                                     region=region,
                                                     job_id=job_id)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
    def test_list_jobs(self):
        # Setup Expected Response
        next_page_token = ''
        jobs_element = {}
        jobs = [jobs_element]
        expected_response = {'next_page_token': next_page_token, 'jobs': jobs}
        expected_response = jobs_pb2.ListJobsResponse(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_jobs(project_id, region)
        resources = list(paged_list_response)
        assert len(resources) == 1

        assert expected_response.jobs[0] == resources[0]

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.ListJobsRequest(project_id=project_id,
                                                    region=region)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
def http_request(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    region = "europe-west1"
    project = "big-data-keepcoding"
    cluster_name = "kc-airbnb-cluster"

    create_cluster(project, region, cluster_name)

    job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
        address='{}-dataproc.googleapis.com:443'.format(region)))
    dataproc_job_client = dataproc.JobControllerClient(job_transport)

    job_id = submit_job(dataproc_job_client, project, region, cluster_name,
                        "kc-airbnb", "sql/load_data.sql")

    wait_for_job(dataproc_job_client, project, region, job_id)

    job_id = submit_job(dataproc_job_client, project, region, cluster_name,
                        "kc-airbnb", "sql/compute_recommendations.sql")

    wait_for_job(dataproc_job_client, project, region, job_id)

    send_message()

    delete_cluster(project, region, cluster_name)

    return 'OK'
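The create_cluster, submit_job, wait_for_job, send_message and delete_cluster helpers called by http_request are not reproduced on this page. Given that submit_job receives a bucket name and a .sql script path, a plausible sketch of that helper, assuming it submits a Hive job whose query file lives in the given GCS bucket (mirroring the hive_job dict used in Example #19 below), is:

def submit_job(dataproc_job_client, project, region, cluster_name,
               bucket_name, script_path):
    # Hypothetical helper: submit a Hive job that runs a .sql file from GCS.
    job_details = {
        'placement': {'cluster_name': cluster_name},
        'hive_job': {
            'query_file_uri': 'gs://{}/{}'.format(bucket_name, script_path)
        }
    }
    result = dataproc_job_client.submit_job(
        project_id=project, region=region, job=job_details)
    return result.reference.job_id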
    def test_list_jobs(self):
        # Setup Expected Response
        next_page_token = ''
        jobs_element = {}
        jobs = [jobs_element]
        expected_response = {'next_page_token': next_page_token, 'jobs': jobs}
        expected_response = jobs_pb2.ListJobsResponse(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        client = dataproc_v1.JobControllerClient(channel=channel)

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_jobs(project_id, region)
        resources = list(paged_list_response)
        assert len(resources) == 1

        assert expected_response.jobs[0] == resources[0]

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.ListJobsRequest(
            project_id=project_id, region=region)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #7
def submit_train_job(project_id, cluster_name, region, job_id):
    '''
    Submit batch train job
    :param project_id: The name of the project to use for creating resources.
    :param cluster_name: The name of cluster
    :param region: The name of the region
    :param job_id: The name of the job
    :return: None
    '''
    # Create the job client.
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    # Create the job config.
    # gcloud dataproc jobs submit pyspark gs://network-spark-migrate/model/train.py --cluster train-spark-demo
    # --region europe-west6 --files=gs://network-spark-migrate/model/demo-config.yml -- --configfile ./demo-config.yml
    job = {
        'reference': {
            'project_id': project_id,
            'job_id': job_id
        },
        'placement': {
            'cluster_name': cluster_name
        },
        'pyspark_job': {
            'main_python_file_uri': 'gs://network-spark-migrate/model/train.py',
            'file_uris': ['gs://network-spark-migrate/model/demo-config.yml'],
            'args': ['--configfile', './demo-config.yml']
        }
    }

    job_client.submit_job(
        request={"project_id": project_id, "region": region, "job": job}
    )
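A usage sketch for submit_train_job; the cluster name and region come from the gcloud command quoted in the comment above, while the project and job IDs below are placeholders:

if __name__ == '__main__':
    # Placeholder project and job IDs; cluster/region taken from the comment above.
    submit_train_job(project_id='my-project',
                     cluster_name='train-spark-demo',
                     region='europe-west6',
                     job_id='train-job-001')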
    def test_submit_job(self):
        # Setup Expected Response
        driver_output_resource_uri = 'driverOutputResourceUri-542229086'
        driver_control_files_uri = 'driverControlFilesUri207057643'
        expected_response = {
            'driver_output_resource_uri': driver_output_resource_uri,
            'driver_control_files_uri': driver_control_files_uri
        }
        expected_response = jobs_pb2.Job(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        client = dataproc_v1.JobControllerClient(channel=channel)

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        job = {}

        response = client.submit_job(project_id, region, job)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.SubmitJobRequest(
            project_id=project_id, region=region, job=job)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #9
    def __init__(
        self,
        cluster_name: str,
        staging_location: str,
        region: str,
        project_id: str,
    ):
        """
        Initialize a dataproc job controller client, used internally for job submission and result
        retrieval.

        Args:
            cluster_name (str):
                Dataproc cluster name.
            staging_location (str):
                GCS directory for the storage of files generated by the launcher, such as the pyspark scripts.
            region (str):
                Dataproc cluster region.
            project_id (str):
                GCP project id for the dataproc cluster.
        """

        self.cluster_name = cluster_name

        scheme, self.staging_bucket, self.remote_path, _, _, _ = urlparse(
            staging_location)
        if scheme != "gs":
            raise ValueError(
                "Only GCS staging location is supported for DataprocLauncher.")
        self.project_id = project_id
        self.region = region
        self.job_client = dataproc_v1.JobControllerClient(
            client_options={
                "api_endpoint": f"{region}-dataproc.googleapis.com:443"
            })
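Assuming the enclosing class is the DataprocLauncher mentioned in the docstring, construction might look like the following; every argument value here is a placeholder:

# Hypothetical usage; class name and argument values are assumptions.
launcher = DataprocLauncher(
    cluster_name='feature-cluster',
    staging_location='gs://my-staging-bucket/dataproc',
    region='us-central1',
    project_id='my-project',
)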
def test_spark_streaming_from_pubsublite(
        subscription: Subscription,
        dataproc_cluster: dataproc_v1.Cluster) -> None:
    # Create a Dataproc job client.
    job_client = dataproc_v1.JobControllerClient(client_options={
        "api_endpoint":
        "{}-dataproc.googleapis.com:443".format(CLOUD_REGION)
    })

    # Create the job config.
    job = {
        # Use the subscription prefix and the first four alphanumeric
        # characters of the UUID as job ID
        "reference": {
            "job_id": subscription.name.split("/")[-1][:-28]
        },
        "placement": {
            "cluster_name": dataproc_cluster.cluster_name
        },
        "pyspark_job": {
            "main_python_file_uri":
            pyfile("spark_streaming_from_pubsublite_example.py"),
            "jar_file_uris": [
                "gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar"
            ],
            "properties": {
                "spark.master": "yarn"
            },
            "logging_config": {
                "driver_log_levels": {
                    "root": LoggingConfig.Level.INFO
                }
            },
            "args": [
                f"--project_number={PROJECT_NUMBER}",
                f"--location={CLOUD_REGION}-{ZONE_ID}",
                f"--subscription_id={SUBSCRIPTION_ID}",
            ],
        },
    }

    operation = job_client.submit_job_as_operation(
        request={
            "project_id": PROJECT_ID,
            "region": CLOUD_REGION,
            "job": job,
            "request_id": "read-" + UUID,
        })
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)

    output = (storage.Client().get_bucket(matches.group(1)).blob(
        f"{matches.group(2)}.000000000").download_as_text())

    assert "Batch: 0\n" in output
Example #11
def test_clean():
    """Tests clean.py by submitting it to a Dataproc cluster"""
    # Submit job to Dataproc cluster
    job_client = dataproc.JobControllerClient(
        client_options={
            "api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"
        })
    operation = job_client.submit_job_as_operation(project_id=PROJECT_ID,
                                                   region=CLUSTER_REGION,
                                                   job=DATAPROC_JOB)

    # Wait for job to complete
    result = operation.result()

    # Get job output
    output_location = result.driver_output_resource_uri + ".000000000"
    blob = get_blob_from_path(output_location)
    out = blob.download_as_string().decode("utf-8")

    # trip duration
    assert not is_in_table(r"\d*.\d* s", out)
    assert not is_in_table(r"\d*.\d* min", out)
    assert not is_in_table(r"\d*.\d* h", out)

    # station latitude & longitude
    assert not is_in_table(r"\d+" + "\u00B0" + r"\d+\'\d+\"", out)

    assert is_in_table(r"\d*.\d*", out)

    # gender
    assert not is_in_table("M", out)
    assert not is_in_table("m", out)
    assert not is_in_table("male", out)
    assert not is_in_table("MALE", out)
    assert not is_in_table("F", out)
    assert not is_in_table("f", out)
    assert not is_in_table("female", out)
    assert not is_in_table("FEMALE", out)
    assert not is_in_table("U", out)
    assert not is_in_table("u", out)
    assert not is_in_table("unknown", out)
    assert not is_in_table("UNKNOWN", out)

    assert is_in_table("Male", out)
    assert is_in_table("Female", out)

    # customer plan
    assert not is_in_table("subscriber", out)
    assert not is_in_table("SUBSCRIBER", out)
    assert not is_in_table("sub", out)
    assert not is_in_table("customer", out)
    assert not is_in_table("CUSTOMER", out)
    assert not is_in_table("cust", out)

    assert is_in_table("Subscriber", out)
    assert is_in_table("Customer", out)
    def dataproc_job_client(self):
        """
        Lazily obtain a GCP Dataproc JobController client
        """
        if self._dataproc_job_client is None:
            job_transport = job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(self._region))
            self._dataproc_job_client = dataproc_v1.JobControllerClient(
                job_transport)
        return self._dataproc_job_client
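For context, the lazy accessor above usually lives in a class along these lines; a minimal sketch, assuming the method is exposed as a read-only property and using the pre-2.0 google-cloud-dataproc transport imports seen elsewhere on this page:

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1.gapic.transports import (
    job_controller_grpc_transport)


class DataprocJobRunner:
    # Hypothetical wrapper that creates the job client on first use.

    def __init__(self, region):
        self._region = region
        self._dataproc_job_client = None

    @property
    def dataproc_job_client(self):
        if self._dataproc_job_client is None:
            job_transport = job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(self._region))
            self._dataproc_job_client = dataproc_v1.JobControllerClient(
                job_transport)
        return self._dataproc_job_client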
    def test_list_jobs_exception(self):
        channel = ChannelStub(responses=[CustomException()])
        client = dataproc_v1.JobControllerClient(channel=channel)

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_jobs(project_id, region)
        with pytest.raises(CustomException):
            list(paged_list_response)
Example #14
def post(request):
    job_client = dataproc.JobControllerClient(
        client_options={
            'api_endpoint': 'europe-west2-dataproc.googleapis.com:443'
        })

    job = request.get_json()

    job_response = job_client.submit_job('bootcamp-bdmlv', 'europe-west2', job)
    job_id = job_response.reference.job_id

    return f'Submitted job "{job_id}".'
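The request body that post expects is a Dataproc Job resource encoded as JSON. A hypothetical payload, mirroring the job dicts used in the other examples on this page (all values are placeholders):

example_job_payload = {
    'placement': {'cluster_name': 'my-cluster'},
    'pyspark_job': {
        'main_python_file_uri': 'gs://my-bucket/jobs/etl.py'
    },
}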
    def test_submit_job_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        client = dataproc_v1.JobControllerClient(channel=channel)

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        job = {}

        with pytest.raises(CustomException):
            client.submit_job(project_id, region, job)
Example #16
    def __init__(self, bucket, zone, cluster, project_id, platform, job_path='jobs-root',
                 use_cloud_engine_credentials=False):
        self.__bucket = bucket
        self.__jobs_path = job_path
        self.__zone = zone
        self.__cluster = cluster
        self.__project_id = project_id
        self.__region = None
        self.__cluster_uuid = None
        self.__platform = platform

        if self.__platform == 'GCP':
            if self.__zone == 'global':
                self.__region = self.__zone
            else:
                self.__region = self.get_region_from_zone(self.__zone)

            credentials = None
            if use_cloud_engine_credentials:
                credentials = compute_engine.Credentials()
            if cluster is None and job_path is None:
                self._cloudml = discovery.build('ml', 'v1', credentials=credentials)
            else:
                if self.__zone == 'global':
                    self._dataproc_job_client = dataproc_v1.JobControllerClient(credentials=credentials)
                else:
                    job_transport = (
                        job_controller_grpc_transport.JobControllerGrpcTransport(
                            address='{}-dataproc.googleapis.com:443'.format(self.__region),
                            credentials=credentials))
                    self._dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
        else:
            self._session = boto3.Session()
            self._sm_session = sagemaker.Session()
            if not use_cloud_engine_credentials:
                self._role = sagemaker.get_execution_role()
            else:
                self._role = use_cloud_engine_credentials
    def test_list_jobs_exception(self):
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'

        paged_list_response = client.list_jobs(project_id, region)
        with pytest.raises(CustomException):
            list(paged_list_response)
def test_spark_streaming_to_pubsublite(topic: Topic) -> None:
    from google.cloud.dataproc_v1.types import LoggingConfig

    # Create a Dataproc job client.
    job_client = dataproc_v1.JobControllerClient(
        client_options={
            "api_endpoint": "{}-dataproc.googleapis.com:443".format(CLOUD_REGION)
        }
    )

    # Create the job config.
    job = {
        "placement": {"cluster_name": CLUSTER_ID},
        "pyspark_job": {
            "main_python_file_uri": pyfile("spark_streaming_to_pubsublite_example.py"),
            "jar_file_uris": [
                "gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar"
            ],
            "properties": {"spark.master": "yarn"},
            "logging_config": {"driver_log_levels": {"root": LoggingConfig.Level.INFO}},
            "args": [
                f"--project_number={PROJECT_NUMBER}",
                f"--location={CLOUD_REGION}-{ZONE_ID}",
                f"--topic_id={TOPIC_ID}",
            ],
        },
    }

    operation = job_client.submit_job_as_operation(
        request={
            "project_id": PROJECT_ID,
            "region": CLOUD_REGION,
            "job": job,
            "request_id": "write-" + UUID,
        }
    )
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)

    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_text()
    )

    assert "Committed 1 messages for epochId" in output
Example #19
def submit_job():
    job_transport = (
        job_controller_grpc_transport.JobControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format('europe-west1')))
    job_details = {
        'placement': {
            'cluster_name': 'dataproc-bda'
        },
        'hive_job': {
            'query_file_uri': 'gs://{}/{}'.format(
                'bda5-keepcoding-ricardo1', 'scripts/query_lat.txt')
        }
    }
    dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)

    result = dataproc_job_client.submit_job(
        project_id='big-data-architecture-ricardo',
        region='europe-west1',
        job=job_details)
    job_id = result.reference.job_id
    print('Submitted job ID {}.'.format(job_id))
Example #20
    def test_delete_job_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup request
        project_id = "projectId-1969970175"
        region = "region-934795532"
        job_id = "jobId-1154752291"

        with pytest.raises(CustomException):
            client.delete_job(project_id, region, job_id)
    def test_submit_job_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        job = {}

        with pytest.raises(CustomException):
            client.submit_job(project_id, region, job)
    def test_delete_job(self):
        channel = ChannelStub()
        client = dataproc_v1.JobControllerClient(channel=channel)

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        job_id = 'jobId-1154752291'

        client.delete_job(project_id, region, job_id)

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.DeleteJobRequest(
            project_id=project_id, region=region, job_id=job_id)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #23
def set_cluster_clients():
    global dataproc_cluster_client, dataproc_job_client

    if not dataproc_cluster_client or not dataproc_job_client:
        region = os.environ[GCP_REGION]
        # Use a regional gRPC endpoint. See:
        # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints
        client_transport = (
            cluster_controller_grpc_transport.ClusterControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        job_transport = (
            job_controller_grpc_transport.JobControllerGrpcTransport(
                address="{}-dataproc.googleapis.com:443".format(region)))
        dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
            client_transport)
        dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)
    return dataproc_cluster_client, dataproc_job_client
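On newer google-cloud-dataproc releases the same regional endpoint can be selected with client_options instead of an explicit gRPC transport, as several other examples on this page do. A minimal sketch (the region value is a placeholder):

from google.cloud import dataproc_v1

region = "us-central1"  # placeholder; use the same region as above
endpoint = {"api_endpoint": "{}-dataproc.googleapis.com:443".format(region)}
dataproc_cluster_client = dataproc_v1.ClusterControllerClient(
    client_options=endpoint)
dataproc_job_client = dataproc_v1.JobControllerClient(client_options=endpoint)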
Example #24
    def execute(self, context: bigflow.JobContext):
        logger.info("Run job %r", self.id)

        job_internal_id = self._generate_internal_jobid(context)

        client_options = {
            'api_endpoint': f"{self.gcp_region}-dataproc.googleapis.com:443"
        }
        storage_client = storage.Client(project=self.gcp_project_id)
        dataproc_job_client = dataproc_v1.JobControllerClient(
            client_options=client_options)

        driver_script = self._prepare_driver_script(context)

        logger.info("Prapare and upload python package...")
        bucket = storage_client.get_bucket(self.bucket_id)

        egg_local_path = str(
            bigflow.build.reflect.build_egg(self._project_pkg_path))
        egg_path = _upload_egg(egg_local_path, bucket, job_internal_id)

        driver_path = f"{job_internal_id}/{self.driver_filename}"
        _upload_driver_script(driver_script, bucket, driver_path)

        with self._with_temp_cluster(job_internal_id) as cluster_name:
            job = _submit_single_pyspark_job(
                dataproc_job_client=dataproc_job_client,
                project_id=self.gcp_project_id,
                region=self.gcp_region,
                cluster_name=cluster_name,
                bucket_id=self.bucket_id,
                jar_file_uris=self.jar_file_uris,
                driver_path=driver_path,
                egg_path=egg_path,
                properties=self._prepare_pyspark_properties(context),
            )
            try:
                _wait_for_job_to_finish(dataproc_job_client,
                                        self.gcp_project_id, self.gcp_region,
                                        job)
            finally:
                _print_job_output_log(storage_client, dataproc_job_client,
                                      self.gcp_project_id, self.gcp_region,
                                      job)

        logger.info("Job %r was finished", self.id)
Example #25
def loaded():
    if request.method == 'POST':
        # Get folder
        fileArr = []
        folder = []
        files = request.files.getlist('file')
        for f in files:
            fileArr.append(f.filename)
        folder = fileArr[0].split("/")

        # Dataproc API
        transport = job_controller_grpc_transport.JobControllerGrpcTransport(
            address='us-west1-dataproc.googleapis.com:443')
        project_id = 'imperial-sphere-273422'
        region = 'us-west1'
        # Define Job arguments:
        job_args = []
        job_args.append(
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/' +
            folder[0])
        job_args.append(
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/new')
        job_client = dataproc_v1.JobControllerClient(transport)
        # Create Hadoop Job
        hadoop_job = dataproc_v1.types.HadoopJob(jar_file_uris=[
            'gs://dataproc-staging-us-west1-628394627960-6e5uyn8v/JAR/invertedindex.jar'
        ],
                                                 main_class='InvertedIndex',
                                                 args=job_args)
        # Define Remote cluster to send Job
        job_placement = dataproc_v1.types.JobPlacement()
        job_placement.cluster_name = 'cluster-f010'
        # Define Job configuration
        main_job = dataproc_v1.types.Job(hadoop_job=hadoop_job,
                                         placement=job_placement)
        # Send job
        result = job_client.submit_job(project_id, region, main_job)
        job_id = result.reference.job_id
        """Wait for job to complete or error out."""
        while True:
            job = job_client.get_job(project_id, region, job_id)
            if job.status.State.Name(job.status.state) == 'DONE':
                return render_template("loaded.html")
    return render_template("loaded.html")
Example #26
def check_job_state(project_id, region, job_id):
    '''
    Check Job state
    :param project_id: The name of the project to use for creating resources.
    :param region: The name of the region
    :param job_id: The name of the job
    :return: job_state: A string with job state
    '''

    # Create the job client.
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    job_instance = job_client.get_job(
        request={"project_id": project_id, "region": region, "job_id": job_id}
    )
    job_state = str(job_instance.status.state).lower()
    return job_state
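check_job_state can be wrapped in a simple polling loop; a sketch, assuming str() of the status enum renders names such as 'State.DONE' (so the lower-cased value ends in 'done', 'error' or 'cancelled') and using an arbitrary poll interval:

import time


def wait_for_job_completion(project_id, region, job_id, poll_seconds=30):
    # Poll check_job_state until the job reaches a terminal state.
    while True:
        state = check_job_state(project_id, region, job_id)
        if state.endswith(('done', 'error', 'cancelled')):
            return state
        time.sleep(poll_seconds)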
    def test_delete_job(self):
        channel = ChannelStub()
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = dataproc_v1.JobControllerClient()

        # Setup Request
        project_id = 'projectId-1969970175'
        region = 'region-934795532'
        job_id = 'jobId-1154752291'

        client.delete_job(project_id, region, job_id)

        assert len(channel.requests) == 1
        expected_request = jobs_pb2.DeleteJobRequest(project_id=project_id,
                                                     region=region,
                                                     job_id=job_id)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #28
def submit_job(project_id, region, cluster_name):
    # Create the job client.
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    # Create the job config. 'main_jar_file_uri' can also be a
    # Google Cloud Storage URL.
    job = {
        'placement': {
            'cluster_name': cluster_name
        },
        'spark_job': {
            'main_class': 'org.apache.spark.examples.SparkPi',
            'jar_file_uris': ['file:///usr/lib/spark/examples/jars/spark-examples.jar'],
            'args': ['1000']
        }
    }

    operation = job_client.submit_job_as_operation(
        request={"project_id": project_id, "region": region, "job": job}
    )
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)

    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_string()
    )

    print(f"Job finished successfully: {output}")
Example #29
        if job.status.State.Name(job.status.state) == 'ERROR':
            raise Exception(job.status.details)
        elif job.status.State.Name(job.status.state) == 'DONE':
            print('Job finished.')
            return job


# TODO
project = 'enter project id'
region = 'enter region of cluster'
cluster_name = 'enter cluster name'
bucket_name = 'enter bucket name'

job_transport = (job_controller_grpc_transport.JobControllerGrpcTransport(
    address='{}-dataproc.googleapis.com:443'.format(region)))
dataproc_job_client = dataproc_v1.JobControllerClient(job_transport)

## GUI Code

import PySimpleGUI as sg

sg.change_look_and_feel('Light Blue 2')
layout = [[sg.Text('Select Offline to train and Online to Predict')],
          [
              sg.Text('Mode', size=(15, 1)),
              sg.Drop(values=('Offline', 'Online'), auto_size_text=True)
          ], [sg.Text('Enter data path')],
          [sg.Text('File Path:', size=(8, 1)),
           sg.Input(),
           sg.FileBrowse()], [sg.Text('Enter data table name')],
          [sg.Text('TableName:', size=(8, 1)),
def test_spark_streaming_from_pubsublite(subscription: Subscription) -> None:
    from google.cloud.dataproc_v1.types import LoggingConfig

    # Create a Dataproc job client.
    job_client = dataproc_v1.JobControllerClient(
        client_options={
            "api_endpoint": "{}-dataproc.googleapis.com:443".format(CLOUD_REGION)
        }
    )

    # Create the job config.
    job = {
        "placement": {"cluster_name": CLUSTER_ID},
        "pyspark_job": {
            "main_python_file_uri": pyfile(
                "spark_streaming_from_pubsublite_example.py"
            ),
            "jar_file_uris": [
                "gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar"
            ],
            "properties": {"spark.master": "yarn"},
            "logging_config": {"driver_log_levels": {"root": LoggingConfig.Level.INFO}},
            "args": [
                f"--project_number={PROJECT_NUMBER}",
                f"--location={CLOUD_REGION}-{ZONE_ID}",
                f"--subscription_id={SUBSCRIPTION_ID}",
            ],
        },
    }

    operation = job_client.submit_job_as_operation(
        request={
            "project_id": PROJECT_ID,
            "region": CLOUD_REGION,
            "job": job,
            "request_id": "read-" + UUID,
        }
    )
    response = operation.result()

    # Dataproc job output gets saved to the Google Cloud Storage bucket
    # allocated to the job. Use a regex to obtain the bucket and blob info.
    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)

    output = (
        storage.Client()
        .get_bucket(matches.group(1))
        .blob(f"{matches.group(2)}.000000000")
        .download_as_text()
    )

    assert "Batch: 0\n" in output
    assert (
        "+--------------------+---------+------+----+------+"
        + "--------------------+--------------------+----------+\n"
        + "|        subscription|partition|offset| key|  data"
        + "|   publish_timestamp|     event_timestamp|attributes|\n"
        + "+--------------------+---------+------+----+------+"
        + "--------------------+--------------------+----------+\n"
        + "|projects/10126164...|        0|     0|[34]|353534"
        + "|2021-09-15 21:55:...|2021-09-15 00:04:...|        []|\n"
        in output
    )