Example #1
    def __init__(
        self,
        cluster_name: str,
        staging_location: str,
        region: str,
        project_id: str,
    ):
        """
        Initialize a dataproc job controller client, used internally for job submission and result
        retrieval.

        Args:
            cluster_name (str):
                Dataproc cluster name.
            staging_location (str):
                GCS directory for the storage of files generated by the launcher, such as the pyspark scripts.
            region (str):
                Dataproc cluster region.
            project_id (str):
                GCP project ID for the Dataproc cluster.
        """

        self.cluster_name = cluster_name

        scheme, self.staging_bucket, self.remote_path, _, _, _ = urlparse(
            staging_location)
        if scheme != "gs":
            raise ValueError(
                "Only GCS staging location is supported for DataprocLauncher.")
        self.project_id = project_id
        self.region = region
        self.job_client = JobControllerClient(
            client_options={
                "api_endpoint": f"{region}-dataproc.googleapis.com:443"
            })
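A minimal instantiation sketch for this constructor, assuming it belongs to a launcher class named `DataprocLauncher` (the name is implied only by the error message) and that the values below are placeholders:

from mymodule import DataprocLauncher  # hypothetical import path

launcher = DataprocLauncher(
    cluster_name="feature-cluster",                       # placeholder cluster name
    staging_location="gs://my-bucket/dataproc/staging",   # must use the gs:// scheme
    region="us-central1",
    project_id="my-gcp-project",
)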
Example #2
    def submit_pyspark_job(self, main_python_file_uri: str,
                           python_file_uris: List[str]):
        print('Submitting PySpark job...')
        job_details = {
            'placement': {
                'cluster_name': self.cluster_name
            },
            'pyspark_job': {
                'main_python_file_uri': main_python_file_uri,
                'python_file_uris': python_file_uris
            }
        }

        job_transport: JobControllerGrpcTransport = JobControllerGrpcTransport(
            address='{}-dataproc.googleapis.com:443'.format(self.region),
            credentials=self.dataproc_credentials)
        dataproc_job_client = JobControllerClient(job_transport)

        result = dataproc_job_client.submit_job(project_id=self.project_id,
                                                region=self.region,
                                                job=job_details)
        job_id = result.reference.job_id
        print(f'job {job_id} is submitted.')

        print(f'waiting for job {job_id} to finish...')
        while True:
            time.sleep(1)
            job = dataproc_job_client.get_job(self.project_id, self.region,
                                              job_id)
            if job.status.State.Name(job.status.state) == 'ERROR':
                raise Exception(job.status.details)
            elif job.status.State.Name(job.status.state) == 'DONE':
                print(f'job {job_id} is finished.')
                break
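This example targets the pre-2.0 `google-cloud-dataproc` client, where `JobControllerGrpcTransport` and positional `get_job` calls were the norm. For comparison, a sketch of the same submission against the current `dataproc_v1` API, where the hand-rolled polling loop can be replaced by a long-running operation (function name and argument values are assumptions, not part of the original code):

from typing import List

from google.cloud import dataproc_v1


def submit_pyspark_job_v2(project_id: str, region: str, cluster_name: str,
                          main_python_file_uri: str, python_file_uris: List[str]):
    # Regional endpoint, mirroring the example above.
    job_client = dataproc_v1.JobControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )
    job = {
        "placement": {"cluster_name": cluster_name},
        "pyspark_job": {
            "main_python_file_uri": main_python_file_uri,
            "python_file_uris": python_file_uris,
        },
    }
    # submit_job_as_operation returns a long-running operation; result()
    # blocks until the job reaches a terminal state and raises on failure.
    operation = job_client.submit_job_as_operation(
        request={"project_id": project_id, "region": region, "job": job}
    )
    return operation.result()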
Example #3
    def get_job_client(self, region: Optional[str] = None) -> JobControllerClient:
        """Returns JobControllerClient."""
        client_options = None
        if region and region != 'global':
            client_options = ClientOptions(api_endpoint=f'{region}-dataproc.googleapis.com:443')

        return JobControllerClient(
            credentials=self._get_credentials(), client_info=CLIENT_INFO, client_options=client_options
        )
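The same regional-endpoint logic works outside the hook; a standalone sketch without the hook's credential and client-info plumbing (application default credentials are assumed to be available):

from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud.dataproc_v1 import JobControllerClient


def make_job_client(region: Optional[str] = None) -> JobControllerClient:
    # Regional Dataproc endpoints exist only for concrete regions, so
    # 'global' (or no region at all) falls back to the default endpoint.
    client_options = None
    if region and region != 'global':
        client_options = ClientOptions(api_endpoint=f'{region}-dataproc.googleapis.com:443')
    return JobControllerClient(client_options=client_options)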
Example #4
    def __init__(
        self,
        cluster_name: str,
        staging_location: str,
        region: str,
        project_id: str,
        executor_instances: str,
        executor_cores: str,
        executor_memory: str,
        additional_options: Optional[Dict[str, str]] = None,
    ):
        """
        Initialize a dataproc job controller client, used internally for job submission and result
        retrieval.

        Args:
            cluster_name (str):
                Dataproc cluster name.
            staging_location (str):
                GCS directory for the storage of files generated by the launcher, such as the pyspark scripts.
            region (str):
                Dataproc cluster region.
            project_id (str):
                GCP project id for the dataproc cluster.
            executor_instances (str):
                Number of executor instances for the Dataproc job.
            executor_cores (str):
                Number of executor cores for the Dataproc job.
            executor_memory (str):
                Amount of executor memory for the Dataproc job.
            additional_options (Dict[str, str]):
                Additional configuration options for the Spark job.
        """

        self.cluster_name = cluster_name

        scheme, self.staging_bucket, self.remote_path, _, _, _ = urlparse(
            staging_location)
        if scheme != "gs":
            raise ValueError(
                "Only GCS staging location is supported for DataprocLauncher.")
        self.project_id = project_id
        self.region = region
        self.job_client = JobControllerClient(
            client_options={
                "api_endpoint": f"{region}-dataproc.googleapis.com:443"
            })
        self.executor_instances = executor_instances
        self.executor_cores = executor_cores
        self.executor_memory = executor_memory
        self.additional_options = additional_options or {}
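The executor settings are stored but not used inside `__init__`; one plausible way they would later be folded into a Dataproc `pyspark_job` payload is through its `properties` map of Spark settings. A hypothetical helper on the same class (not part of the original code):

    def spark_properties(self) -> Dict[str, str]:
        """Translate the launcher's executor settings into Spark properties."""
        properties = {
            "spark.executor.instances": self.executor_instances,
            "spark.executor.cores": self.executor_cores,
            "spark.executor.memory": self.executor_memory,
        }
        # Caller-supplied options take precedence over the defaults above.
        properties.update(self.additional_options)
        return properties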
Example #5
    def get_job_client(
        self, region: Optional[str] = None, location: Optional[str] = None
    ) -> JobControllerClient:
        """Returns JobControllerClient."""
        if location is not None:
            warnings.warn(
                "Parameter `location` will be deprecated. "
                "Please provide value through `region` parameter instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            region = location
        client_options = None
        if region and region != 'global':
            client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'}

        return JobControllerClient(
            credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options
        )
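This method appears to belong to an Airflow-style Dataproc hook (it relies on `self._get_credentials()` and `self.client_info`). Assuming that context, a call sketch for the deprecation shim (import path and connection id are assumptions):

from airflow.providers.google.cloud.hooks.dataproc import DataprocHook  # assumed hook class

hook = DataprocHook(gcp_conn_id="google_cloud_default")

# Preferred: pass the region explicitly.
client = hook.get_job_client(region="europe-west1")

# Legacy call sites using `location` still work, but emit a
# DeprecationWarning and the value is forwarded to `region`.
client = hook.get_job_client(location="europe-west1")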