Example #1
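Both snippets below are methods of a Dataproc launcher class; the excerpt omits the surrounding imports. A minimal set of assumed imports follows. The standard-library and typing names are certain, while the module paths for Job and the job-parameter classes are guesses that may differ by client-library and Feast version.

import uuid
from functools import partial
from typing import Any, Callable, Dict, Tuple

# Assumed import paths (may vary by version):
from google.cloud.dataproc_v1 import Job
from feast.pyspark.abc import (
    BatchIngestionJobParameters,
    SparkJobParameters,
    StreamIngestionJobParameters,
)
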
    def dataproc_submit(
        self, job_params: SparkJobParameters
    ) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
        local_job_id = str(uuid.uuid4())
        main_file_uri = self._stage_file(job_params.get_main_file_path(),
                                         local_job_id)
        job_config: Dict[str, Any] = {
            "reference": {
                "job_id": local_job_id
            },
            "placement": {
                "cluster_name": self.cluster_name
            },
            "labels": {
                self.JOB_TYPE_LABEL_KEY:
                job_params.get_job_type().name.lower()
            },
        }

        # Add job hash to labels only for the stream ingestion job
        if isinstance(job_params, StreamIngestionJobParameters):
            job_config["labels"][
                self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

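        # A main class implies a JVM (Scala/Java) Spark job; otherwise the staged
        # file is submitted as a PySpark job.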
        if job_params.get_class_name():
            job_config.update({
                "spark_job": {
                    "jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
                    "main_class": job_params.get_class_name(),
                    "args": job_params.get_arguments(),
                    "properties": {
                        "spark.yarn.user.classpath.first": "true"
                    },
                }
            })
        else:
            job_config.update({
                "pyspark_job": {
                    "main_python_file_uri": main_file_uri,
                    "jar_file_uris": self.EXTERNAL_JARS,
                    "args": job_params.get_arguments(),
                }
            })

        job = self.job_client.submit_job(
            request={
                "project_id": self.project_id,
                "region": self.region,
                "job": job_config,
            })

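        # Callables handed back to the caller: one re-fetches the job (and with it
        # the current status), the other cancels it.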
        refresh_fn = partial(
            self.job_client.get_job,
            project_id=self.project_id,
            region=self.region,
            job_id=job.reference.job_id,
        )
        cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)

        return job, refresh_fn, cancel_fn
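
The returned tuple supports a simple poll-or-cancel pattern. The following is a minimal illustrative sketch, assuming a launcher instance that exposes dataproc_submit and a hypothetical params object; the terminal state names are from the Dataproc JobStatus.State enum.

import time

# Hypothetical call site; launcher is an instance of the class defining dataproc_submit.
job, refresh_fn, cancel_fn = launcher.dataproc_submit(params)

while job.status.state.name not in ("DONE", "ERROR", "CANCELLED"):
    time.sleep(30)
    job = refresh_fn()  # re-fetches the Job via job_client.get_job
    # cancel_fn() could be called instead to abort the job, e.g. on a timeout.

print(f"{job.reference.job_id} finished as {job.status.state.name}")
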
Example #2
    def dataproc_submit(
        self, job_params: SparkJobParameters, extra_properties: Dict[str, str]
    ) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
        local_job_id = str(uuid.uuid4())
        main_file_uri = self._stage_file(job_params.get_main_file_path(),
                                         local_job_id)
        job_config: Dict[str, Any] = {
            "reference": {
                "job_id": local_job_id
            },
            "placement": {
                "cluster_name": self.cluster_name
            },
            "labels": {
                self.JOB_TYPE_LABEL_KEY:
                job_params.get_job_type().name.lower()
            },
        }

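        # Properties merged into both job types below: extra Maven packages pinned
        # via spark.jars.packages, plus executor sizing from the launcher config.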
        maven_package_properties = {
            "spark.jars.packages": ",".join(job_params.get_extra_packages())
        }
        common_properties = {
            "spark.executor.instances": self.executor_instances,
            "spark.executor.cores": self.executor_cores,
            "spark.executor.memory": self.executor_memory,
        }

        if isinstance(job_params, StreamIngestionJobParameters):
            job_config["labels"][
                self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
                    job_params.get_feature_table_name())
            # Add job hash to labels only for the stream ingestion job
            job_config["labels"][
                self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

        if isinstance(job_params, BatchIngestionJobParameters):
            job_config["labels"][
                self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
                    job_params.get_feature_table_name())

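        # As in the first example, a main class selects a JVM Spark job; a plain
        # Python file is submitted as a PySpark job.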
        if job_params.get_class_name():
            scala_job_properties = {
                "spark.yarn.user.classpath.first": "true",
                "spark.executor.instances": self.executor_instances,
                "spark.executor.cores": self.executor_cores,
                "spark.executor.memory": self.executor_memory,
                "spark.pyspark.driver.python": "python3.7",
                "spark.pyspark.python": "python3.7",
            }

            job_config.update({
                "spark_job": {
                    "jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
                    "main_class": job_params.get_class_name(),
                    "args": job_params.get_arguments(),
                    "properties": {
                        **scala_job_properties,
                        **common_properties,
                        **maven_package_properties,
                        **extra_properties,
                    },
                }
            })
        else:
            job_config.update({
                "pyspark_job": {
                    "main_python_file_uri": main_file_uri,
                    "jar_file_uris": self.EXTERNAL_JARS,
                    "args": job_params.get_arguments(),
                    "properties": {
                        **common_properties,
                        **maven_package_properties,
                        **extra_properties,
                    },
                }
            })

        job = self.job_client.submit_job(
            request={
                "project_id": self.project_id,
                "region": self.region,
                "job": job_config,
            })

        refresh_fn = partial(
            self.job_client.get_job,
            project_id=self.project_id,
            region=self.region,
            job_id=job.reference.job_id,
        )
        cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)

        return job, refresh_fn, cancel_fn
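
In this variant the caller-supplied extra_properties are unpacked last, so they override the common and Maven-package defaults. A short sketch of a call site, again assuming a hypothetical launcher instance and params object:

job, refresh_fn, cancel_fn = launcher.dataproc_submit(
    params,
    extra_properties={
        # Caller overrides win because they are merged last into "properties".
        "spark.sql.shuffle.partitions": "200",
        "spark.dynamicAllocation.enabled": "false",
    },
)
print(job.reference.job_id)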