def dataproc_submit(
    self, job_params: SparkJobParameters
) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
    local_job_id = str(uuid.uuid4())
    # Stage the main application file (jar or Python script) for this job.
    main_file_uri = self._stage_file(job_params.get_main_file_path(), local_job_id)
    job_config: Dict[str, Any] = {
        "reference": {"job_id": local_job_id},
        "placement": {"cluster_name": self.cluster_name},
        "labels": {self.JOB_TYPE_LABEL_KEY: job_params.get_job_type().name.lower()},
    }

    # Add job hash to labels only for the stream ingestion job
    if isinstance(job_params, StreamIngestionJobParameters):
        job_config["labels"][self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

    if job_params.get_class_name():
        # Scala/Java job: submit the staged jar as a spark_job.
        job_config.update(
            {
                "spark_job": {
                    "jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
                    "main_class": job_params.get_class_name(),
                    "args": job_params.get_arguments(),
                    "properties": {"spark.yarn.user.classpath.first": "true"},
                }
            }
        )
    else:
        # Python job: submit the staged script as a pyspark_job.
        job_config.update(
            {
                "pyspark_job": {
                    "main_python_file_uri": main_file_uri,
                    "jar_file_uris": self.EXTERNAL_JARS,
                    "args": job_params.get_arguments(),
                }
            }
        )

    # Submit the job, then expose callables for refreshing its status and cancelling it.
    job = self.job_client.submit_job(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job": job_config,
        }
    )
    refresh_fn = partial(
        self.job_client.get_job,
        project_id=self.project_id,
        region=self.region,
        job_id=job.reference.job_id,
    )
    cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)
    return job, refresh_fn, cancel_fn
def dataproc_submit(
    self, job_params: SparkJobParameters, extra_properties: Dict[str, str]
) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
    local_job_id = str(uuid.uuid4())
    # Stage the main application file (jar or Python script) for this job.
    main_file_uri = self._stage_file(job_params.get_main_file_path(), local_job_id)
    job_config: Dict[str, Any] = {
        "reference": {"job_id": local_job_id},
        "placement": {"cluster_name": self.cluster_name},
        "labels": {self.JOB_TYPE_LABEL_KEY: job_params.get_job_type().name.lower()},
    }

    # Spark properties shared by Scala/Java and Python jobs.
    maven_package_properties = {
        "spark.jars.packages": ",".join(job_params.get_extra_packages())
    }
    common_properties = {
        "spark.executor.instances": self.executor_instances,
        "spark.executor.cores": self.executor_cores,
        "spark.executor.memory": self.executor_memory,
    }

    if isinstance(job_params, StreamIngestionJobParameters):
        job_config["labels"][self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
            job_params.get_feature_table_name()
        )
        # Add job hash to labels only for the stream ingestion job
        job_config["labels"][self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

    if isinstance(job_params, BatchIngestionJobParameters):
        job_config["labels"][self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
            job_params.get_feature_table_name()
        )

    if job_params.get_class_name():
        # Scala/Java job: submit the staged jar as a spark_job.
        scala_job_properties = {
            "spark.yarn.user.classpath.first": "true",
            "spark.executor.instances": self.executor_instances,
            "spark.executor.cores": self.executor_cores,
            "spark.executor.memory": self.executor_memory,
            "spark.pyspark.driver.python": "python3.7",
            "spark.pyspark.python": "python3.7",
        }
        job_config.update(
            {
                "spark_job": {
                    "jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
                    "main_class": job_params.get_class_name(),
                    "args": job_params.get_arguments(),
                    "properties": {
                        **scala_job_properties,
                        **common_properties,
                        **maven_package_properties,
                        **extra_properties,
                    },
                }
            }
        )
    else:
        # Python job: submit the staged script as a pyspark_job.
        job_config.update(
            {
                "pyspark_job": {
                    "main_python_file_uri": main_file_uri,
                    "jar_file_uris": self.EXTERNAL_JARS,
                    "args": job_params.get_arguments(),
                    "properties": {
                        **common_properties,
                        **maven_package_properties,
                        **extra_properties,
                    },
                }
            }
        )

    # Submit the job, then expose callables for refreshing its status and cancelling it.
    job = self.job_client.submit_job(
        request={
            "project_id": self.project_id,
            "region": self.region,
            "job": job_config,
        }
    )
    refresh_fn = partial(
        self.job_client.get_job,
        project_id=self.project_id,
        region=self.region,
        job_id=job.reference.job_id,
    )
    cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)
    return job, refresh_fn, cancel_fn
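# --- Usage sketch (illustrative only, not part of the launcher above) ---
# Shows how a caller might drive the (job, refresh_fn, cancel_fn) triple returned
# by dataproc_submit: poll the job via refresh_fn until it reaches a terminal
# state, and cancel it via cancel_fn on timeout. The `launcher`, `job_params`,
# and `extra_properties` arguments are assumed to exist in the caller's context;
# the terminal state names come from the Dataproc JobStatus.State enum.
import time


def wait_for_dataproc_job(launcher, job_params, extra_properties, timeout_sec=600):
    job, refresh_fn, cancel_fn = launcher.dataproc_submit(job_params, extra_properties)
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        job = refresh_fn()  # re-fetches the Job via job_client.get_job
        if job.status.state.name in ("DONE", "ERROR", "CANCELLED"):
            return job
        time.sleep(10)
    # Timed out: ask Dataproc to cancel the job, then return its latest state.
    cancel_fn()
    return refresh_fn()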