Ejemplo n.º 1
0
 def dataproc_submit(self, job_params: SparkJobParameters) -> Operation:
     """Stage the job's main file and submit it to Dataproc.

     The job runs on ``self.cluster_name`` and is submitted as a
     long-running operation via the Dataproc job client.
     """
     job_id = str(uuid.uuid4())
     staged_main_uri = self._stage_files(job_params.get_main_file_path(),
                                         job_id)

     job_config: Dict[str, Any] = {
         "reference": {"job_id": job_id},
         "placement": {"cluster_name": self.cluster_name},
     }

     if job_params.get_class_name():
         # A class name means a JVM job: the staged file is a jar.
         job_config["spark_job"] = {
             "jar_file_uris": [staged_main_uri],
             "main_class": job_params.get_class_name(),
             "args": job_params.get_arguments(),
         }
     else:
         # Otherwise the staged file is treated as a PySpark script.
         job_config["pyspark_job"] = {
             "main_python_file_uri": staged_main_uri,
             "args": job_params.get_arguments(),
         }

     return self.job_client.submit_job_as_operation(
         request={
             "project_id": self.project_id,
             "region": self.region,
             "job": job_config,
         })
Ejemplo n.º 2
0
    def spark_submit(self,
                     job_params: SparkJobParameters,
                     ui_port: int = None) -> subprocess.Popen:
        """Launch the job locally via ``spark-submit``.

        Args:
            job_params: Job description (name, main file, class, args).
            ui_port: Optional port for the Spark UI; passed as
                ``spark.ui.port`` when set.

        Returns:
            The started ``spark-submit`` child process.
        """
        submission_cmd = [
            self.spark_submit_script_path,
            "--master",
            self.master_url,
            "--name",
            job_params.get_name(),
        ]

        if job_params.get_class_name():
            submission_cmd.extend(["--class", job_params.get_class_name()])

        if ui_port:
            submission_cmd.extend(["--conf", f"spark.ui.port={ui_port}"])

        # Workaround for https://github.com/apache/spark/pull/26552
        # Fix running spark job with bigquery connector (w/ shadowing) on JDK 9+
        submission_cmd.extend([
            "--conf",
            "spark.executor.extraJavaOptions="
            "-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
            "--conf",
            "spark.driver.extraJavaOptions="
            "-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
            "--conf",
            "spark.sql.session.timeZone=UTC",  # ignore local timezone
            "--packages",
            f"com.google.cloud.spark:spark-bigquery-with-dependencies_{self.BQ_CONNECTOR_VERSION}",
            "--jars",
            "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar,"
            "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar,"
            "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar",
            "--conf",
            "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem",
            "--conf",
            "spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
        ])

        if self.additional_options is not None:
            for option, value in self.additional_options.items():
                # Popen is given an argv list, so no shell parsing happens:
                # wrapping the option/value in literal double quotes (the old
                # f'"{option}"="{value}"') sent the quotes verbatim to
                # spark-submit and corrupted the conf key and value.
                submission_cmd.extend(["--conf", f"{option}={value}"])

        submission_cmd.append(job_params.get_main_file_path())
        submission_cmd.extend(job_params.get_arguments())

        return subprocess.Popen(submission_cmd)
Ejemplo n.º 3
0
    def spark_submit(self, job_params: SparkJobParameters) -> subprocess.Popen:
        """Run ``spark-submit`` for the given job and return the child process."""
        cmd = [
            self.spark_submit_script_path,
            "--master",
            self.master_url,
            "--name",
            job_params.get_name(),
        ]

        class_name = job_params.get_class_name()
        if class_name:
            # JVM jobs need an explicit entry class.
            cmd += ["--class", class_name]

        cmd.append(job_params.get_main_file_path())
        cmd += job_params.get_arguments()

        return subprocess.Popen(cmd)
Ejemplo n.º 4
0
    def dataproc_submit(
        self, job_params: SparkJobParameters
    ) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
        """Stage the job's main file and submit it to Dataproc.

        Returns a triple of the submitted ``Job``, a zero-arg callable that
        re-fetches the job's current state, and a zero-arg callable that
        cancels it.
        """
        job_id = str(uuid.uuid4())
        staged_main_uri = self._stage_file(job_params.get_main_file_path(),
                                           job_id)

        labels = {
            self.JOB_TYPE_LABEL_KEY: job_params.get_job_type().name.lower()
        }
        # Only stream ingestion jobs carry a job-hash label.
        if isinstance(job_params, StreamIngestionJobParameters):
            labels[self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()

        job_config: Dict[str, Any] = {
            "reference": {"job_id": job_id},
            "placement": {"cluster_name": self.cluster_name},
            "labels": labels,
        }

        if job_params.get_class_name():
            # JVM job: staged file is a jar, run alongside the external jars.
            job_config["spark_job"] = {
                "jar_file_uris": [staged_main_uri] + self.EXTERNAL_JARS,
                "main_class": job_params.get_class_name(),
                "args": job_params.get_arguments(),
                "properties": {
                    "spark.yarn.user.classpath.first": "true"
                },
            }
        else:
            # PySpark job: staged file is the driver script.
            job_config["pyspark_job"] = {
                "main_python_file_uri": staged_main_uri,
                "jar_file_uris": self.EXTERNAL_JARS,
                "args": job_params.get_arguments(),
            }

        job = self.job_client.submit_job(
            request={
                "project_id": self.project_id,
                "region": self.region,
                "job": job_config,
            })

        refresh_fn = partial(
            self.job_client.get_job,
            project_id=self.project_id,
            region=self.region,
            job_id=job.reference.job_id,
        )
        cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)

        return job, refresh_fn, cancel_fn
Ejemplo n.º 5
0
 def dataproc_submit(self, job_params: SparkJobParameters) -> Operation:
     """Stage the PySpark driver script and submit it to Dataproc.

     Only PySpark jobs are supported by this variant; the submission is
     returned as a long-running operation.
     """
     job_id = str(uuid.uuid4())
     staged_script_uri = self._stage_files(job_params.get_main_file_path(),
                                           job_id)

     request = {
         "project_id": self.project_id,
         "region": self.region,
         "job": {
             "reference": {"job_id": job_id},
             "placement": {"cluster_name": self.cluster_name},
             "pyspark_job": {
                 "main_python_file_uri": staged_script_uri,
                 "args": job_params.get_arguments(),
             },
         },
     }
     return self.job_client.submit_job_as_operation(request=request)
Ejemplo n.º 6
0
    def spark_submit(self,
                     job_params: SparkJobParameters,
                     ui_port: int = None) -> subprocess.Popen:
        """Launch the job locally via ``spark-submit``.

        Args:
            job_params: Job description (name, main file, class, args,
                extra spark-submit options).
            ui_port: Optional port for the Spark UI; passed as
                ``spark.ui.port`` when set.

        Returns:
            The started ``spark-submit`` child process.
        """
        submission_cmd = [
            self.spark_submit_script_path,
            "--master",
            self.master_url,
            "--name",
            job_params.get_name(),
        ]

        if job_params.get_class_name():
            submission_cmd.extend(["--class", job_params.get_class_name()])

        if ui_port:
            submission_cmd.extend(["--conf", f"spark.ui.port={ui_port}"])

        if job_params.get_extra_options():
            # split() (no separator) collapses runs of whitespace; the old
            # split(" ") produced empty-string argv entries whenever options
            # were separated by more than one space, which spark-submit
            # rejects.
            submission_cmd.extend(job_params.get_extra_options().split())

        submission_cmd.append(job_params.get_main_file_path())
        submission_cmd.extend(job_params.get_arguments())

        return subprocess.Popen(submission_cmd)
Ejemplo n.º 7
0
    def dataproc_submit(
        self, job_params: SparkJobParameters, extra_properties: Dict[str, str]
    ) -> Tuple[Job, Callable[[], Job], Callable[[], None]]:
        """Stage the job's main file and submit it to Dataproc.

        Args:
            job_params: Job description (type, main file, class, args,
                extra Maven packages).
            extra_properties: Caller-supplied Spark properties; these take
                precedence over all defaults on key collisions.

        Returns a triple of the submitted ``Job``, a zero-arg callable that
        re-fetches the job's current state, and a zero-arg callable that
        cancels it.
        """
        job_id = str(uuid.uuid4())
        staged_main_uri = self._stage_file(job_params.get_main_file_path(),
                                           job_id)

        labels = {
            self.JOB_TYPE_LABEL_KEY: job_params.get_job_type().name.lower()
        }
        if isinstance(job_params, StreamIngestionJobParameters):
            labels[self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
                job_params.get_feature_table_name())
            # Only stream ingestion jobs carry a job-hash label.
            labels[self.JOB_HASH_LABEL_KEY] = job_params.get_job_hash()
        if isinstance(job_params, BatchIngestionJobParameters):
            labels[self.FEATURE_TABLE_LABEL_KEY] = _truncate_label(
                job_params.get_feature_table_name())

        job_config: Dict[str, Any] = {
            "reference": {"job_id": job_id},
            "placement": {"cluster_name": self.cluster_name},
            "labels": labels,
        }

        sizing_properties = {
            "spark.executor.instances": self.executor_instances,
            "spark.executor.cores": self.executor_cores,
            "spark.executor.memory": self.executor_memory,
        }
        package_properties = {
            "spark.jars.packages": ",".join(job_params.get_extra_packages())
        }

        if job_params.get_class_name():
            # JVM job: staged file is a jar, run alongside the external jars.
            jvm_properties = {
                "spark.yarn.user.classpath.first": "true",
                "spark.executor.instances": self.executor_instances,
                "spark.executor.cores": self.executor_cores,
                "spark.executor.memory": self.executor_memory,
                "spark.pyspark.driver.python": "python3.7",
                "spark.pyspark.python": "python3.7",
            }
            job_config["spark_job"] = {
                "jar_file_uris": [staged_main_uri] + self.EXTERNAL_JARS,
                "main_class": job_params.get_class_name(),
                "args": job_params.get_arguments(),
                # Later dicts win on collisions: extra_properties last.
                "properties": {
                    **jvm_properties,
                    **sizing_properties,
                    **package_properties,
                    **extra_properties,
                },
            }
        else:
            # PySpark job: staged file is the driver script.
            job_config["pyspark_job"] = {
                "main_python_file_uri": staged_main_uri,
                "jar_file_uris": self.EXTERNAL_JARS,
                "args": job_params.get_arguments(),
                "properties": {
                    **sizing_properties,
                    **package_properties,
                    **extra_properties,
                },
            }

        job = self.job_client.submit_job(
            request={
                "project_id": self.project_id,
                "region": self.region,
                "job": job_config,
            })

        refresh_fn = partial(
            self.job_client.get_job,
            project_id=self.project_id,
            region=self.region,
            job_id=job.reference.job_id,
        )
        cancel_fn = partial(self.dataproc_cancel, job.reference.job_id)

        return job, refresh_fn, cancel_fn