    def execute(self, context):
        hook = DataProcHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to
        )
        # Create a PySpark job template bound to the target cluster
        job = hook.create_job_template(
            self.task_id, self.cluster_name, "pysparkJob", self.dataproc_properties)

        # If the main application file is local, upload it to the cluster's config bucket first
        if os.path.isfile(self.main):
            cluster_info = hook.get_cluster(
                project_id=hook.project_id,
                region=self.region,
                cluster_name=self.cluster_name
            )
            bucket = cluster_info['config']['configBucket']
            self.main = self._upload_file_temp(bucket, self.main)
        job.set_python_main(self.main)

        # Attach arguments and any additional jars, archives, files and Python files
        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)
        job.add_file_uris(self.files)
        job.add_python_file_uris(self.pyfiles)
        job.set_job_name(self.job_name)

        # Submit the assembled job to Dataproc
        hook.submit(hook.project_id, job.build(), self.region)
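
    # A minimal sketch of the `_upload_file_temp` helper referenced above, added for
    # illustration; this is an assumption, not the operator's actual implementation.
    # It stages the local main file into the given bucket via GoogleCloudStorageHook
    # and returns the gs:// URI that set_python_main() expects.
    def _upload_file_temp(self, bucket, local_file):
        from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

        # Hypothetical object layout under a temp prefix (naming scheme assumed)
        temp_object = "tmp/dataproc/{}".format(os.path.basename(local_file))
        GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id).upload(
            bucket=bucket,
            object=temp_object,
            filename=local_file,
        )
        return "gs://{}/{}".format(bucket, temp_object)
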
class DataProcCtrl(SparkCtrl):
    def __init__(self, task_run):
        super(DataProcCtrl, self).__init__(task_run=task_run)

        self.dataproc = self.task.dataproc

        gcp_conn_id = self.task_env.conn_id
        self.cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        self.cluster_info = self.cluster_hook.get_cluster(
            project_id=self.cluster_hook.project_id,
            region=self.dataproc.region,
            cluster_name=self.dataproc.cluster,
        )
        self.storage = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=gcp_conn_id)

        # Use the cluster's config bucket as the remote root for dbnd file syncing
        cluster_temp = self.cluster_info.get("config", {}).get("configBucket")
        if cluster_temp:
            self.remote_sync_root = target("gs://%s/dbnd/sync" % cluster_temp)

    def _get_job_builder(self, job_type):
        job_builder = self.cluster_hook.create_job_template(
            self.task.task_id,
            self.dataproc.cluster,
            job_type=job_type,
            properties=self.config.conf,
        )
        # set_job_name is expected to make the job name unique per submission
        job_builder.set_job_name(self.job.job_name)
        job_builder.add_args(list_of_strings(self.task.application_args()))
        job_builder.add_file_uris(self.deploy.sync_files(self.config.files))
        return job_builder

    def _run_job_builder(self, job_builder):
        self.cluster_hook.submit(self.cluster_hook.project_id,
                                 job_builder.build(), self.dataproc.region)

    def run_spark(self, main_class):
        job_builder = self._get_job_builder(job_type="sparkJob")
        jars = list(self.config.jars)
        # We expect SparkTask to behave like the spark_submit API: main_jar is the jar to run,
        # and main_class is only needed when that jar has no default main class.
        # Dataproc, however, expects main_jar to contain a default main, so when both main_jar
        # and main_class are set we move main_jar into the jars list instead.
        if self.task.main_class:
            jars.append(self.config.main_jar)
            job_builder.set_main(None, self.task.main_class)
        else:
            job_builder.set_main(self.deploy.sync(self.config.main_jar), None)

        job_builder.add_jar_file_uris(self.deploy.sync_files(jars))

        return self._run_job_builder(job_builder)

    def run_pyspark(self, pyspark_script):
        job_builder = self._get_job_builder(job_type="pysparkJob")
        jars = list(self.config.jars)

        if self.config.main_jar:
            jars.append(self.config.main_jar)

        job_builder.add_jar_file_uris(self.deploy.sync_files(jars))
        job_builder.set_python_main(self.deploy.sync(pyspark_script))

        return self._run_job_builder(job_builder)

    @classmethod
    def create_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        from dbnd._core.current import get_settings

        cloud = get_settings().get_env_config(CloudType.gcp)

        gcp_conn_id = cloud.conn_id

        dataproc_config = DataprocConfig()
        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)

        return dataproc_operator.DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            project_id=cluster_hook.project_id,
            cluster_name=dataproc_config.cluster,
            gcp_conn_id=gcp_conn_id,
            num_workers=dataproc_config.num_workers,
            zone=dataproc_config.zone,
            network_uri=dataproc_config.network_uri,
            subnetwork_uri=dataproc_config.subnetwork_uri,
            tags=dataproc_config.tags,
            storage_bucket=dataproc_config.storage_bucket,
            init_actions_uris=dataproc_config.init_actions_uris,
            init_action_timeout=dataproc_config.init_action_timeout,
            metadata=dataproc_config.metadata,
            image_version=dataproc_config.image_version,
            properties=dataproc_config.properties,
            master_machine_type=dataproc_config.master_machine_type,
            master_disk_size=dataproc_config.master_disk_size,
            worker_machine_type=dataproc_config.worker_machine_type,
            worker_disk_size=dataproc_config.worker_disk_size,
            num_preemptible_workers=dataproc_config.num_preemptible_workers,
            labels=dataproc_config.labels,
            delegate_to=dataproc_config.delegate_to,
            service_account=dataproc_config.service_account,
            service_account_scopes=dataproc_config.service_account_scopes,
            idle_delete_ttl=dataproc_config.idle_delete_ttl,
            auto_delete_time=dataproc_config.auto_delete_time,
            auto_delete_ttl=dataproc_config.auto_delete_ttl,
        )

    @classmethod
    def terminate_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        from dbnd._core.current import get_settings

        dataproc_config = DataprocConfig()

        gcp_conn_id = get_settings().get_env_config(CloudType.gcp).conn_id

        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        delete_cluster = dataproc_operator.DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=dataproc_config.cluster,
            project_id=cluster_hook.project_id,
            gcp_conn_id=gcp_conn_id,
            region=dataproc_config.region,
        )

        return delete_cluster

    @classmethod
    def get_engine_policy(cls):
        return DataprocConfig().policy
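

# A hypothetical usage sketch, not part of the original source: it assumes a dbnd context is
# already initialized and simply brackets the Dataproc work with the cluster lifecycle
# operators returned by the classmethods above, inside a plain Airflow DAG.
if __name__ == "__main__":
    from datetime import datetime

    from airflow import DAG

    with DAG(dag_id="dataproc_engine_example", start_date=datetime(2020, 1, 1), schedule_interval=None):
        create_cluster = DataProcCtrl.create_engine()  # returns a DataprocClusterCreateOperator
        delete_cluster = DataProcCtrl.terminate_engine()  # returns a DataprocClusterDeleteOperator
        create_cluster >> delete_cluster  # tear the cluster down after it is created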