Example 1
import logging
import subprocess

logger = logging.getLogger(__name__)

# Task, parameter, TaskType, from_task_env, etc. come from the dbnd framework.

class _BaseSparkTask(Task):
    _conf__task_type_name = TaskType.spark

    # resolved from the task's environment configuration
    spark_config = parameter(
        config_path=from_task_env("spark_config"))[SparkConfig]
    spark_engine = parameter(config_path=from_task_env("spark_engine"))[
        SparkEngineConfig]  # type: EmrConfig

    # a concrete subclass provides the script (PySpark) or the main class (JVM)
    python_script = None
    main_class = None

    def application_args(self):
        """
        Arguments for the application being submitted.

        :return: list
        """
        return []

    def _get_spark_ctrl(self):
        # type: () -> SparkCtrl
        return self.spark_engine.get_spark_ctrl(self.current_task_run)

    def _task_banner(self, banner, verbosity):
        b = banner

        b.new_section()
        try:
            spark_command_line = subprocess.list2cmdline(
                list_of_strings(self.application_args()))
            b.column("SPARK CMD LINE", spark_command_line)
        except Exception:
            logger.exception("Failed to get spark command line from %s", self)
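
A concrete task built on this base sets python_script (or main_class) and overrides application_args. A minimal sketch, assuming a hypothetical subclass; the script path, bucket names and argument flags are illustrative:

class WordCountSparkTask(_BaseSparkTask):
    # hypothetical subclass; the script path is illustrative
    python_script = "scripts/word_count.py"

    def application_args(self):
        # the returned list is rendered into the SPARK CMD LINE banner
        # column and passed to the submitted application
        return ["--input", "s3://my-bucket/in", "--output", "s3://my-bucket/out"]
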
Example 2
# Task, parameter, from_task_env and the Beam classes come from dbnd.

class _BeamTask(Task):
    beam_config = parameter(config_path=from_task_env("beam_config"))[ApacheBeamConfig]
    beam_engine = parameter(config_path=from_task_env("beam_engine"))[
        EngineConfig
    ]  # type: DataflowConfig

    def get_beam_task_options(self):
        """
        Options for the Beam pipeline being submitted.

        :return: dict
        """
        return {}

    def _get_job_ctrl(self):
        # type: (...) -> ApacheBeamJobCtrl
        return self.beam_engine.get_beam_ctrl(self.current_task_run)
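
A concrete Beam task would override get_beam_task_options with the options for its pipeline. A minimal sketch, assuming a hypothetical subclass; the option names, and how ApacheBeamJobCtrl consumes them, are illustrative:

class MyBeamJob(_BeamTask):
    # hypothetical subclass; option names/values are illustrative
    def get_beam_task_options(self):
        # presumably forwarded to the Beam runner by the job ctrl
        return {"region": "us-central1", "max_num_workers": 4}
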
Example 3
import logging

logger = logging.getLogger(__name__)

# Task, parameter, from_task_env and the Docker classes come from dbnd.

class DockerRunTask(Task):
    # _conf__task_family = "docker_cmd"

    image = parameter(
        description="Docker image from which to create the container. "
        "If the image tag is omitted, 'latest' will be used.")[str]
    command = parameter(
        description="Command to be run in the container. (templated)")[str]

    docker_engine = parameter(
        config_path=from_task_env("docker_engine"))[ContainerEngineConfig]

    docker_ctrl = None  # type: DockerRunCtrl

    def _task_submit(self):
        # when running under an Airflow operator, render Airflow template
        # fields (e.g. {{ ds }}) in the command before executing it
        if hasattr(self.ctrl, "airflow_op"):
            airflow_context = self.current_task_run.airflow_context
            self.command = self.ctrl.airflow_op.render_template(
                self.command, airflow_context)

        self.log_metric("docker command", self.command)
        self.docker_ctrl = self.docker_engine.get_docker_ctrl(
            self.current_task_run)  # type: DockerRunCtrl
        self.docker_ctrl.docker_run()

    def on_kill(self):
        if self.docker_ctrl is not None:
            logger.error("Killing submitted docker for %s", self.task_id)
            return self.docker_ctrl.on_kill()
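
Using the task is then a matter of supplying the two required parameters. A minimal sketch with illustrative values; how the task is actually launched depends on the surrounding dbnd setup:

# hypothetical instantiation; image and command values are illustrative
hello = DockerRunTask(
    image="python:3.9-slim",  # 'latest' would be assumed without a tag
    command="python -c 'print(1 + 1)'",  # may contain Airflow template fields
)
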
Example 4
import logging
import subprocess
from typing import Dict

logger = logging.getLogger(__name__)

# Task, parameter, TaskType, from_task_env, etc. come from the dbnd framework.

class _BaseSparkTask(Task):
    _conf__task_type_name = TaskType.spark

    # resolved from the task's environment configuration
    spark_config = parameter(
        config_path=from_task_env("spark_config"))[SparkConfig]
    spark_engine = parameter(config_path=from_task_env("spark_engine"))[
        SparkEngineConfig]  # type: EmrConfig

    # a concrete subclass provides the script (PySpark) or the main class (JVM)
    python_script = None
    main_class = None

    spark_conf_extension = parameter(
        default={},
        description="Extension for the SparkConfig.conf dict; "
        "every entry added here is merged into spark_config.conf.",
        significant=False,
    )[dict]

    spark_resources = parameter.c(default=None, system=True)[Dict[str, FileTarget]]

    def band(self):
        result = super(_BaseSparkTask, self).band()

        if self.spark_config.include_user_project:
            # package the user's project as a fat wheel and ship it to the
            # cluster as an additional Spark resource
            fat_wheel_task = fat_wheel_building_task(
                task_version=try_get_databand_context().current_context_uid,
                task_target_date="today",
            )
            self.spark_resources = {"user_project": fat_wheel_task}

        if self.spark_engine.disable_task_band:
            logger.debug("Task band is disabled due to disable_task_band flag")
            self.task_band = None

        return result

    def get_py_files(self):
        py_files = self.spark_config.py_files.copy()
        if self.spark_resources and "user_project" in self.spark_resources:
            project_files = self.spark_resources["user_project"].load(str)
            py_files.append(project_files)
        return py_files

    def application_args(self):
        """
        Arguments for the application being submitted.

        :return: list
        """
        return []

    def _get_spark_ctrl(self):
        # type: () -> SparkCtrl
        return self.spark_engine.get_spark_ctrl(self.current_task_run)

    def _task_banner(self, banner, verbosity):
        b = banner

        b.new_section()
        try:
            spark_command_line = subprocess.list2cmdline(
                list_of_strings(self.application_args()))
            b.column("SPARK CMD LINE", spark_command_line)
        except Exception:
            logger.exception("Failed to get spark command line from %s", self)

    def get_root(self):
        return self.spark_engine.root or super(_BaseSparkTask, self).get_root()

    def _initialize(self):
        super(_BaseSparkTask, self)._initialize()

        if self.spark_conf_extension:
            # apply spark_conf_extension as the final (highest-priority)
            # layer over SparkConfig.conf
            self.spark_config.conf.update(self.spark_conf_extension)
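
spark_conf_extension lets a caller layer extra Spark settings over SparkConfig.conf without redefining it; _initialize applies the extension as the last layer. A minimal sketch, assuming a hypothetical subclass MySparkTask; the Spark settings are illustrative:

# hypothetical invocation; MySparkTask stands in for a _BaseSparkTask subclass
task = MySparkTask(
    spark_conf_extension={
        # merged over SparkConfig.conf during _initialize()
        "spark.executor.memory": "4g",
        "spark.sql.shuffle.partitions": "64",
    }
)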