Beispiel #1
0
class Task(_BaseTask, _TaskParamContainer):
    """
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    The key methods of a Task, which must be implemented in a subclass are:

    * :py:meth:`run` - the computation done by this task.

    Each :py:class:`~dbnd.parameter` of the Task should be declared as members:

    .. code:: python

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]

    In addition to any declared properties and methods, there are a few
    non-declared properties, which are created by the :py:class:`TaskMetaclass`
    metaclass.
    """

    # NOTE(review): the text below used to be a free-standing string literal
    # placed right after the class docstring (so it documented nothing). It
    # appears to describe a class-level namespace attribute that is not
    # declared in this class body - confirm where it belongs.
    #
    #   This value can be overridden to set the namespace that will be used.
    #   (See :ref:`Task.namespaces_famlies_and_ids`)
    #   If it's not specified and you try to read this value anyway, it will
    #   return garbage. Please use :py:meth:`get_task_namespace` to read the
    #   namespace.
    #
    #   Note that setting this value with ``@property`` will not work, because
    #   this is a class level value.

    # ---- system outputs -------------------------------------------------
    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        system=True,
        output_name="meta",
        output_ext="",
        target_config=folder,
        significant=False,
        description="Location of all internal outputs (e.g. metrics)",
    )
    task_band = output.json(output_name="band")

    # ---- task level parameters ------------------------------------------
    task_enabled = parameter.system(scope=ParameterScope.children)[bool]
    task_enabled_in_prod = parameter.system(
        scope=ParameterScope.children)[bool]

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        description="task version, directly affects task signature ",
        scope=ParameterScope.children,
    )[VersionStr]

    task_class_version = parameter.value(
        default=DEFAULT_CLASS_VERSION,
        system=True,
        description="task code version, "
        "use while you want persistent change in your task version",
    )

    task_env = parameter.value(description="task environment name",
                               scope=ParameterScope.children)[EnvConfig]

    task_target_date = parameter(description="task data target date",
                                 scope=ParameterScope.children)[datetime.date]

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs")[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = parameter.system(
        scope=ParameterScope.children,
        description="Store all task outputs in memory")[bool]
    task_is_dynamic = parameter.system(
        scope=ParameterScope.children,
        description="task was executed from within another task",
    )[bool]

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True,
        description="indicates if task can run dynamic databand tasks")[bool]

    task_retries = parameter.system(
        description=
        "Total number of attempts to run the task. So task_retries=3 -> task can fail 3 times before we give up"
    )[int]

    task_retry_delay = parameter.system(
        description="timedelta to wait before retrying a task. Example: 5s")[
            datetime.timedelta]

    _dbnd_call_state = None  # type: TaskCallState

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the base classes and attach a ``TaskCtrl``."""
        super(Task, self).__init__(**kwargs)
        self.ctrl = TaskCtrl(self)

    def band(self):
        """
        Wiring hook; the default implementation does nothing and returns ``None``.

        Do not override this function, except in Pipeline/External tasks
        (NOTE(review): the original wording of this note was ambiguous - confirm).
        We do all wiring work in Meta classes only.
        Our implementation should never be coupled to code!

        :return: ``None``
        """
        return

    def run(self):
        """
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        """
        pass  # default impl

    @property
    def task_outputs(self):
        """
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        See :ref:`Task.task_outputs`
        """
        return self.ctrl.relations.task_outputs_user

    @property
    def task_dag(self):
        # type: (...)->_TaskDagNode
        """The DAG node of this task, as held by ``self.ctrl``."""
        return self.ctrl.task_dag

    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, emit a warning and return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user side task outputs
        # all system tasks outputs are not important (if the exists or not)
        # user don't see them
        outputs = flatten(self.task_outputs)
        if not outputs:
            warnings.warn(
                "Task %r without outputs has no custom complete() method" %
                self,
                stacklevel=2,
            )
            return False

        return all(o.exists() for o in outputs)

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        """The task run registered for ``self.task_id`` in the current databand run."""
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        """
        The default output that this Task produces. Use outputs! Override only if you are writing "base" class
        """
        return NOTHING

    def _requires(self):
        """
        Override in "template" tasks which themselves are supposed to be
        subclassed

        Must return an iterable which among others contains the _requires() of
        the superclass.
        See :ref:`Task.requires`
        """
        pass

    def _task_submit(self):
        """
        Task submission logic, by default we just call -> _task_run() -> run()
        """
        return self._task_run()

    def _task_run(self):
        """
        Execute :py:meth:`run` wrapped by the pre/post execution sync calls.

        ``run()`` is invoked inside ``_auto_load_save_params``; its return value
        is returned. The post-execute sync is only reached when ``run()`` does
        not raise.
        """
        # bring all relevant files
        self.current_task_run.sync_local.sync_pre_execute()
        with self._auto_load_save_params(auto_read=self._conf_auto_read_params,
                                         save_on_change=True):
            result = self.run()

        self.current_task_run.sync_local.sync_post_execute()
        # publish all relevant files
        return result

    def set_upstream(self, task_or_task_list):
        """Delegate to ``task_dag.set_upstream``; returns ``None``."""
        self.task_dag.set_upstream(task_or_task_list)

    def set_downstream(self, task_or_task_list):
        """Delegate to ``task_dag.set_downstream``; returns ``None``."""
        self.task_dag.set_downstream(task_or_task_list)

    def __lshift__(self, other):
        """
        Implement ``self << other``: register ``other`` as upstream of ``self``.

        Returns ``other`` (rather than the ``None`` produced by
        :py:meth:`set_upstream`) so that the expression can be chained:
        ``a << b << c``.
        """
        self.set_upstream(other)
        return other

    def __rshift__(self, other):
        """
        Implement ``self >> other``: register ``other`` as downstream of ``self``.

        Returns ``other`` (rather than the ``None`` produced by
        :py:meth:`set_downstream`) so that the expression can be chained:
        ``a >> b >> c``.
        """
        self.set_downstream(other)
        return other

    def set_global_upstream(self, task_or_task_list):
        """Delegate to ``task_dag.set_global_upstream``; returns ``None``."""
        self.task_dag.set_global_upstream(task_or_task_list)

    @property
    def metrics(self):
        """Tracker of the current task run (kept for backward compatibility)."""
        # backward compatible code
        return self.current_task_run.tracker

    def log_dataframe(
        self,
        key,
        df,
        with_preview=True,
        with_schema=True,
        with_size=True,
        with_stats=False,
    ):
        """
        Log a dataframe through the tracker of the current task run.

        :param key: name under which the dataframe is logged
        :param df: the dataframe object passed to the tracker
        :param with_preview: forwarded as ``ValueMetaConf.log_preview``
        :param with_schema: forwarded as ``ValueMetaConf.log_schema``
        :param with_size: forwarded as ``ValueMetaConf.log_size``
        :param with_stats: forwarded as ``ValueMetaConf.log_stats``
        """
        meta_conf = ValueMetaConf(
            log_preview=with_preview,
            log_schema=with_schema,
            log_size=with_size,
            log_stats=with_stats,
        )
        self.metrics.log_dataframe(key, df, meta_conf=meta_conf)

    def log_metric(self, key, value, source=None):
        """
        Log a metric through the tracker of the current task run.

        :param key: Metric name
        :param value: Metric value
        :param source: forwarded to the tracker as ``source``
            (:py:meth:`log_system_metric` passes ``"system"``)
        :return: whatever the tracker's ``log_metric`` returns
        """
        return self.metrics.log_metric(key, value, source=source)

    def log_system_metric(self, key, value):
        """Shortcut for log_metric(..., source="system") """
        return self.log_metric(key, value, source="system")

    def log_artifact(self, name, artifact):
        """Log a local file or directory as an artifact of the currently active run."""
        return self.metrics.log_artifact(name, artifact)

    def get_template_vars(self):
        """
        Build the dict of variables available for templating.

        Contains the task itself, its family/name/signature/id taken from
        ``task_meta``, and the serialized input parameters. When
        ``task_target_date`` is ``None`` the ``"task_target_date"`` entry is
        set to the literal string ``"input"``.
        """
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_meta.task_family,
            "task_name": self.task_meta.task_name,
            "task_signature": self.task_meta.task_signature,
            "task_id": self.task_meta.task_id,
        }
        base.update(self._params.get_params_serialized(input_only=True))
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        """
        Override this method to cleanup subprocesses when a task instance
        gets killed. Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.
        """
        pass

    def _get_task_output_path_format(self, output_mode):
        """
        Return the path format used for this task's outputs.

        For a production environment combined with
        ``OutputMode.prod_immutable`` the immutable production format from
        settings wins; otherwise the class level ``_conf__base_output_path_fmt``
        is used when set, falling back to ``settings.output.path_task``.
        """
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self._conf__base_output_path_fmt or self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        """
        Create a target located under this task's output path.

        :param name: output name; when falsy, a random
            ``tmp/dbnd-tmp-<9 digits>`` name is generated
        :param config: target config; defaults to a fresh ``TargetConfig()``
        :param output_ext: forwarded to ``calculate_path``
        :param output_mode: used to select the path format
        :return: ``target(path, config=config)``
        """
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=path_pattern,
        )

        return target(path, config=config)

    def get_root(self):
        """Return the ``root`` of the task environment."""
        return self.task_env.root

    def _initialize(self):
        """Run the base initialization, then initialize the task controller."""
        super(Task, self)._initialize()
        self.ctrl._initialize_task()

    def _should_run(self):
        """
        Tell whether the task is allowed to run.

        A disabled task never runs. In a production environment the task runs
        when either ``task_enabled_in_prod`` or ``settings.run.enable_prod`` is
        truthy (that value is returned as is). Otherwise returns ``True``.
        """
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    @dbnd_handle_errors(exit_on_error=False)
    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """
        Run task via Databand execution system
        """
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        result = ctx.dbnd_run_task(self)
        return result
Beispiel #2
0
class _DbndDriverTask(Task):
    """
    System task that builds the root task of a run and executes its task runs.

    Depending on ``is_submitter`` / ``is_driver`` it either builds a task that
    submits the run to the remote engine, or builds the actual root task, and
    then hands the resulting task runs to a task executor.
    """

    _conf__no_child_params = True
    task_is_system = True
    task_in_memory_outputs = True

    is_driver = parameter[bool]
    is_submitter = parameter[bool]
    execution_date = parameter[datetime]
    sends_heartbeat = parameter[bool]

    host_engine = parameter[EngineConfig]
    target_engine = parameter[EngineConfig]

    task_executor_type = parameter[str]

    # all paths, we make them system, we don't want to check if they are exists
    local_driver_root = output(system=True)[Target]
    local_driver_log = output(system=True)[Target]

    remote_driver_root = output(system=True)[Target]
    driver_dump = output(system=True)[Target]

    def _build_submit_task(self, run):
        """
        Build the task that submits this run to the remote engine.

        :param run: the current run; must not already carry a ``root_task``
        :raises DatabandRuntimeError: when ``run.root_task`` is already set
        :return: the task created by ``run.remote_engine.submit_to_engine_task``
        """
        if run.root_task:
            raise DatabandRuntimeError(
                "Can't send to remote execution task created via code, only command line is supported"
            )

        # don't describe in local run, do it in remote run
        settings = self.settings
        settings.system.describe = False

        cmd_line_args = (["run"] + _get_dbnd_run_relative_cmd() +
                         ["--run-driver", str(run.run_uid)])

        args = run.remote_engine.dbnd_executable + cmd_line_args

        root_task = run.remote_engine.submit_to_engine_task(
            env=run.env,
            args=args,
            task_name="dbnd_driver_run",
            interactive=settings.run.interactive,
        )
        # The two literals are concatenated by the parser, so the separating
        # space must be part of one of them.
        root_task._conf_confirm_on_kill_msg = (
            "Ctrl-C Do you want to kill your submitted pipeline? "
            "If selection is 'no', this process will detach from the run.")
        return root_task

    def _build_root_task(self, run):
        # type: (DatabandRun) -> Task
        """
        Return the root task for ``run``.

        A pure submitter (``is_submitter`` and not ``is_driver``) gets a submit
        task. Otherwise an already existing ``run.root_task`` is registered as
        a child of this task and reused, or a new task is built from
        ``run.root_task_name`` via the task registry.
        """
        if self.is_submitter and not self.is_driver:
            return self._build_submit_task(run)

        if run.root_task:
            # user has created DatabandRun with existing task
            self.task_meta.add_child(run.root_task.task_id)
            return run.root_task

        logger.info("Building main task '%s'", run.root_task_name)
        root_task = get_task_registry().build_dbnd_task(run.root_task_name)
        logger.info(
            "Task %s has been created (%s children)",
            root_task.task_id,
            len(root_task.ctrl.task_dag.subdag_tasks()),
        )
        return root_task

    def is_save_run(self, run, task_runs):
        """
        Decide whether the run should be saved (see ``run.save_run()`` in ``run``).

        The checks are evaluated in order and the first match wins:
        ``always_save_pipeline`` -> ``True``; ``disable_save_pipeline`` ->
        ``False``; any task requiring a run dump file -> ``True``; target engine
        requiring submit -> ``True``; local executor -> ``False``; with airflow
        enabled, ``False`` only for the in-process airflow executor; otherwise
        ``True``.
        """
        core_settings = run.context.settings.core
        if core_settings.always_save_pipeline:
            return True
        if core_settings.disable_save_pipeline:
            return False

        if any(tr.task._conf__require_run_dump_file for tr in task_runs):
            return True

        if self.target_engine.require_submit:
            return True

        if self.task_executor_type == TaskExecutorType.local:
            return False

        if is_airflow_enabled():
            from dbnd_airflow.executors import AirflowTaskExecutorType

            # TaskExecutorType.local was already handled above; it is kept in
            # the list so the expression reads as the full set of executors
            # that do not need a saved run.
            return self.task_executor_type not in [
                AirflowTaskExecutorType.airflow_inprocess,
                TaskExecutorType.local,
            ]
        return True

    def build_task_from_cmd_line(self, task_name):
        """Placeholder; currently ignores ``task_name`` and returns ``None``."""
        return

    def build_root_task_runs(self, run):
        """
        Build the root task and its task runs and register them on ``run``.

        called by .run and inline

        :return: the task runs built by ``TaskRunsBuilder``
        """
        run.root_task = self._build_root_task(run)

        # for validation only
        run.root_task.task_dag.topological_sort()

        task_runs = TaskRunsBuilder().build_task_runs(
            run, run.root_task, remote_engine=self.target_engine)
        # we need it before to mark root task
        run.add_task_runs(task_runs)
        # for faster access
        run.root_task_run = run.get_task_run(run.root_task.task_id)
        return task_runs

    def run(self):
        """
        Build the root task runs and execute them with a task executor.

        :return: the run when acting as driver and execution finished;
            ``None`` when only describing the DAG or when acting as a pure
            submitter.
        """
        driver_task_run = current_task_run()
        run = driver_task_run.run  # type: DatabandRun
        if self.is_submitter:
            run.set_run_state(RunState.RUNNING)

        ctx = run.context
        ctx.settings.git.validate_git_policy()

        # let prepare for remote execution
        run.remote_engine.prepare_for_run(run)

        task_runs = self.build_root_task_runs(run)

        # stays None unless this is a driver configured to send heartbeats
        heartbeat = None

        # right now we run describe in local controller only, but we should do that for more
        if self.is_driver:
            if run.context.settings.system.describe:
                run.describe_dag.describe_dag()
                logger.info(run.describe.run_banner("Described!",
                                                    color="blue"))
                return

            root_task_run = run.root_task_run
            run.root_task.ctrl.banner(
                "Main task '%s' has been created!" % root_task_run.task_af_id,
                color="cyan",
                task_run=root_task_run,
            )

            print_tasks_tree(root_task_run.task, task_runs)

            if self.is_save_run(run, task_runs):
                run.save_run()

            if self.sends_heartbeat:
                heartbeat = start_heartbeat_sender(driver_task_run)

        task_runs_to_run = [tr for tr in task_runs if not tr.is_skipped]

        # create executor without driver task!
        task_executor = get_task_executor(
            run,
            task_executor_type=self.task_executor_type,
            host_engine=self.host_engine,
            target_engine=run.root_task_run.task_engine,
            task_runs=task_runs_to_run,
        )

        with nested(heartbeat):
            task_executor.do_run()

        if self.is_driver:
            # This is great success!
            run.set_run_state(RunState.SUCCESS)
            logger.info(run.describe.run_banner_for_finished())
            return run
        else:
            logger.info(run.describe.run_banner_for_submitted())
Beispiel #3
0
class Task(_TaskWithParams, _TaskCtrlMixin, _TaskParamContainer):
    """
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    A ``run`` method must be present in a subclass

    Each ``parameter`` of the Task should be declared as members::

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]
    """

    _conf_confirm_on_kill_msg = None  # get user confirmation on task kill if not empty
    _conf__require_run_dump_file = False

    # ---- system outputs -------------------------------------------------
    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        system=True,
        output_name="meta",
        output_ext="",
        target_config=folder,
        significant=False,
        description="Location of all internal outputs (e.g. metrics)",
    )
    task_band = output.json(output_name="band", system=True)

    # ---- task level parameters ------------------------------------------
    task_enabled = system_passthrough_param(default=True)[bool]
    task_enabled_in_prod = system_passthrough_param(default=True)[bool]
    validate_no_extra_params = ParamValidation.error

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        default="1",
        description="task version, directly affects task signature ",
        scope=ParameterScope.children,
    )[VersionStr]

    task_class_version = parameter.value(
        default=DEFAULT_CLASS_VERSION,
        system=True,
        description="task code version, "
        "use while you want persistent change in your task version",
    )

    task_env = parameter.value(
        default="local",
        description="task environment name",
        scope=ParameterScope.children,
    )[EnvConfig]

    task_target_date = parameter(
        default="today",
        description="task data target date",
        scope=ParameterScope.children,
    )[datetime.date]

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs"
    )[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = system_passthrough_param(
        default=False, description="Store all task outputs in memory"
    )[bool]

    task_output_path_format = system_passthrough_param(
        default=None, description="Format string used to generate task output paths"
    )[str]

    task_is_dynamic = system_passthrough_param(
        default=False,
        scope=ParameterScope.children,
        description="task was executed from within another task",
    )[bool]

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True, description="indicates if task can run dynamic databand tasks"
    )[bool]

    task_retries = parameter.system(
        default=0,
        description="Total number of attempts to run the task. So task_retries=3 -> task can fail 3 times before we give up",
    )[int]

    task_retry_delay = parameter.system(
        default="15s",
        description="timedelta to wait before retrying a task. Example: 5s",
    )[datetime.timedelta]

    task_essence = TaskEssence.ORCHESTRATION

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the base classes and attach a ``TaskCtrl``."""
        super(Task, self).__init__(**kwargs)

        # used to communicate return value of "user function"
        self._dbnd_call_state = None  # type: Optional[TaskCallState]
        self.ctrl = TaskCtrl(self)

    def band(self):
        """
        Wiring hook; the default implementation does nothing and returns ``None``.

        Do not override this function, except in Pipeline/External tasks
        (NOTE(review): the original wording of this note was ambiguous - confirm).

        We do all wiring work in Meta classes only.
        Our implementation should never be coupled to code!
        """
        return

    def run(self):
        """
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        """
        pass  # default impl

    @property
    def task_outputs(self):
        """
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        """
        return self.ctrl.relations.task_outputs_user

    @property
    def task_dag(self):
        # type: (...)->_TaskDagNode
        """The DAG node of this task, as held by ``self.ctrl``."""
        return self.ctrl.task_dag

    @property
    def descendants(self):
        """The ``descendants`` object held by ``self.ctrl``."""
        return self.ctrl.descendants

    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist. Otherwise, return ``False``.

        Outputs whose config has ``overwrite_target`` set are ignored. When no
        outputs remain, completeness falls back to the existence of the task
        band (or ``False`` with a warning if there is no task band).

        When only some of the outputs exist, the inconsistency is either raised
        (``settings.run.validate_task_outputs_on_build``) or logged as a
        warning, and ``False`` is returned.

        However, you may freely override this method with custom logic.
        """
        # we check only user side task outputs
        # all system tasks outputs are not important (if the exists or not)
        # user don't see them
        outputs = [
            o for o in flatten(self.task_outputs) if not o.config.overwrite_target
        ]
        if not outputs:
            if not self.task_band:
                warnings.warn(
                    "Task %r without outputs has no custom complete() and no task band!"
                    % self,
                    stacklevel=2,
                )
                return False
            return self.task_band.exists()

        # Probe every output exactly once: exists() may be a remote call, and a
        # single pass guarantees each output lands in exactly one of the lists.
        complete_outputs = []
        incomplete_outputs = []
        for o in outputs:
            bucket = complete_outputs if o.exists() else incomplete_outputs
            bucket.append(str(o))

        if complete_outputs and incomplete_outputs:
            # partially complete task: some outputs exist, some don't
            exc = incomplete_output_found_for_task(
                self.task_name, complete_outputs, incomplete_outputs
            )

            if self.settings.run.validate_task_outputs_on_build:
                raise exc
            logger.warning(str(exc))

        return not incomplete_outputs

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        """The task run registered for ``self.task_id`` in the current databand run."""
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        """
        The default output that this Task produces.

        Use outputs! Override only if you are writing "base" class.
        """
        return NOTHING

    def _requires(self):
        """
        Override in "template" tasks which themselves are supposed to be subclassed.

        Must return an iterable which, among others, contains the _requires() of
        the superclass.
        """

    def _task_submit(self):
        """Task submission logic, by default we just call -> ``_task_run()`` -> ``run()``."""
        return self._task_run()

    def _task_run(self):
        """
        Execute :py:meth:`run` wrapped by the pre/post execution sync calls.

        ``run()`` is invoked inside ``auto_load_save_params`` with the current
        parameter values; its return value is returned. The post-execute sync
        is only reached when ``run()`` does not raise.
        """
        # bring all relevant files
        self.current_task_run.sync_local.sync_pre_execute()
        param_values = self.task_params.get_param_values()

        with auto_load_save_params(
            task=self, auto_read=self._conf_auto_read_params, param_values=param_values
        ):
            result = self.run()

        self.current_task_run.sync_local.sync_post_execute()
        # publish all relevant files
        return result

    @property
    def tracker(self):
        """Tracker of the current task run."""
        return self.current_task_run.tracker

    @property
    def metrics(self):
        """Alias of :py:attr:`tracker`, kept for backward compatibility."""
        # backward compatible code
        return self.tracker

    def get_template_vars(self):
        """
        Build the dict of variables available for templating.

        Contains the task itself, its family/name/signature/id and the
        serialized input parameters. When ``task_target_date`` is ``None`` the
        ``"task_target_date"`` entry is set to the literal string ``"input"``.
        """
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_family,
            "task_name": self.task_name,
            "task_signature": self.task_signature,
            "task_id": self.task_id,
        }
        base.update(self._params.get_params_serialized(ParameterFilters.INPUTS))
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        """
        Override this method to cleanup subprocesses when a task instance gets killed.

        Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.
        """

    def _get_task_output_path_format(self, output_mode):
        """
        Defines the format string used to generate all task outputs.

        Priority: the ``task_output_path_format`` parameter, then the class
        level ``_conf__base_output_path_fmt``, then the defaults from settings.

        For example:
           {root}/{env_label}/{task_target_date}/{task_name}/{task_name}{task_class_version}_{task_signature}/{output_name}{output_ext}
        """
        if self.task_output_path_format:
            # explicit input - first priority
            return self.task_output_path_format
        if self._conf__base_output_path_fmt:
            # from class definition
            return self._conf__base_output_path_fmt

        # default behaviour
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        """
        Create a target located under this task's output path.

        :param name: output name; when falsy, a random
            ``tmp/dbnd-tmp-<9 digits>`` name is generated
        :param config: target config; defaults to a fresh ``TargetConfig()``
        :param output_ext: forwarded to ``calculate_path``
        :param output_mode: used to select the path format
        :return: ``target(path, config=config)``
        """
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=path_pattern,
        )

        return target(path, config=config)

    def get_root(self):
        """Return the ``root`` of the task environment."""
        return self.task_env.root

    def _initialize(self):
        """Run the base initialization, then initialize the task controller."""
        super(Task, self)._initialize()
        self.ctrl._initialize_task()

    def _should_run(self):
        """
        Tell whether the task is allowed to run.

        A disabled task never runs. In a production environment the task runs
        when either ``task_enabled_in_prod`` or ``settings.run.enable_prod`` is
        truthy (that value is returned as is). Otherwise returns ``True``.
        """
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    def _save_param(self, parameter, original_value, current_value):
        # type: (ParameterDefinition, Any, Any) -> None
        """
        Dump ``current_value`` into the target ``original_value`` and track it.

        The write is reported to the tracker of the current task run (when one
        exists) with status OK or NOK; a failure to report is only logged.

        :raises: the error built by
            ``friendly_error.task_execution.failed_to_save_value_to_target``
            when dumping the value fails
        """
        # it's output! we are going to save it.
        # task run doesn't always exist
        task_run = try_get_current_task_run()
        access_status = DbndTargetOperationStatus.OK
        try:
            # it's a workaround, we don't want to change parameter for outputs (dynamically)
            # however, we need proper value type to "dump" preview an other meta.
            # we will update it only for In memory targets only for now
            if isinstance(original_value, InMemoryTarget):
                parameter.value_type = get_value_type_of_obj(
                    current_value, parameter.value_type
                )

            parameter.dump_to_target(original_value, current_value)

        except Exception as ex:
            access_status = DbndTargetOperationStatus.NOK
            raise friendly_error.task_execution.failed_to_save_value_to_target(
                ex, self, parameter, original_value, current_value
            )
        finally:
            # tracking is best effort: it runs for both success and failure and
            # must never mask the outcome of the save itself
            if task_run:
                try:
                    task_run.tracker.log_parameter_data(
                        parameter=parameter,
                        target=original_value,
                        value=current_value,
                        operation_type=DbndTargetOperationType.write,
                        operation_status=access_status,
                    )
                except Exception as ex:
                    logger.warning("Failed to log target to tracking store. %s", ex)

    @dbnd_handle_errors(exit_on_error=False)
    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """Run task via Databand execution system."""
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        run = ctx.dbnd_run_task(self)
        return run