class RunConfig(config.Config):
    """Databand's per run settings (e.g. execution date)"""

    _conf__task_family = "run"

    # on none generated at DatabandRun
    name = parameter.value(default=None, description="Specify run name")[str]
    description = parameter.value(default=None, description="Specify run description")[
        Optional[str]
    ]

    parallel = parameter(default=None)[bool]
    task_executor_type = parameter(
        default=None,
        description="Alternate executor type: "
        " local/airflow_inprocess/airflow_multiprocess_local/airflow_kubernetes,"
        " see docs for more options",
    )[str]

    submit_driver = parameter(
        description="override env.submit_driver for specific environment"
    ).none[bool]
    submit_tasks = parameter(
        description="override env.submit_tasks for specific environment"
    ).none[bool]
    enable_airflow_kubernetes = parameter(
        default=True,
        description="Enable use of kubernetes executor for kubernetes engine submission",
    )[bool]

    execution_date = parameter(default=None, description="Override execution date")[
        datetime
    ]

    # Execution specific
    id = parameter(default=None, description="The list of task ids to run")[List[str]]
    selected_tasks_regex = parameter(
        default=None, description="Run only specified tasks (regular expression)"
    )[List[str]]
    ignore_dependencies = parameter(
        description="Ignore upstream task dependencies and run only the selected tasks"
    ).value(False)
    ignore_first_depends_on_past = parameter(
        description="Ignore depends_on_past dependencies for the first set of tasks only"
    ).value(False)
    pool = parameter(default=None, description="Resource pool to use")[str]
    donot_pickle = parameter(
        description="Do not attempt to pickle the DAG object to send over "
        "to the workers, just tell the workers to run their version "
        "of the code."
    ).value(False)
    mark_success = parameter(
        description="Mark jobs as succeeded without running them"
    ).value(False)
    skip_completed = parameter(
        description="Skip tasks that have already been completed"
    ).value(True)
    fail_fast = parameter(
        description="Skip all remaining tasks if a task has failed"
    ).value(True)
    enable_prod = parameter(description="Enable production tasks").value(False)
    is_archived = parameter(description="Save this run in the archive").value(False)

    heartbeat_interval_s = parameter(
        description="How often a run should send a heartbeat to the server. Set -1 to disable"
    )[int]
    heartbeat_timeout_s = parameter(
        description="How old can a run's last heartbeat be before we consider it failed. Set -1 to disable"
    )[int]
    heartbeat_sender_log_to_file = parameter(
        description="Create a separate log file for the heartbeat sender and don't log the run process stdout"
    )[bool]

    open_web_tracker_in_browser = parameter(
        description="If True, open web tracker in browser during task run."
    ).value(False)
    enable_concurent_sqlite = parameter(
        description="Enable concurrent execution with sqlite db (use only for debug!)"
    ).value(False)

    interactive = parameter(
        default=False,
        description="When submitting driver to remote execution keep tracking of "
        "the submitted process and wait for completion",
    )[bool]

    skip_completed_on_run = parameter(default=True).help(
        "Should dbnd task check that task is completed and mark it as re-used on task execution"
    )[bool]
    validate_task_inputs = parameter(default=True).help(
        "Should dbnd task check that all input files exist"
    )[bool]
    validate_task_outputs = parameter(default=True).help(
        "Should dbnd task check that all outputs exist after task has been executed"
    )[bool]
    validate_task_outputs_on_build = parameter(default=False).help(
        "Should dbnd task check that there are no incomplete outputs before task executes"
    )[bool]
    tracking_with_cache = parameter(default=False).help(
        "Should dbnd cache results during tracking"
    )[bool]
    pipeline_band_only_check = parameter(default=False).help(
        "When checking if pipeline is completed, check only if the band file exists (skip the tasks)"
    )[bool]
    task_complete_parallelism_level = parameter(default=1).help(
        "Number of threads to use when checking if tasks are already complete"
    )[int]
    dry = parameter(default=False).help(
        "Do not execute tasks, stop before sending them to the execution, and print their status"
    )[bool]
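
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the three parameter
# declaration styles used by RunConfig above, shown on a hypothetical extra
# Config section. It assumes the same `config.Config` base and `parameter`
# factory that this module already imports; the class and field names below
# are invented for the example.
class ExampleRunExtrasConfig(config.Config):
    """Hypothetical extra per-run settings (example only)."""

    _conf__task_family = "run_extras"

    # default and description as keyword arguments, value type via indexing
    max_retries = parameter(default=3, description="How many retries to allow")[int]

    # description as keyword argument, default supplied through .value(...)
    notify_on_failure = parameter(
        description="Send a notification when the run fails"
    ).value(False)

    # default as keyword argument, description supplied through .help(...)
    worker_pool_size = parameter(default=4).help(
        "Number of workers used for completion checks"
    )[int]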
class RunConfig(config.Config):
    """Databand's per run settings (e.g. execution date)"""

    _conf__task_family = "run"

    ######
    # on none generated at DatabandRun
    name = parameter.value(default=None, description="Specify run name")[str]
    description = parameter.value(default=None, description="Specify run description")[
        Optional[str]
    ]

    # Executor configuration
    parallel = parameter(default=None)[bool]
    task_executor_type = parameter(
        default=None,
        description="Alternate executor type: "
        " local/airflow_inprocess/airflow_multiprocess_local/airflow_kubernetes,"
        " see docs for more options",
    )[str]
    enable_airflow_kubernetes = parameter(
        default=True,
        description="Enable use of kubernetes executor for kubernetes engine submission",
    )[bool]

    ######
    # Local/Remote control
    interactive = parameter(
        default=False,
        description="When submitting driver to remote execution keep tracking of "
        "the submitted process and wait for completion",
    )[bool]
    submit_driver = parameter(
        description="override env.submit_driver for specific environment"
    ).none[bool]
    submit_tasks = parameter(
        description="override env.submit_tasks for specific environment"
    ).none[bool]

    # What to do on run
    open_web_tracker_in_browser = parameter(
        description="If True, open web tracker in browser during task run."
    ).value(False)
    is_archived = parameter(description="Save this run in the archive").value(False)
    dry = parameter(default=False).help(
        "Do not execute tasks, stop before sending them to the execution, and print their status"
    )[bool]
    run_result_json_path = parameter(default=None).help(
        "The path to save the task band of the run"
    )[str]
    debug_pydevd_pycharm_port = parameter(default=None).help(
        "Enable debugging with `pydevd_pycharm` by setting this to the port value expecting the debugger to connect.\n"
        "This will start a new `settrace` connecting to `localhost` on the requested port, "
        "right before starting the driver task_run."
    )[int]

    ######
    # AIRFLOW EXECUTOR CONFIG
    execution_date = parameter(default=None, description="Override execution date")[
        datetime
    ]
    mark_success = parameter(
        description="Mark jobs as succeeded without running them"
    ).value(False)

    ######
    # Task Selectors (to schedule specific task from pipeline)
    id = parameter(default=None, description="The list of task ids to run")[List[str]]
    selected_tasks_regex = parameter(
        default=None, description="Run only specified tasks (regular expression)"
    )[List[str]]
    ignore_dependencies = parameter(
        description="Ignore upstream task dependencies and run only the selected tasks"
    ).value(False)
    ignore_first_depends_on_past = parameter(
        description="Ignore depends_on_past dependencies for the first set of tasks only"
    ).value(False)

    ######
    # Scheduler configuration
    skip_completed = parameter(
        description="Skip tasks that have already been completed"
    ).value(True)
    fail_fast = parameter(
        description="Skip all remaining tasks if a task has failed"
    ).value(True)
    enable_prod = parameter(description="Enable production tasks").value(False)

    skip_completed_on_run = parameter(default=True).help(
        "Should dbnd task check that task is completed and mark it as re-used on task execution"
    )[bool]
    validate_task_inputs = parameter(default=True).help(
        "Should dbnd task check that all input files exist"
    )[bool]
    validate_task_outputs = parameter(default=True).help(
        "Should dbnd task check that all outputs exist after task has been executed"
    )[bool]
    validate_task_outputs_on_build = parameter(default=False).help(
        "Should dbnd task check that there are no incomplete outputs before task executes"
    )[bool]
    pipeline_band_only_check = parameter(default=False).help(
        "When checking if pipeline is completed, check only if the band file exists (skip the tasks)"
    )[bool]
    recheck_circle_dependencies = parameter(
        description="Re-check circular dependencies on every task creation;"
        " use it if you need to find a circle in your graph"
    ).value(False)
    task_complete_parallelism_level = parameter(default=1).help(
        "Number of threads to use when checking if tasks are already complete"
    )[int]
    pool = parameter(default=None, description="Resource pool to use")[str]

    ######
    # Advanced Run settings (debug/workarounds)

    # run .pickle file
    always_save_pipeline = parameter(
        description="Boolean for always saving pipeline to pickle"
    ).value(False)
    disable_save_pipeline = parameter(
        description="Boolean for disabling pipeline pickling"
    ).value(False)
    donot_pickle = parameter(
        description="Do not attempt to pickle the DAG object to send over "
        "to the workers, just tell the workers to run their version "
        "of the code."
    ).value(False)
    pickle_handler = parameter(
        default=None,
        description="Defines a python pickle handler to be used to pickle the "
        "run's data",
    )[str]
    enable_concurent_sqlite = parameter(
        description="Enable concurrent execution with sqlite db (use only for debug!)"
    ).value(False)

    ######
    # HEARTBEAT (process that updates the driver status every `heartbeat_interval_s`)
    heartbeat_interval_s = parameter(
        description="How often a run should send a heartbeat to the server. Set -1 to disable"
    )[int]
    heartbeat_timeout_s = parameter(
        description="How old can a run's last heartbeat be before we consider it failed. Set -1 to disable"
    )[int]
    heartbeat_sender_log_to_file = parameter(
        description="Create a separate log file for the heartbeat sender and don't log the run process stdout"
    )[bool]
    hearbeat_disable_plugins = parameter(
        default=False, description="Disable dbnd plugins at heartbeat sub-process"
    )[bool]

    ######
    # Task/Pipeline in task Execution
    task_run_at_execution_time_enabled = parameter(
        default=True, description="Allow task calls during another task execution"
    )[bool]
    task_run_at_execution_time_in_memory_outputs = parameter(
        default=False,
        description="Store outputs for inline task at execution time in memory "
        "(do not use FileSystem)",
    )[bool]
    target_cache_on_access = parameter(
        default=True, description="Cache target values in memory during execution"
    )[bool]
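
# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical task, not part of this module): at runtime
# these settings are reachable from a task as `self.settings.run.<parameter>`,
# which is how Task._should_run() below consults `enable_prod` and how
# Task._complete() consults `validate_task_outputs_on_build`. The imports and
# the class below are assumptions made for the example.
import logging

from dbnd import Task, parameter  # assumed public dbnd imports

example_logger = logging.getLogger(__name__)


class RunSettingsAwareTask(Task):
    rows = parameter(default=10)[int]

    def run(self):
        run_conf = self.settings.run  # the RunConfig instance of the current run
        if run_conf.dry:
            return  # nothing to execute on a dry run
        if run_conf.fail_fast:
            example_logger.info("fail_fast is enabled; remaining tasks stop on failure")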
class Task(_BaseTask, _TaskParamContainer):
    """
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    The key methods of a Task, which must be implemented in a subclass, are:

    * :py:meth:`run` - the computation done by this task.

    Each :py:class:`~dbnd.parameter` of the Task should be declared as members:

    .. code:: python

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]

    In addition to any declared properties and methods, there are a few
    non-declared properties, which are created by the :py:class:`TaskMetaclass`
    metaclass:
    """

    """
    This value can be overridden to set the namespace that will be used.
    (See :ref:`Task.namespaces_famlies_and_ids`)
    If it's not specified and you try to read this value anyway, it will return
    garbage. Please use :py:meth:`get_task_namespace` to read the namespace.

    Note that setting this value with ``@property`` will not work, because this
    is a class level value.
    """

    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        system=True,
        output_name="meta",
        output_ext="",
        target_config=folder,
        significant=False,
        description="Location of all internal outputs (e.g. metrics)",
    )
    task_band = output.json(output_name="band")

    task_enabled = parameter.system(scope=ParameterScope.children)[bool]
    task_enabled_in_prod = parameter.system(scope=ParameterScope.children)[bool]

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        description="task version, directly affects task signature",
        scope=ParameterScope.children,
    )[VersionStr]

    task_class_version = parameter.value(
        default=DEFAULT_CLASS_VERSION,
        system=True,
        description="task code version, "
        "use while you want persistent change in your task version",
    )

    task_env = parameter.value(
        description="task environment name", scope=ParameterScope.children
    )[EnvConfig]
    task_target_date = parameter(
        description="task data target date", scope=ParameterScope.children
    )[datetime.date]

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs"
    )[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = parameter.system(
        scope=ParameterScope.children, description="Store all task outputs in memory"
    )[bool]

    task_is_dynamic = parameter.system(
        scope=ParameterScope.children,
        description="task was executed from within another task",
    )[bool]

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True, description="indicates if task can run dynamic databand tasks"
    )[bool]

    task_retries = parameter.system(
        description="Total number of attempts to run the task. "
        "So task_retries=3 -> task can fail 3 times before we give up"
    )[int]
    task_retry_delay = parameter.system(
        description="timedelta to wait before retrying a task. Example: 5s"
    )[datetime.timedelta]

    _dbnd_call_state = None  # type: TaskCallState

    def __init__(self, **kwargs):
        super(Task, self).__init__(**kwargs)
        self.ctrl = TaskCtrl(self)

    def band(self):
        """
        Please, do not override this function; only Pipeline/External tasks should!
        We do all wiring work in Meta classes only.
        Our implementation should never be coupled to code!
        :return:
        """
        return

    def run(self):
        """
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        """
        pass  # default impl

    @property
    def task_outputs(self):
        """
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.

        See :ref:`Task.task_outputs`
        """
        return self.ctrl.relations.task_outputs_user

    @property
    def task_dag(self):
        # type: (...)->_TaskDagNode
        return self.ctrl.task_dag

    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user side task outputs
        # all system task outputs are not important (whether they exist or not),
        # the user doesn't see them
        outputs = flatten(self.task_outputs)
        if len(outputs) == 0:
            warnings.warn(
                "Task %r without outputs has no custom complete() method" % self,
                stacklevel=2,
            )
            return False

        return all((o.exists() for o in outputs))

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        """
        The default output that this Task produces. Use outputs!
        Override only if you are writing a "base" class.
        """
        return NOTHING

    def _requires(self):
        """
        Override in "template" tasks which themselves are supposed to be subclassed.

        Must return an iterable which among others contains the _requires() of
        the superclass.

        See :ref:`Task.requires`
        """
        pass

    def _task_submit(self):
        """
        Task submission logic, by default we just call -> _task_run() -> run()
        """
        return self._task_run()

    def _task_run(self):
        # bring all relevant files
        self.current_task_run.sync_local.sync_pre_execute()

        with self._auto_load_save_params(
            auto_read=self._conf_auto_read_params, save_on_change=True
        ):
            result = self.run()

        self.current_task_run.sync_local.sync_post_execute()
        # publish all relevant files
        return result

    def set_upstream(self, task_or_task_list):
        self.task_dag.set_upstream(task_or_task_list)

    def set_downstream(self, task_or_task_list):
        self.task_dag.set_downstream(task_or_task_list)

    def __lshift__(self, other):
        return self.set_upstream(other)

    def __rshift__(self, other):
        return self.set_downstream(other)

    def set_global_upstream(self, task_or_task_list):
        self.task_dag.set_global_upstream(task_or_task_list)

    @property
    def metrics(self):
        # backward compatible code
        return self.current_task_run.tracker

    def log_dataframe(
        self,
        key,
        df,
        with_preview=True,
        with_schema=True,
        with_size=True,
        with_stats=False,
    ):
        meta_conf = ValueMetaConf(
            log_preview=with_preview,
            log_schema=with_schema,
            log_size=with_size,
            log_stats=with_stats,
        )
        self.metrics.log_dataframe(key, df, meta_conf=meta_conf)

    def log_metric(self, key, value, source=None):
        """
        Logs the passed-in metric under the current run, creating a run if necessary.

        :param key: Metric name (string)
        :param value: Metric value (string)
        """
        return self.metrics.log_metric(key, value, source=source)

    def log_system_metric(self, key, value):
        """Shortcut for log_metric(..., source="system")"""
        return self.log_metric(key, value, source="system")

    def log_artifact(self, name, artifact):
        """Log a local file or directory as an artifact of the currently active run."""
        return self.metrics.log_artifact(name, artifact)

    def get_template_vars(self):
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_meta.task_family,
            "task_name": self.task_meta.task_name,
            "task_signature": self.task_meta.task_signature,
            "task_id": self.task_meta.task_id,
        }
        base.update(self._params.get_params_serialized(input_only=True))
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        """
        Override this method to cleanup subprocesses when a task instance
        gets killed. Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.
        """
        pass

    def _get_task_output_path_format(self, output_mode):
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self._conf__base_output_path_fmt or self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=path_pattern,
        )

        return target(path, config=config)

    def get_root(self):
        return self.task_env.root

    def _initialize(self):
        super(Task, self)._initialize()
        self.ctrl._initialize_task()

    def _should_run(self):
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    @dbnd_handle_errors(exit_on_error=False)
    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """
        Run task via Databand execution system
        """
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        result = ctx.dbnd_run_task(self)
        return result
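
# ---------------------------------------------------------------------------
# Illustrative usage sketch (hypothetical classes, not part of this module):
# two tasks that record values through `log_metric` defined above, wired with
# the `set_downstream` / `>>` helpers this class provides. Task wiring is
# normally done inside a pipeline's band() and may require an active Databand
# context, so the wiring lines are shown as comments only.
class ExtractRawData(Task):
    source = parameter(default="s3://bucket/raw")[str]

    def run(self):
        self.log_metric("source", self.source)


class LoadWarehouse(Task):
    def run(self):
        self.log_metric("status", "loaded")


# extract = ExtractRawData()
# load = LoadWarehouse()
# extract >> load   # same as extract.set_downstream(load)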
class Task(_TaskWithParams, _TaskCtrlMixin, _TaskParamContainer):
    """
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    A ``run`` method must be present in a subclass.

    Each ``parameter`` of the Task should be declared as members::

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]
    """

    _conf_confirm_on_kill_msg = None  # get user confirmation on task kill if not empty
    _conf__require_run_dump_file = False

    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        system=True,
        output_name="meta",
        output_ext="",
        target_config=folder,
        significant=False,
        description="Location of all internal outputs (e.g. metrics)",
    )
    task_band = output.json(output_name="band", system=True)

    task_enabled = system_passthrough_param(default=True)[bool]
    task_enabled_in_prod = system_passthrough_param(default=True)[bool]

    validate_no_extra_params = ParamValidation.error

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        default="1",
        description="task version, directly affects task signature",
        scope=ParameterScope.children,
    )[VersionStr]

    task_class_version = parameter.value(
        default=DEFAULT_CLASS_VERSION,
        system=True,
        description="task code version, "
        "use while you want persistent change in your task version",
    )

    task_env = parameter.value(
        default="local",
        description="task environment name",
        scope=ParameterScope.children,
    )[EnvConfig]

    task_target_date = parameter(
        default="today",
        description="task data target date",
        scope=ParameterScope.children,
    )[datetime.date]

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs"
    )[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = system_passthrough_param(
        default=False, description="Store all task outputs in memory"
    )[bool]

    task_output_path_format = system_passthrough_param(
        default=None, description="Format string used to generate task output paths"
    )[str]

    task_is_dynamic = system_passthrough_param(
        default=False,
        scope=ParameterScope.children,
        description="task was executed from within another task",
    )[bool]

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True, description="indicates if task can run dynamic databand tasks"
    )[bool]

    task_retries = parameter.system(
        default=0,
        description="Total number of attempts to run the task. "
        "So task_retries=3 -> task can fail 3 times before we give up",
    )[int]
    task_retry_delay = parameter.system(
        default="15s",
        description="timedelta to wait before retrying a task. Example: 5s",
    )[datetime.timedelta]

    task_essence = TaskEssence.ORCHESTRATION

    def __init__(self, **kwargs):
        super(Task, self).__init__(**kwargs)

        # used to communicate return value of "user function"
        self._dbnd_call_state = None  # type: Optional[TaskCallState]
        self.ctrl = TaskCtrl(self)

    def band(self):
        """
        Please, do not override this function; only Pipeline/External tasks should!
        We do all wiring work in Meta classes only.
        Our implementation should never be coupled to code!
        """
        return

    def run(self):
        """
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        """
        pass  # default impl

    @property
    def task_outputs(self):
        """
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        """
        return self.ctrl.relations.task_outputs_user

    @property
    def task_dag(self):
        # type: (...)->_TaskDagNode
        return self.ctrl.task_dag

    @property
    def descendants(self):
        return self.ctrl.descendants

    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user side task outputs
        # all system task outputs are not important (whether they exist or not),
        # the user doesn't see them
        outputs = [
            o for o in flatten(self.task_outputs) if not o.config.overwrite_target
        ]
        if len(outputs) == 0:
            if not self.task_band:
                warnings.warn(
                    "Task %r without outputs has no custom complete() and no task band!"
                    % self,
                    stacklevel=2,
                )
                return False
            else:
                return self.task_band.exists()

        incomplete_outputs = [str(o) for o in outputs if not o.exists()]
        num_of_incomplete_outputs = len(incomplete_outputs)

        if 0 < num_of_incomplete_outputs < len(outputs):
            complete_outputs = [str(o) for o in outputs if o.exists()]
            exc = incomplete_output_found_for_task(
                self.task_name, complete_outputs, incomplete_outputs
            )

            if self.settings.run.validate_task_outputs_on_build:
                raise exc
            else:
                logger.warning(str(exc))

        return num_of_incomplete_outputs == 0

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        """
        The default output that this Task produces. Use outputs!
        Override only if you are writing a "base" class.
        """
        return NOTHING

    def _requires(self):
        """
        Override in "template" tasks which themselves are supposed to be subclassed.

        Must return an iterable which, among others, contains the _requires() of
        the superclass.
        """

    def _task_submit(self):
        """Task submission logic, by default we just call -> ``_task_run()`` -> ``run()``."""
        return self._task_run()

    def _task_run(self):
        # bring all relevant files
        self.current_task_run.sync_local.sync_pre_execute()

        param_values = self.task_params.get_param_values()
        with auto_load_save_params(
            task=self, auto_read=self._conf_auto_read_params, param_values=param_values
        ):
            result = self.run()

        self.current_task_run.sync_local.sync_post_execute()
        # publish all relevant files
        return result

    @property
    def tracker(self):
        return self.current_task_run.tracker

    @property
    def metrics(self):
        # backward compatible code
        return self.tracker

    def get_template_vars(self):
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_family,
            "task_name": self.task_name,
            "task_signature": self.task_signature,
            "task_id": self.task_id,
        }
        base.update(self._params.get_params_serialized(ParameterFilters.INPUTS))
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        """
        Override this method to cleanup subprocesses when a task instance
        gets killed. Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.
        """

    def _get_task_output_path_format(self, output_mode):
        """
        Defines the format string used to generate all task outputs.

        For example:
            {root}/{env_label}/{task_target_date}/{task_name}/{task_name}{task_class_version}_{task_signature}/{output_name}{output_ext}
        """
        if self.task_output_path_format:
            # explicit input - first priority
            return self.task_output_path_format
        if self._conf__base_output_path_fmt:
            # from class definition
            return self._conf__base_output_path_fmt

        # default behaviour
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=path_pattern,
        )

        return target(path, config=config)

    def get_root(self):
        return self.task_env.root

    def _initialize(self):
        super(Task, self)._initialize()
        self.ctrl._initialize_task()

    def _should_run(self):
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    def _save_param(self, parameter, original_value, current_value):
        # type: (ParameterDefinition, Any, Any) -> None
        # it's an output! we are going to save it.
        # task run doesn't always exist
        task_run = try_get_current_task_run()
        access_status = DbndTargetOperationStatus.OK

        try:
            if isinstance(original_value, InMemoryTarget):
                parameter.value_type = get_value_type_of_obj(
                    current_value, parameter.value_type
                )

            parameter.dump_to_target(original_value, current_value)
            # it's a workaround, we don't want to change parameter for outputs (dynamically)
            # however, we need a proper value type to "dump" preview and other meta.
            # we will update it only for InMemory targets, and only for now
        except Exception as ex:
            access_status = DbndTargetOperationStatus.NOK
            raise friendly_error.task_execution.failed_to_save_value_to_target(
                ex, self, parameter, original_value, current_value
            )
        finally:
            if task_run:
                try:
                    task_run.tracker.log_parameter_data(
                        parameter=parameter,
                        target=original_value,
                        value=current_value,
                        operation_type=DbndTargetOperationType.write,
                        operation_status=access_status,
                    )
                except Exception as ex:
                    logger.warning("Failed to log target to tracking store. %s", ex)

    @dbnd_handle_errors(exit_on_error=False)
    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """Run task via Databand execution system."""
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        run = ctx.dbnd_run_task(self)
        return run
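
# ---------------------------------------------------------------------------
# Illustrative usage sketch (hypothetical task, not part of this module): a
# minimal orchestration task that logs a value through the tracker exposed
# above (`self.metrics` is the backward-compatible alias for the task run
# tracker) and is executed through `dbnd_run()`, which returns a DatabandRun.
# The class, parameter, and entry point are assumptions made for the example.
class PrepareMonthlyReport(Task):
    month = parameter(default="2021-01")[str]

    def run(self):
        self.metrics.log_metric("month_processed", self.month)


if __name__ == "__main__":
    # dbnd_run() drives the task through the Databand execution system
    run = PrepareMonthlyReport(month="2021-02").dbnd_run()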