def namespace(namespace=None, scope=""): """ Call to set namespace of tasks declared after the call. It is often desired to call this function with the keyword argument ``scope=__name__``. The ``scope`` keyword makes it so that this call is only effective for task classes with a matching [*]_ ``__module__``. The default value for ``scope`` is the empty string, which means all classes. Multiple calls with the same scope simply replace each other. The namespace of a :py:class:`Task` can also be changed by specifying the property ``task_namespace``. .. code-block:: python class Task2(dbnd.Task): task_namespace = 'namespace2' This explicit setting takes priority over whatever is set in the ``namespace()`` method, and it's also inherited through normal python inheritence. There's no equivalent way to set the ``task_family``. .. [*] When there are multiple levels of matching module scopes like ``a.b`` vs ``a.b.c``, the more specific one (``a.b.c``) wins. .. seealso:: The new and better scaling :py:func:`auto_namespace` """ get_task_registry().register_namespace(scope=scope, namespace=namespace or "")
def namespace(namespace=None, scope=""): """ Call to set namespace of tasks declared after the call. It is often desired to call this function with the keyword argument ``scope=__name__``. The ``scope`` keyword makes it so that this call is only effective for task classes with a matching ``__module__``. The default value for ``scope`` is the empty string, which means all classes. Multiple calls with the same scope simply replace each other. The namespace of a ``Task`` can also be changed by specifying the property ``task_namespace``. :: class Task2(dbnd.Task): task_namespace = 'namespace2' This explicit setting takes priority over whatever is set in the ``namespace()`` method, and it's also inherited through normal python inheritence. There's no equivalent way to set the ``task_family``. """ get_task_registry().register_namespace(scope=scope, namespace=namespace or "")
def test_no_error_on_same_from(self):
    @task
    def task_with_from():
        return

    with config({"task_with_from": {"_from": "task_with_from"}}):
        get_task_registry().build_dbnd_task("task_with_from")

def test_error_on_same_from(self):
    with pytest.raises(Exception):
        with config(
            {"unknown_task_with_from": {"_from": "unknown_task_with_from"}}
        ):
            get_task_registry().build_dbnd_task("unknown_task_with_from")

def parse_from_str(self, input):
    """
    Parse a task_family using the :class:`~dbnd._core.register.Register`
    """
    task_cls = get_task_registry().get_task_cls(input)
    return task_cls()

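# --- Hedged sketch of parse_from_str() above in action ---
# Given a registered task family name (here a made-up "prepare_data"), the parameter
# resolves the string to a task class via the registry and returns a fresh instance:
#
#   some_task_parameter.parse_from_str("prepare_data")  # -> PrepareData()
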
def class_or_func_decorator(class_or_func):
    # this code will run during compile time, when we apply the dbnd decorator (for example: @task)
    task_decorator = TaskDecorator(class_or_func, decorator_kwargs=decorator_kwargs)
    tp = task_decorator.task_passport

    # we need to manually register the task here, since in the regular flow
    # this happens in TaskMetaclass, but it's not invoked here due to lazy
    # evaluation of task_cls
    r = get_task_registry()
    r.register_task_cls_factory(
        task_cls_factory=task_decorator.get_task_cls,
        full_task_family=tp.full_task_family,
        task_family=tp.task_family,
    )

    if task_decorator.is_class:
        # we will change the metaclass for UserClass so we will process all UserClass calls
        #
        #   @task
        #   class UserClass():
        #       pass
        #
        # so the moment the user calls UserClass(), _DecoratedUserClassMeta.__call__ will be called
        dbnd_decorated_class = six.add_metaclass(_UserClassWithTaskDecoratorMetaclass)(
            class_or_func
        )
        dbnd_decorated_class.task_decorator = task_decorator
        task_decorator.class_or_func = dbnd_decorated_class
        return dbnd_decorated_class
    else:
        #   @task
        #   def user_func():
        #       pass
        #
        # we will return our wrapper, which will be called during runtime,
        # when the user calls their own code.
        return build_dbnd_decorated_func(task_decorator)

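# --- Illustrative sketch of the class branch above (names are made up) ---
# Decorating a user class swaps in _UserClassWithTaskDecoratorMetaclass, so a plain
# instantiation is intercepted and the task factory is already in the registry:
#
#   @task
#   class UserClass(object):
#       def __init__(self, a=1):
#           self.a = a
#
#   UserClass()  # routed through _UserClassWithTaskDecoratorMetaclass.__call__
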
def _build(
    cls,
    cls_name,
    module_name,
    task_namespace,
    conf__task_family,
):
    full_task_family = "%s.%s" % (module_name, cls_name)
    full_task_family_short = "%s.%s" % (_short_name(module_name), cls_name)

    if not is_defined(task_namespace):
        namespace_at_class_time = get_task_registry().get_namespace(module_name)
        if namespace_at_class_time == _SAME_AS_PYTHON_MODULE:
            task_namespace = module_name
        else:
            task_namespace = namespace_at_class_time

    if conf__task_family:
        task_family = conf__task_family
        task_config_section = task_family
    elif task_namespace:
        task_family = "{}.{}".format(task_namespace, cls_name)
        task_config_section = task_family
    else:
        task_family = cls_name
        task_config_section = full_task_family

    return TaskPassport(
        full_task_family=full_task_family,
        full_task_family_short=full_task_family_short,
        task_family=task_family,
        task_config_section=task_config_section,
    )

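# --- Worked example for _build() above (module, class and namespace names are made up;
#     the exact short form depends on _short_name) ---
# cls_name="PrepareData", module_name="myproject.pipelines", task_namespace="ns1",
# conf__task_family not set:
#   full_task_family    = "myproject.pipelines.PrepareData"
#   task_family         = "ns1.PrepareData"
#   task_config_section = "ns1.PrepareData"
# With no namespace registered at class time (and none inherited):
#   task_family         = "PrepareData"
#   task_config_section = "myproject.pipelines.PrepareData"
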
def __new__(mcs, classname, bases, classdict):
    """
    Custom class creation for namespacing.

    Also register all subclasses.

    When the set or inherited namespace evaluates to ``None``, set the task
    namespace to whatever the currently declared namespace is.
    """
    cls = super(TaskMetaclass, mcs).__new__(
        mcs, classname, bases, classdict
    )  # type: typing.Type[_BaseTask]

    # we are starting from a "not clean" classdict -> it's deserialization
    if classdict.get("task_definition") is not None:
        return cls

    td = cls.task_definition = TaskDefinition(cls, classdict)

    # now we will assign all params
    set_params = td.class_params if cls.is_tracking_mode else td.all_task_params
    for k, v in six.iteritems(set_params):
        setattr(cls, k, v)

    # every time we see a new implementation, we want it to have priority over the old implementation
    # we need to switch to dict() and store history elsewhere
    r = get_task_registry()
    r.register_task(cls)
    return cls

def __new__(mcs, classname, bases, classdict):
    """
    Custom class creation for namespacing.

    Also register all subclasses.

    When the set or inherited namespace evaluates to ``None``, set the task
    namespace to whatever the currently declared namespace is.
    """
    cls = super(TaskMetaclass, mcs).__new__(
        mcs, classname, bases, classdict
    )  # type: typing.Type[_BaseTask]

    # we are starting from a "not clean" classdict ->
    #   A. it's deserialization
    #   B. it was calculated before
    if classdict.get("task_definition") is not None:
        return cls

    cls.task_definition = TaskDefinition.from_task_cls(task_class=cls, classdict=classdict)

    # now we will assign all calculated parameters
    # so instead of ParameterFactory, we will have ParameterDefinition
    for k, v in six.iteritems(cls.task_definition.task_param_defs):
        setattr(cls, k, v)

    # every time we see a new implementation, we want it to have priority over the old implementation
    # we need to switch to dict() and store history elsewhere
    r = get_task_registry()
    r.register_task(cls)
    return cls

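# --- Illustrative sketch (assumes dbnd.Task is built with the TaskMetaclass shown above) ---
import dbnd


class PrepareData(dbnd.Task):
    # merely defining the class runs TaskMetaclass.__new__: the TaskDefinition is
    # built, parameter definitions are attached, and the class is registered
    pass


# the class can then be looked up by family name, as the registry tests below do:
#   get_task_registry().get_task_cls("PrepareData")
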
def _get_task_cls(luigi_task):
    # type: (luigi.Task) -> Type[_LuigiTask]
    """
    Returns the right dbnd-luigi class wrapper, based on an existing relevant
    tracker or by creating a new one.
    """
    task_family = luigi_task.get_task_family()
    registry = get_task_registry()
    try:
        dbnd_task_cls = registry.get_task_cls(str(task_family))
    except TaskClassNotFoundException:
        dbnd_task_cls = _build_new_task_cls(luigi_task)
        logger.info("Creating new class %s", task_family)
    return dbnd_task_cls

def test_auto_complete_renew(self):
    @task
    def my_task_autocomplete(a):
        # type: (int) -> str
        """ my task help """
        return "ok"

    task_registry = get_task_registry()
    task_classes = task_registry.list_dbnd_task_classes()
    logging.info("task_classes: %s", list(task_registry.list_dbnd_task_classes()))
    completer.refresh(task_classes)

    task_completer = completer.task()
    actual = task_completer(None, None, "my_tas")
    assert actual == [("my_task_autocomplete", "my task help")]

def _build_root_task(self, run):
    # type: (DatabandRun) -> Task
    if self.is_submitter and not self.is_driver:
        return self._build_submit_task(run)
    else:
        if run.root_task:
            # user has created DatabandRun with an existing task
            self.task_meta.add_child(run.root_task.task_id)
            return run.root_task

        logger.info("Building main task '%s'", run.root_task_name)
        root_task = get_task_registry().build_dbnd_task(run.root_task_name)
        logger.info(
            "Task %s has been created (%s children)",
            root_task.task_id,
            len(root_task.ctrl.task_dag.subdag_tasks()),
        )
        return root_task

def _list_tasks(ctx, module, search, is_config):
    from dbnd import Config
    from dbnd._core.context.databand_context import new_dbnd_context
    from dbnd._core.parameter.parameter_definition import _ParameterKind

    formatter = ctx.make_formatter()

    load_user_modules(config, modules=module)

    with new_dbnd_context():
        tasks = get_task_registry().list_dbnd_task_classes()

    for task_cls in tasks:
        td = task_cls.task_definition
        full_task_family = td.full_task_family
        task_family = td.task_family

        if not (task_family.startswith(search) or full_task_family.startswith(search)):
            continue

        if issubclass(task_cls, Config) != is_config:
            continue

        dl = []
        for param_name, param_obj in td.task_param_defs.items():
            if param_obj.system or param_obj.kind == _ParameterKind.task_output:
                continue
            if not is_config and param_name in COMMON_PARAMS:
                continue
            param_help = _help(param_obj.description)
            dl.append((param_name, param_help))

        if dl:
            with formatter.section(
                "{task_family} ({full_task_family})".format(
                    full_task_family=full_task_family, task_family=task_family
                )
            ):
                formatter.write_dl(dl)

    click.echo(formatter.getvalue().rstrip("\n"))

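# --- Hedged sketch of the output shape produced by _list_tasks() above (names are made up) ---
#   prepare_data (myproject.pipelines.prepare_data)
#     ratio  split ratio for the train set
#     seed   random seed for shuffling
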
def build_task(root_task, **kwargs):
    from dbnd import new_dbnd_context

    with new_dbnd_context(conf={root_task: kwargs}):
        return get_task_registry().build_dbnd_task(task_name=root_task)

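# --- Hypothetical usage of the build_task() helper above (task name and kwargs are made up) ---
# kwargs are injected as the task's config section inside a fresh dbnd context:
#
#   prepared = build_task("prepare_data", ratio="0.5")
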
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks use `dbnd show-tasks` (tab completion is available).
    """
    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=filter_dict_remove_false_values(
            dict(
                verbose=verbose > 0,
                describe=describe,
                env=env,
                conf_file=conf_file,
                project_name=project_name,
            )
        ),
        run=filter_dict_remove_false_values(
            dict(
                name=name,
                parallel=parallel,
                description=description,
                is_archived=describe,
            )
        ),
    )

    if submit_driver is not None:
        main_switches["run"]["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        main_switches["run"]["submit_tasks"] = bool(submit_tasks)
    if disable_web_tracker:
        main_switches.setdefault("core", {})["tracker_api"] = "disabled"

    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # double checking on bootstrap, as we can run from all kinds of locations
    # usually we should be bootstrapped already as we run from cli.
    dbnd_bootstrap()

    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()

        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

        # modules are loaded, we can load the task
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # now we can get its config; as it's not the main task, we can load config after the configuration is loaded
        if task_cls is not None:
            if root_task_config:
                # adding root task to configuration
                config.set_values(
                    {task_cls.task_definition.task_config_section: root_task_config},
                    source="--set-root",
                )

        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )

def run_driver(self):
    logger.info("Running driver... Driver PID: %s", os.getpid())

    run = self.run  # type: DatabandRun
    settings = run.context.settings
    run_executor = run.run_executor
    remote_engine = run_executor.remote_engine

    settings.git.validate_git_policy()

    # let's prepare for remote execution
    remote_engine.prepare_for_run(run)

    if self.root_task_name_to_build:
        if self.force_task_name:
            kwargs = {"task_name": self.force_task_name}
            logger.info(
                "Building main task '%s' with name %s",
                self.root_task_name_to_build,
                self.force_task_name,
            )
        else:
            logger.info("Building main task '%s'", self.root_task_name_to_build)
            kwargs = {}
        root_task = get_task_registry().build_dbnd_task(
            self.root_task_name_to_build, task_kwargs=kwargs
        )
        logger.info(
            "Task %s has been created (%s children)",
            root_task.task_id,
            len(root_task.ctrl.task_dag.subdag_tasks()),
        )
        run.root_task = root_task

    # assert that the graph is a DAG
    run.root_task.task_dag.topological_sort()

    # now we init all task runs for all tasks in the pipeline
    task_runs = self._init_task_runs_for_execution(task_engine=remote_engine)
    root_task_run = run.root_task_run
    run.root_task.ctrl.banner(
        "Main task '%s' has been created!" % root_task_run.task_af_id,
        color="cyan",
        task_run=root_task_run,
    )

    if self.run_config.dry:
        run.root_task.ctrl.describe_dag.describe_dag()
        logger.warning("Execution has been stopped due to run.dry=True flag!")
        return run

    print_tasks_tree(root_task_run.task, task_runs)

    if self._is_save_run_pickle(task_runs, remote_engine):
        run_executor.save_run_pickle()

    task_runs_to_run = [tr for tr in task_runs if not tr.is_skipped]

    # THIS IS THE POINT WHEN WE SUBMIT ALL TASKS TO EXECUTION
    # we should make sure that we create the executor without the driver task
    task_executor = get_task_executor(
        run,
        task_executor_type=run_executor.task_executor_type,
        host_engine=run_executor.host_engine,
        target_engine=remote_engine,
        task_runs=task_runs_to_run,
    )

    hearbeat = None
    if self.send_heartbeat:
        # this will wrap the executor with a "heartbeat" process
        hearbeat = start_heartbeat_sender(self)

    with nested(hearbeat):
        task_executor.do_run()

    # we need to place the pipeline's task_band where the outside configuration requires it
    if settings.run.run_result_json_path:
        new_path = settings.run.run_result_json_path
        try:
            self.result_location.copy(new_path)
        except Exception as e:
            logger.exception(
                "Couldn't copy the task_band from {old_path} to {new_path}. "
                "Failed with this error: {error}".format(
                    old_path=self.result_location.path, new_path=new_path, error=e
                )
            )
        else:
            logger.info(
                "Copied the pipeline's task_band to {new_path}".format(new_path=new_path)
            )

    # if we are in the driver, we want to print the banner after the executor/task banner
    run.set_run_state(RunState.SUCCESS)

    root_task = self.run.root_task_run.task
    msg = "Your run has been successfully executed!"
    if self.run.duration:
        msg = "Your run has been successfully executed in %s" % self.run.duration
    run_msg = "\n%s\n%s\n" % (
        root_task.ctrl.banner(
            "Main task '%s' is ready!" % root_task.task_name,
            color="green",
            task_run=self.run.root_task_run,
        ),
        run.describe.run_banner(msg, color="green", show_tasks_info=True),
    )
    logger.info(run_msg)
    return run

def test_can_not_find(self):
    with pytest.raises(TaskClassNotFoundException):
        get_task_registry().get_task_cls("t_config")

def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """
    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # we need to do it before we are looking for the task cls
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=dict(
            verbose=verbose > 0,
            print_task_band=print_task_band,
            describe=describe,
            env=env,
            conf_file=conf_file,
            project=project,
        ),
        run=dict(
            name=name,
            parallel=parallel,
            interactive=interactive,
            description=description,
            is_archived=describe,
            open_web_tracker_in_browser=open_web_tab,
            submit_driver=_nullable_flag(submit_driver),
            submit_tasks=_nullable_flag(submit_tasks),
        ),
        kubernetes=dict(docker_build_tag=docker_build_tag),
        task=dict(task_version=task_version),
        task_build=dict(verbose=True if verbose > 1 else None),
        core=dict(tracker_api="disabled" if disable_web_tracker else None),
    )
    main_switches = cleanup_empty_switches(main_switches)

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )
    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(
            _parse_cli(_extend, source="--extend-config", extend=True)
        )
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            )
        )
    # --set-root
    if root_task_config:
        task_cls = task_registry.get_task_cls(task_name)
        task_section = task_cls.task_definition.task_config_section
        # adding root task to configuration
        cmd_line_config.update(
            parse_and_build_config_store(
                config_values={task_section: root_task_config}, source="--set-root"
            )
        )

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info(
                "CLI config: \n%s", pformat_config_store_as_table(cmd_line_config)
            )
        config.set_values(cmd_line_config, source="cmdline")

    # double checking on bootstrap, as we can run from all kinds of locations
    # usually we should be bootstrapped already as we run from cli.
    dbnd_bootstrap()

    # initialize basic logging (until we get to the context logging)
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    # update completer
    if config.getboolean("databand", "completer"):
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

    # bootstrap and modules are loaded, we can load the task
    task_cls = None
    if task_name:
        task_cls = task_registry.get_task_cls(task_name)

    if not task_name:
        print_help(ctx, None)
        return

    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
        name="run"
    ) as context:  # type: DatabandContext
        if context.settings.system.describe:
            # we want to print describe without triggering a real run
            logger.info("Building main task '%s'", task_name)
            root_task = get_task_registry().build_dbnd_task(task_name)
            root_task.ctrl.describe_dag.describe_dag()
            # currently there is a bug with the click version we have when using python 2,
            # so we don't use the click.echo function
            # https://github.com/pallets/click/issues/564
            print("Task %s has been described!" % task_name)
            return root_task
        return context.dbnd_run_task(
            task_or_task_name=task_name,
            force_task_name=alternative_task_name,
            job_name=job_name or alternative_task_name or task_name,
            run_uid=run_driver or override_run_uid,
            existing_run=run_driver is not None,
            scheduled_run_info=scheduled_run_info,
            project=project,
        )

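# --- Hedged CLI usage sketch for the run commands above (task and parameter names are made up) ---
# The flags map to the config sources parsed above (--set, --set-config, --set-override,
# --extend-config; first-level --set values are routed to the root task's section):
#
#   dbnd run prepare_data --set prepare_data.ratio=0.5 --set-config run.parallel=True
#   dbnd run prepare_data --set ratio=0.5   # first-level value, applied to the root task
#   dbnd show-tasks                         # list registered task families
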
def test_ambigious(self):
    actual = get_task_registry()._get_task_cls("RAmbiguousClass")
    assert actual == DbndTaskRegistry.AMBIGUOUS_CLASS

def decorated(class_or_func):
    try:
        func_spec = build_task_decorator_spec(
            class_or_func=class_or_func,
            decorator_kwargs=decorator_kwargs,
            default_result=task_default_result,
        )
    except Exception as ex:
        logger.error(
            "Failed to create task %s: %s\n%s\n",
            class_or_func.__name__,
            str(ex),
            user_side_code(context=5),
            exc_info=show_exc_info(ex),
        )
        raise

    fp = TaskClsBuilder(func_spec, task_type, task_defaults)

    if func_spec.is_class:
        wrapper = six.add_metaclass(_DecoratedUserClassMeta)(class_or_func)
        fp._callable_item = wrapper
    else:

        @functools.wraps(class_or_func)
        def wrapper(*args, **kwargs):
            if in_tracking_mode():
                with fp.tracking_context(args, kwargs) as track_result_callback:
                    return track_result_callback(fp.func(*args, **kwargs))

            return _call_handler(
                fp.get_task_cls(),
                call_user_code=fp.func,
                call_args=args,
                call_kwargs=kwargs,
            )

        wrapper.dbnd_run = fp.dbnd_run

    wrapper.__is_dbnd_task__ = True
    wrapper.func = class_or_func

    # we're using CallableLazyObjectProxy to have lazy evaluation for creating task_cls
    # this is only for orchestration scenarios
    task_cls = CallableLazyObjectProxy(fp.get_task_cls)
    wrapper.task_cls = task_cls
    wrapper.task = task_cls
    wrapper.t = task_cls

    # we need a lazy task_definition here, for example for dbnd_task_as_bash_operator
    wrapper.task_definition = CallableLazyObjectProxy(fp.get_task_definition)

    # we need to manually register the task here, since in the regular flow
    # this happens in TaskMetaclass, but it's not invoked here due to lazy
    # evaluation using CallableLazyObjectProxy
    tp = TaskPassport.from_func_spec(func_spec, decorator_kwargs)

    # TODO: we can use the CallableLazyObjectProxy object (task_cls) instead of task_cls_factory
    r = get_task_registry()
    r.register_task_cls_factory(
        task_cls_factory=fp.get_task_cls,
        full_task_family=tp.full_task_family,
        task_family=tp.task_family,
    )
    return wrapper

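# --- Illustrative sketch of the decorator contract implemented above (function name is made up) ---
from dbnd import task


@task
def prepare_data(ratio=0.5):
    # a plain call runs the user code (or goes through _call_handler in orchestration mode)
    return ratio * 2


# attributes attached by the wrapper above:
#   prepare_data.task / prepare_data.t   -> lazily built task class (CallableLazyObjectProxy)
#   prepare_data.dbnd_run(0.25)          -> run it as a dbnd task (function-decorator flow)
#   prepare_data.__is_dbnd_task__        -> True
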
def test_full_name_not_ambigious(self):
    actual = get_task_registry().get_task_cls(
        "test_dbnd.task_build.test_task_registry.RAmbiguousClass"
    )
    assert actual == RAmbiguousClass

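# --- Hedged sketch of the ambiguity exercised by the two tests above (module names are made up) ---
# If two modules each define RAmbiguousClass, the short name is ambiguous while the
# fully qualified name still resolves:
#
#   get_task_registry()._get_task_cls("RAmbiguousClass")           # -> DbndTaskRegistry.AMBIGUOUS_CLASS
#   get_task_registry().get_task_cls("a_tasks.RAmbiguousClass")    # -> a_tasks.RAmbiguousClass
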
def run_driver(self):
    logger.info("Running driver... Driver PID: %s", os.getpid())

    run = self.run  # type: DatabandRun
    settings = run.context.settings
    run_executor = run.run_executor
    remote_engine = run_executor.remote_engine

    settings.git.validate_git_policy()

    # let's prepare for remote execution
    remote_engine.prepare_for_run(run)

    if self.root_task_name_to_build:
        logger.info("Building main task '%s'", self.root_task_name_to_build)
        root_task = get_task_registry().build_dbnd_task(self.root_task_name_to_build)
        logger.info(
            "Task %s has been created (%s children)",
            root_task.task_id,
            len(root_task.ctrl.task_dag.subdag_tasks()),
        )
        run.root_task = root_task

    # assert that the graph is a DAG
    run.root_task.task_dag.topological_sort()

    # now we init all task runs for all tasks in the pipeline
    task_runs = self._init_task_runs_for_execution(task_engine=remote_engine)
    root_task_run = run.root_task_run
    run.root_task.ctrl.banner(
        "Main task '%s' has been created!" % root_task_run.task_af_id,
        color="cyan",
        task_run=root_task_run,
    )

    if self.run_config.dry:
        run.root_task.ctrl.describe_dag.describe_dag()
        logger.warning("Execution has been stopped due to run.dry=True flag!")
        return run

    print_tasks_tree(root_task_run.task, task_runs)

    if self._is_save_run_pickle(task_runs, remote_engine):
        run_executor.save_run_pickle()

    task_runs_to_run = [tr for tr in task_runs if not tr.is_skipped]

    # THIS IS THE POINT WHEN WE SUBMIT ALL TASKS TO EXECUTION
    # we should make sure that we create the executor without the driver task
    task_executor = get_task_executor(
        run,
        task_executor_type=run_executor.task_executor_type,
        host_engine=run_executor.host_engine,
        target_engine=remote_engine,
        task_runs=task_runs_to_run,
    )

    hearbeat = None
    if self.send_heartbeat:
        # this will wrap the executor with a "heartbeat" process
        hearbeat = start_heartbeat_sender(self)

    with nested(hearbeat):
        task_executor.do_run()

    # if we are in the driver, we want to print the banner after the executor/task banner
    run.set_run_state(RunState.SUCCESS)
    logger.info(run.describe.run_banner_for_finished())
    return run