def dbnd_on_pre_init_context(ctx):
    """
    Redirect MLFlow tracking to a composite (dbnd + original) uri.

    Does nothing unless `mlflow_tracking.databand_tracking` is enabled and
    `core.databand_url` is set. The uri MLFlow was using before the switch is
    stored in the module-level `_original_mlflow_tracking_uri`.

    :param ctx: context passed by the caller; not used here.
    :raises DatabandConfigError: when the uri that should be duplicated to is
        already a composite uri.
    """
    from mlflow import get_tracking_uri, set_tracking_uri

    global _original_mlflow_tracking_uri

    tracking_enabled = config.getboolean("mlflow_tracking", "databand_tracking")
    if not tracking_enabled:
        return

    databand_url = config.get("core", "databand_url")
    if not databand_url:
        logger.info(
            "Although 'databand_tracking' was set in 'mlflow_tracking', "
            "dbnd will not use it since 'core.databand_url' was not set."
        )
        return

    # explicit config wins, otherwise keep duplicating to MLFlow's current uri
    mirror_uri = (
        config.get("mlflow_tracking", "duplicate_tracking_to") or get_tracking_uri()
    )

    # the dbnd store uri must not be already defined through the MLFlow config
    if is_composite_uri(mirror_uri):
        raise DatabandConfigError(
            "Config conflict: MLFlow and DBND configs both define dbnd store uri"
        )

    composite_uri = build_composite_uri(databand_url, mirror_uri)

    # remember the current uri before replacing it
    _original_mlflow_tracking_uri = get_tracking_uri()
    set_tracking_uri(composite_uri)
def dbnd_on_exit_context(ctx):
    """
    Restore the MLFlow tracking uri that was saved in
    `_original_mlflow_tracking_uri` before it was replaced.

    Does nothing when `mlflow_tracking.databand_tracking` is disabled, or when
    no uri was saved. The second case happens when the pre-init hook returned
    early (for example because `core.databand_url` is not set): restoring an
    unset value would overwrite a tracking uri that dbnd never changed.

    :param ctx: context passed by the caller; not used here.
    """
    if not config.getboolean("mlflow_tracking", "databand_tracking"):
        return

    global _original_mlflow_tracking_uri

    # NOTE(review): assumes the module-level default of
    # `_original_mlflow_tracking_uri` is None - confirm against its definition.
    if _original_mlflow_tracking_uri is None:
        # nothing was saved, so the MLFlow uri was never replaced
        return

    from mlflow import set_tracking_uri

    set_tracking_uri(_original_mlflow_tracking_uri)

    # the saved value is consumed; a later exit must not restore a stale uri
    _original_mlflow_tracking_uri = None
def cli():
    # NOTE(review): intentionally left without a docstring - this looks like a
    # CLI entry point whose docstring may be rendered as help text; confirm.
    dbnd_bootstrap()

    from dbnd import config

    # When running from the "dbnd" entrypoint we probably do not need to load
    # Scheduled DAGs; this prevents every airflow command from accessing the
    # dbnd web api.
    disable_scheduled_dags = config.getboolean(
        "airflow", "auto_disable_scheduled_dags_load"
    )
    if disable_scheduled_dags:
        os.environ["DBND_DISABLE_SCHEDULED_DAGS_LOAD"] = "True"
def set_context(self, ti):
    """
    Prepare dbnd tracking for the Airflow TaskInstance (`ti`) that is about to run.

    Airflow's log handler calls this method to set up the context of a task
    instance. Here it is used to open a dbnd context and to pass the task run
    attempt uid to the `<airflow_operator>_execute` task (created in
    `execute_tracking.py`) through an environment variable.

    :param ti: the Airflow task instance; `dag_id`, `operator` and `try_number`
        are read from it.
    """
    # our own orchestration dags are not handled here
    if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
        return

    mlflow_tracking_on = config.getboolean("mlflow_tracking", "databand_tracking")
    if mlflow_tracking_on:
        self.airflow_logger.warning(
            "dbnd can't track mlflow and airflow together please disable dbnd config "
            "`databand_tracking` in section `mlflow_tracking`"
        )
        return

    # SubDagOperator is not tracked
    if ti.operator == SubDagOperator.__name__:
        return

    task_key = calc_task_run_attempt_key_from_af_ti(ti)
    if os.environ.get(task_key):
        # the key is already set, which means we are inside the `--raw` run:
        # nothing more to do
        return

    # publish the attempt uid for inner processes; it is used for the task_run
    # of the `<airflow_operator>_execute` task
    attempt_uid = get_uuid()
    self.task_run_attempt_uid = attempt_uid
    self.task_env_key = task_key
    os.environ[task_key] = str(attempt_uid)

    # the log file location, as calculated by airflow
    relative_log_path = self.log_file_name_factory(ti, ti.try_number)
    self.log_file = os.path.join(self.airflow_base_log_dir, relative_log_path)

    # keep dbnd from polluting the airflow logs
    get_dbnd_project_config().quiet_mode = True

    # tracking msg
    self.airflow_logger.info(
        "Tracked by Databand {version}".format(version=dbnd.__version__)
    )

    # enter a dbnd context that has its logging disabled
    context_manager = new_dbnd_context(conf={"log": {"disabled": True}})
    self.dbnd_context_manage = context_manager
    self.dbnd_context = context_manager.__enter__()
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks use `dbnd show-tasks` (tab completion is available).
    """
    # NOTE(review): the docstring above is kept verbatim - presumably it is
    # rendered as the command help text; confirm before editing it.
    #
    # Flow: build a config store from the CLI switches, apply it to the global
    # config, open a "run" dbnd context, resolve the task class and either
    # print the help or run the task.
    # `_sets` and `_sets_root` are iterables of mappings; `verbose` is compared
    # as a number (`> 0`, `> 1`).
    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task

    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # False-like flags are dropped, otherwise they would always override the
    # config with their falseness.
    databand_switches = filter_dict_remove_false_values(
        {
            "verbose": verbose > 0,
            "describe": describe,
            "env": env,
            "conf_file": conf_file,
            "project_name": project_name,
        }
    )
    run_switches = filter_dict_remove_false_values(
        {
            "name": name,
            "parallel": parallel,
            "description": description,
            "is_archived": describe,
        }
    )
    main_switches = {"databand": databand_switches, "run": run_switches}

    # tri-state flags: None means "not given on the command line"
    if submit_driver is not None:
        run_switches["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        run_switches["submit_tasks"] = bool(submit_tasks)

    if disable_web_tracker:
        main_switches["core"] = {"tracker_api": "disabled"}
    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    # --set-root values are merged into a single mapping
    root_task_config = {}
    for root_set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=root_set)

    # "first level" --set values (no section in the key) are assumed to belong
    # to the main task, so they are moved into the root task config
    for cli_set in _sets:
        for key, value in list(cli_set.items()):
            is_first_level = "." not in key
            # only plain strings are moved, so json-like values are not included
            if not (is_first_level and isinstance(value, six.string_types)):
                continue
            root_task_config[key] = value
            del cli_set[key]

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # Bootstrap is double checked, as we can run from all kinds of locations;
    # usually we are bootstrapped already when running from the cli.
    dbnd_bootstrap()
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = (
        ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )
        if scheduled_job_name
        else None
    )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()
        completer.refresh(task_registry.list_dbnd_task_classes())

        # modules are loaded by now, so the task class can be resolved
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # The section name comes from the task class, so this part can be
        # applied only after the task class is known.
        if task_cls is not None and root_task_config:
            root_section = task_cls.task_definition.task_config_section
            config.set_values({root_section: root_task_config}, source="--set-root")

        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )
def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """
    # NOTE(review): the docstring above is kept verbatim - presumably it is
    # rendered as the command help text; confirm before editing it.
    #
    # Flow: load user modules, build a config store from the CLI switches and
    # the --set* options, apply it to the global config, then print the help,
    # describe the task, or run it inside a "run" dbnd context.
    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # user modules must be loaded before we look for the task class
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task

    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # Empty switches are cleaned up right below, otherwise they would always
    # override the config with their falseness.
    main_switches = {
        "databand": {
            "verbose": verbose > 0,
            "print_task_band": print_task_band,
            "describe": describe,
            "env": env,
            "conf_file": conf_file,
            "project": project,
        },
        "run": {
            "name": name,
            "parallel": parallel,
            "interactive": interactive,
            "description": description,
            "is_archived": describe,
            "open_web_tracker_in_browser": open_web_tab,
            "submit_driver": _nullable_flag(submit_driver),
            "submit_tasks": _nullable_flag(submit_tasks),
        },
        "kubernetes": {"docker_build_tag": docker_build_tag},
        "task": {"task_version": task_version},
        "task_build": {"verbose": True if verbose > 1 else None},
        "core": {"tracker_api": "disabled" if disable_web_tracker else None},
    }
    main_switches = cleanup_empty_switches(main_switches)

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    # --set-root values are merged into a single mapping
    root_task_config = {}
    for root_set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=root_set)

    # "first level" --set values (no section in the key) are assumed to belong
    # to the main task, so they are moved into the root task config
    for cli_set in _sets:
        for key, value in list(cli_set.items()):
            is_first_level = "." not in key
            # only plain strings are moved, so json-like values are not included
            if not (is_first_level and isinstance(value, six.string_types)):
                continue
            root_task_config[key] = value
            del cli_set[key]

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(
            _parse_cli(_extend, source="--extend-config", extend=True)
        )
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            )
        )

    # --set-root
    if root_task_config:
        root_task_cls = task_registry.get_task_cls(task_name)
        task_section = root_task_cls.task_definition.task_config_section
        # the root task values are stored under the task's own config section
        root_store = parse_and_build_config_store(
            config_values={task_section: root_task_config}, source="--set-root"
        )
        cmd_line_config.update(root_store)

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info(
                "CLI config: \n%s", pformat_config_store_as_table(cmd_line_config)
            )
        config.set_values(cmd_line_config, source="cmdline")

    # Bootstrap is double checked, as we can run from all kinds of locations;
    # usually we are bootstrapped already when running from the cli.
    dbnd_bootstrap()

    # basic logging is used until the context logging takes over
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = (
        ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )
        if scheduled_job_name
        else None
    )

    # update completer
    if config.getboolean("databand", "completer"):
        completer.refresh(task_registry.list_dbnd_task_classes())

    # without a task there is nothing to resolve: print the generic help
    if not task_name:
        print_help(ctx, None)
        return

    # bootstrap and modules are loaded, so the task class can be resolved
    task_cls = task_registry.get_task_cls(task_name)

    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
        name="run"
    ) as context:  # type: DatabandContext
        if not context.settings.system.describe:
            return context.dbnd_run_task(
                task_or_task_name=task_name,
                force_task_name=alternative_task_name,
                job_name=job_name or alternative_task_name or task_name,
                run_uid=run_driver or override_run_uid,
                existing_run=run_driver is not None,
                scheduled_run_info=scheduled_run_info,
                project=project,
            )

        # describe mode: print the description without triggering a real run
        logger.info("Building main task '%s'", task_name)
        root_task = get_task_registry().build_dbnd_task(task_name)
        root_task.ctrl.describe_dag.describe_dag()

        # click.echo is not used because of a bug in the click version we have
        # when using python 2: https://github.com/pallets/click/issues/564
        print("Task %s has been described!" % task_name)
        return root_task
def set_context(self, ti):
    """
    Prepare dbnd tracking for the Airflow TaskInstance (`ti`) that is about to run.

    Airflow's log handler calls this method to set up the context of a task
    instance. Here it is used to open a dbnd context and to communicate
    information to the `<airflow_operator>_execute` task, which is created in
    `execute_tracking.py`.

    :param ti: the Airflow task instance; `dag_id`, `operator` and `try_number`
        are read from it.
    """
    # our own orchestration dags are not handled here
    if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
        return

    if not is_dag_eligable_for_tracking(ti.dag_id):
        return

    mlflow_tracking_on = config.getboolean("mlflow_tracking", "databand_tracking")
    if mlflow_tracking_on:
        self.airflow_logger.warning(
            "dbnd can't track mlflow and airflow together please disable dbnd config "
            "`databand_tracking` in section `mlflow_tracking`"
        )
        return

    # SubDagOperator is not tracked
    operator_name = ti.operator
    if operator_name is None or operator_name == SubDagOperator.__name__:
        return

    # Airflow runs two processes, `run` and `--raw run`, but the handler should
    # do its work only once. An environment variable is used to sync the two.
    task_key = calc_task_key_from_af_ti(ti)
    if os.environ.get(task_key, False):
        # the key is already set, which means we are in `--raw run`
        return

    # we are in the outer `run`: mark the environment with the current key
    self.task_env_key = task_key
    environ_utils.set_on(task_key)

    from dbnd_airflow.tracking.dbnd_airflow_conf import (
        set_dbnd_config_from_airflow_connections,
    )

    # In tracking, `--raw run` runs the main airflow process for every task, so
    # some features ran twice - once in the `worker` process and once in the
    # `main` one - with different configurations. They still run twice, but now
    # with the same configuration.
    set_dbnd_config_from_airflow_connections()

    self.task_run_attempt_uid = get_task_run_attempt_uid_from_af_ti(ti)

    # the log file location, as calculated by airflow
    relative_log_path = self.log_file_name_factory(ti, ti.try_number)
    self.log_file = os.path.join(self.airflow_base_log_dir, relative_log_path)

    # keep dbnd from polluting the airflow logs
    get_dbnd_project_config().quiet_mode = True

    # tracking msg
    self.airflow_logger.info(
        "Databand Tracking Started {version}".format(version=dbnd.__version__)
    )

    # enter a dbnd context that has its logging disabled
    context_manager = new_dbnd_context(conf={"log": {"disabled": True}})
    self.dbnd_context_manage = context_manager
    self.dbnd_context = context_manager.__enter__()