def set_tracking_context():
    # force the project config into tracking mode for the duration of the test,
    # and restore a clean config on the way out
    try:
        reset_dbnd_project_config()
        get_dbnd_project_config()._dbnd_tracking = True
        yield
    finally:
        reset_dbnd_project_config()
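# A minimal usage sketch (assumption: the hypothetical test below is ours, not
# part of the repo). `set_tracking_context` is a generator, so it can be wrapped
# as a pytest fixture via `yield from`, matching the `from pytest import fixture`
# import in this conftest:
import pytest

@pytest.fixture
def tracking_context():
    # reuse the generator above so setup/teardown stay in one place
    yield from set_tracking_context()

def test_runs_in_tracking_mode(tracking_context):
    # inside the fixture's scope, the project config reports tracking mode
    assert get_dbnd_project_config()._dbnd_tracking is True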
def set_context(self, ti):
    """
    Airflow's log handler uses this method to set up the context when running
    a TaskInstance(=ti).
    We use this method to set up the dbnd context and communicate information
    to the `<airflow_operator>_execute` task that we create in `execute_tracking.py`.
    """
    # set up only when we are not in our own orchestration dag
    if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
        return

    if config.getboolean("mlflow_tracking", "databand_tracking"):
        self.airflow_logger.warning(
            "dbnd can't track mlflow and airflow together; please disable the dbnd config "
            "`databand_tracking` in section `mlflow_tracking`"
        )
        return

    # we are not tracking SubDagOperator
    if ti.operator == SubDagOperator.__name__:
        return

    task_key = calc_task_run_attempt_key_from_af_ti(ti)
    env_attempt_uid = os.environ.get(task_key)
    # this key is already set, which means we are in a --raw run
    if env_attempt_uid:
        # no need for further actions inside a --raw run
        return

    # communicate the task_run_attempt_uid to inner processes;
    # it will be used for the task_run of the `<airflow_operator>_execute` task
    self.task_run_attempt_uid = get_uuid()
    self.task_env_key = task_key
    os.environ[self.task_env_key] = str(self.task_run_attempt_uid)

    # airflow calculation for the relevant log_file
    log_relative_path = self.log_file_name_factory(ti, ti.try_number)
    self.log_file = os.path.join(self.airflow_base_log_dir, log_relative_path)

    # make sure we are not polluting the airflow logs
    get_dbnd_project_config().quiet_mode = True

    # tracking msg
    self.airflow_logger.info(
        "Tracked by Databand {version}".format(version=dbnd.__version__)
    )

    # context with disabled logs
    self.dbnd_context_manage = new_dbnd_context(conf={"log": {"disabled": True}})
    self.dbnd_context = self.dbnd_context_manage.__enter__()
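# Sketch of the consumer side of the env-var handshake above (assumption:
# `read_attempt_uid_from_env` is a hypothetical helper, not a dbnd API). The
# outer `run` process exports the attempt uid under `task_key`; the inner
# `--raw run` process can recover it like this:
import os

def read_attempt_uid_from_env(ti):
    # same key derivation as the producer side in set_context
    task_key = calc_task_run_attempt_key_from_af_ti(ti)
    # returns None when we are not inside a tracked `--raw run`
    return os.environ.get(task_key)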
from pytest import fixture

# import dbnd should be first!
import dbnd
import dbnd._core.utils.basics.environ_utils

from dbnd import get_dbnd_project_config, register_config_cls, register_task
from dbnd._core.configuration.environ_config import reset_dbnd_project_config
from dbnd._core.plugin.dbnd_plugins import disable_airflow_plugin
from dbnd.testing.test_config_setter import add_test_configuration
from dbnd_test_scenarios.test_common.task.factories import FooConfig, TConfig
from targets import target

# we want to test only this module
get_dbnd_project_config().is_no_modules = True

# if enabled, will print much better info on tests
# os.environ["DBND__VERBOSE"] = "True"

# DISABLE AIRFLOW, we don't test it in this module!
disable_airflow_plugin()

pytest_plugins = [
    "dbnd.testing.pytest_dbnd_plugin",
    "dbnd.testing.pytest_dbnd_markers_plugin",
    "dbnd.testing.pytest_dbnd_home_plugin",
]
__all__ = ["dbnd"]

try:
    import matplotlib
def set_context(self, ti):
    """
    Airflow's log handler uses this method to set up the context when running
    a TaskInstance(=ti).
    We use this method to set up the dbnd context and communicate information
    to the `<airflow_operator>_execute` task that we create in `execute_tracking.py`.
    """
    # set up only when we are not in our own orchestration dag
    if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
        return

    if not is_dag_eligable_for_tracking(ti.dag_id):
        return

    if config.getboolean("mlflow_tracking", "databand_tracking"):
        self.airflow_logger.warning(
            "dbnd can't track mlflow and airflow together; please disable the dbnd config "
            "`databand_tracking` in section `mlflow_tracking`"
        )
        return

    # we are not tracking SubDagOperator
    if ti.operator is None or ti.operator == SubDagOperator.__name__:
        return

    # Airflow runs with two processes: `run` and `--raw run`.
    # We want the handler to run only once (idempotency),
    # so we use an environment variable to sync those two processes.
    task_key = calc_task_key_from_af_ti(ti)

    if os.environ.get(task_key, False):
        # this key is already set, which means we are in the inner `--raw run`
        return
    else:
        # we are in the outer `run`
        self.task_env_key = task_key
        # mark the environment with the current key for the inner `--raw run`
        environ_utils.set_on(task_key)
        from dbnd_airflow.tracking.dbnd_airflow_conf import (
            set_dbnd_config_from_airflow_connections,
        )

        # In tracking mode, Airflow runs the main process (`--raw run`) for every
        # task, which caused some features to run twice: once in the `worker`
        # process and once in the `main` one, each with different configurations.
        # It still runs twice, but now with the same configurations.
        set_dbnd_config_from_airflow_connections()

    self.task_run_attempt_uid = get_task_run_attempt_uid_from_af_ti(ti)

    # airflow calculation for the relevant log_file
    log_relative_path = self.log_file_name_factory(ti, ti.try_number)
    self.log_file = os.path.join(self.airflow_base_log_dir, log_relative_path)

    # make sure we are not polluting the airflow logs
    get_dbnd_project_config().quiet_mode = True

    # tracking msg
    self.airflow_logger.info(
        "Databand Tracking Started {version}".format(version=dbnd.__version__)
    )

    # context with disabled logs
    self.dbnd_context_manage = new_dbnd_context(conf={"log": {"disabled": True}})
    self.dbnd_context = self.dbnd_context_manage.__enter__()
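# A minimal sketch of how the sync key could be derived (assumption: the real
# `calc_task_key_from_af_ti` may differ; this only illustrates that the key must
# be stable across the outer `run` and inner `--raw run` processes, yet unique
# per dag/task/execution/attempt so parallel tasks never collide):
def calc_task_key_from_af_ti_sketch(ti):
    return "DBND__TRACKING__{}".format(
        "__".join(
            str(part)
            for part in (ti.dag_id, ti.task_id, ti.execution_date, ti.try_number)
        )
    )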
def new_execute(context):
    """
    This function replaces the operator's original `execute` function.
    """
    # IMPORTANT!!: copied_operator:
    # ---------------------------------------
    # The task (=operator) is copied when airflow enters TaskInstance._run_raw_task.
    # From then on, only the copied task (=copied_operator) is changed or called
    # (render jinja, signal_handler, pre_execute, execute, etc.).
    copied_operator = context["task_instance"].task

    if not is_dag_eligable_for_tracking(context["task_instance"].dag_id):
        execute = get_execute_function(copied_operator)
        result = execute(copied_operator, context)
        return result

    try:
        # set that we are in Airflow tracking mode
        get_dbnd_project_config().set_is_airflow_runtime()

        task_context = extract_airflow_context(context)
        # start operator execute run with current airflow context
        task_run = dbnd_airflow_tracking_start(
            airflow_context=task_context
        )  # type: Optional[TaskRun]
    except Exception as e:
        task_run = None
        logger.error(
            "exception caught while running on dbnd new execute {}".format(e),
            exc_info=True,
        )

    from airflow.exceptions import AirflowRescheduleException

    # run the operator's original execute function
    try:
        with af_tracking_context(task_run, context, copied_operator):
            execute = get_execute_function(copied_operator)
            result = execute(copied_operator, context)

    # a sensor task that is rescheduling is normal behavior, not really an exception
    except AirflowRescheduleException:
        dbnd_tracking_stop(finalize_run=False)
        raise

    # catch if the original execute failed
    except Exception as ex:
        if task_run:
            error = TaskRunError.build_from_ex(ex, task_run)
            task_run.set_task_run_state(state=TaskRunState.FAILED, error=error)
        dbnd_tracking_stop()
        raise

    # if we have a task run here, we want to log results and xcoms
    if task_run:
        try:
            track_config = AirflowTrackingConfig.from_databand_context()
            if track_config.track_xcom_values:
                # report xcoms as metrics of the task
                log_xcom(context, track_config)

            if track_config.track_airflow_execute_result:
                # report the result
                log_operator_result(
                    task_run, result, copied_operator, track_config.track_xcom_values
                )
        except Exception as e:
            logger.error(
                "exception caught while tracking airflow operator {}".format(e),
                exc_info=True,
            )

    # make sure we close and return the original results
    dbnd_tracking_stop()
    return result
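# Sketch of the patching side that makes `new_execute` run instead of the
# operator's own execute (assumption: `track_operator` and the
# `_dbnd_original_execute` attribute are hypothetical; the real wiring lives in
# dbnd's tracking/policy code). `get_execute_function` can then return the
# stashed original, which `new_execute` calls as `execute(copied_operator, context)`:
def track_operator(operator_cls):
    if getattr(operator_cls, "_dbnd_patched", False):
        return operator_cls  # idempotent: patch each operator class only once
    # stash the original unbound execute on the class
    operator_cls._dbnd_original_execute = operator_cls.execute
    # route every call through new_execute, which finds the operator in context
    operator_cls.execute = lambda self, context: new_execute(context)
    operator_cls._dbnd_patched = True
    return operator_cls

def get_execute_function_sketch(operator):
    # return the pre-patch execute stashed on the operator's class
    return type(operator)._dbnd_original_execute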
from airflow.plugins_manager import AirflowPlugin

from dbnd import get_dbnd_project_config
from dbnd_airflow.tracking.airflow_patching import (
    add_tracking_to_policy,
    patch_airflow_context_vars,
)

config = get_dbnd_project_config()
if config.airflow_auto_tracking:
    add_tracking_to_policy()
    patch_airflow_context_vars()


class DbndAutoTracking(AirflowPlugin):
    name = "dbnd_airflow_auto_tracking"
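# A minimal sketch of the policy patching this plugin triggers (assumption: the
# real `add_tracking_to_policy` in dbnd_airflow may differ, and `track_operator`
# is the hypothetical hook from the sketch above). Airflow calls
# `settings.policy(task)` for every task it loads, so wrapping that hook lets
# dbnd attach tracking to each operator automatically:
import airflow.settings as settings

def add_tracking_to_policy_sketch():
    original_policy = getattr(settings, "policy", lambda task: None)

    def tracking_policy(task):
        # keep any user-defined cluster policy behavior
        original_policy(task)
        # attach dbnd tracking to the operator's class
        track_operator(type(task))

    settings.policy = tracking_policy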