Esempio n. 1
0
def set_tracking_context():
    """
    Generator that keeps the dbnd project config in tracking mode around one `yield`.

    The project config is reset, its `_dbnd_tracking` flag is switched on, and
    control is handed to the caller. Whatever happens while suspended, the
    project config is reset again on the way out.

    NOTE(review): presumably meant to be driven as a context manager / fixture;
    the decorator is not visible from here - confirm against the caller.
    """
    try:
        # start from a clean project config before enabling tracking
        reset_dbnd_project_config()
        project_config = get_dbnd_project_config()
        project_config._dbnd_tracking = True
        yield
    finally:
        # drop the tracking flag together with any other state set meanwhile
        reset_dbnd_project_config()
Esempio n. 2
0
    def set_context(self, ti):
        """
        Prepare dbnd tracking for the Airflow TaskInstance (`ti`) that is about to run.

        Airflow's log handler uses this method to set up the context when running a
        TaskInstance. We use it to open the dbnd context and to pass information on to
        the `<airflow_operator>_execute` task, that we create in `execute_tracking.py`.

        Nothing is set up (the method just returns) when:
          * the dag is one of our own orchestration dags,
          * `databand_tracking` is enabled in the `mlflow_tracking` config section,
          * the task is a SubDagOperator,
          * the run attempt key is already present in the environment (`--raw` run).

        :param ti: the Airflow TaskInstance being executed
        """
        # our own orchestration dags are not handled here
        if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
            return

        mlflow_tracking_enabled = config.getboolean("mlflow_tracking",
                                                    "databand_tracking")
        if mlflow_tracking_enabled:
            self.airflow_logger.warning(
                "dbnd can't track mlflow and airflow together please disable dbnd config "
                "`databand_tracking` in section `mlflow_tracking`")
            return

        # SubDagOperator is not tracked
        if ti.operator == SubDagOperator.__name__:
            return

        attempt_env_key = calc_task_run_attempt_key_from_af_ti(ti)
        if os.environ.get(attempt_env_key):
            # the key is already exported, which means we are inside the --raw run:
            # no further actions are needed there
            return

        # hand the task_run_attempt_uid over to inner processes via the environment;
        # it will be used for the task_run of the `<airflow_operator>_execute` task
        attempt_uid = get_uuid()
        self.task_run_attempt_uid = attempt_uid
        self.task_env_key = attempt_env_key
        os.environ[attempt_env_key] = str(attempt_uid)

        # the same calculation airflow does for the relevant log file
        relative_log_path = self.log_file_name_factory(ti, ti.try_number)
        self.log_file = os.path.join(self.airflow_base_log_dir,
                                     relative_log_path)

        # keep dbnd output out of the airflow logs
        get_dbnd_project_config().quiet_mode = True

        # single tracking message for the airflow log
        tracking_banner = "Tracked by Databand {version}".format(
            version=dbnd.__version__)
        self.airflow_logger.info(tracking_banner)

        # open a dbnd context with logging disabled; it is entered manually here
        # and kept on the handler together with its context manager
        no_logs_conf = {"log": {"disabled": True}}
        self.dbnd_context_manage = new_dbnd_context(conf=no_logs_conf)
        self.dbnd_context = self.dbnd_context_manage.__enter__()
Esempio n. 3
0
from pytest import fixture

# import dbnd should be first!
import dbnd
import dbnd._core.utils.basics.environ_utils

from dbnd import get_dbnd_project_config, register_config_cls, register_task
from dbnd._core.configuration.environ_config import reset_dbnd_project_config
from dbnd._core.plugin.dbnd_plugins import disable_airflow_plugin
from dbnd.testing.test_config_setter import add_test_configuration
from dbnd_test_scenarios.test_common.task.factories import FooConfig, TConfig
from targets import target

# We want to test only this module, so the project config is flagged
# as `is_no_modules` before anything else is set up.
get_dbnd_project_config().is_no_modules = True

# If enabled, this will print much better info on tests:
# os.environ["DBND__VERBOSE"] = "True"

# DISABLE AIRFLOW, we don't test it in this module!
disable_airflow_plugin()
# Module-level `pytest_plugins` list: the dbnd testing plugins that pytest
# should load for these tests.
pytest_plugins = [
    "dbnd.testing.pytest_dbnd_plugin",
    "dbnd.testing.pytest_dbnd_markers_plugin",
    "dbnd.testing.pytest_dbnd_home_plugin",
]
# NOTE(review): presumably exported so that the top-level `import dbnd`
# is not treated as unused - confirm.
__all__ = ["dbnd"]

try:
    import matplotlib
Esempio n. 4
0
    def set_context(self, ti):
        """
        Prepare dbnd tracking for the Airflow TaskInstance (`ti`) that is about to run.

        Airflow's log handler uses this method to set up the context when running a
        TaskInstance. We use it to open the dbnd context and to pass information on to
        the `<airflow_operator>_execute` task, that we create in `execute_tracking.py`.

        Nothing is set up (the method just returns) when:
          * the dag is one of our own orchestration dags,
          * the dag is not eligible for tracking,
          * `databand_tracking` is enabled in the `mlflow_tracking` config section,
          * the task has no operator or is a SubDagOperator,
          * the task key is already marked in the environment (`--raw run`).

        :param ti: the Airflow TaskInstance being executed
        """
        dag_id = ti.dag_id

        # our own orchestration dags are not handled here
        if dag_id.startswith(AD_HOC_DAG_PREFIX):
            return

        if not is_dag_eligable_for_tracking(dag_id):
            return

        mlflow_tracking_enabled = config.getboolean("mlflow_tracking",
                                                    "databand_tracking")
        if mlflow_tracking_enabled:
            self.airflow_logger.warning(
                "dbnd can't track mlflow and airflow together please disable dbnd config "
                "`databand_tracking` in section `mlflow_tracking`")
            return

        # tasks without an operator and SubDagOperator are not tracked
        if ti.operator is None or ti.operator == SubDagOperator.__name__:
            return

        # Airflow runs a task with two processes: `run` and `--raw run`.
        # The handler must do its work only once (idempotency), so an
        # environment variable is used to sync those two processes.
        env_marker = calc_task_key_from_af_ti(ti)
        if os.environ.get(env_marker):
            # the marker is already set, which means we are in `--raw run`
            return

        # From here on we are in the outer `run`.
        self.task_env_key = env_marker
        # mark the environment with the current key so that the inner
        # process sees it
        environ_utils.set_on(env_marker)
        from dbnd_airflow.tracking.dbnd_airflow_conf import (
            set_dbnd_config_from_airflow_connections, )

        # In tracking, `--raw run` runs the main airflow process for every task,
        # so some features run twice: once in the `worker` process and once in
        # the `main` one, which used to happen with different configurations.
        # They still run twice, but now with the same configuration.
        set_dbnd_config_from_airflow_connections()

        self.task_run_attempt_uid = get_task_run_attempt_uid_from_af_ti(ti)

        # the same calculation airflow does for the relevant log file
        relative_log_path = self.log_file_name_factory(ti, ti.try_number)
        self.log_file = os.path.join(self.airflow_base_log_dir,
                                     relative_log_path)

        # keep dbnd output out of the airflow logs
        get_dbnd_project_config().quiet_mode = True

        # single tracking message for the airflow log
        started_banner = "Databand Tracking Started {version}".format(
            version=dbnd.__version__)
        self.airflow_logger.info(started_banner)

        # open a dbnd context with logging disabled; it is entered manually here
        # and kept on the handler together with its context manager
        no_logs_conf = {"log": {"disabled": True}}
        self.dbnd_context_manage = new_dbnd_context(conf=no_logs_conf)
        self.dbnd_context = self.dbnd_context_manage.__enter__()
Esempio n. 5
0
def new_execute(context):
    """
    This function replaces the operator's original `execute` function.

    For dags that are not eligible for tracking it is a plain pass-through to
    the original execute. Otherwise the original execute runs inside the
    airflow tracking context, and afterwards xcoms and the execute result are
    reported according to `AirflowTrackingConfig`.

    Errors raised while starting tracking, or while reporting xcoms / the
    result, are logged and swallowed. Errors raised by the original execute
    are always re-raised after tracking has been stopped.

    :param context: airflow context; `context["task_instance"]` must be present
    :return: the value returned by the operator's original execute function
    """
    # IMPORTANT!!: copied_operator:
    # ---------------------------------------
    # Airflow copies the task (=operator) when it enters TaskInstance._run_raw_task.
    # From then on only that copy is changed or called (render jinja,
    # signal_handler, pre_execute, execute, etc..).
    task_instance = context["task_instance"]
    copied_operator = task_instance.task

    if not is_dag_eligable_for_tracking(task_instance.dag_id):
        # untracked dag: just run the original execute
        original_execute = get_execute_function(copied_operator)
        return original_execute(copied_operator, context)

    # stays None when tracking could not be started
    task_run = None  # type: Optional[TaskRun]
    try:
        # Set that we are in Airflow tracking mode
        get_dbnd_project_config().set_is_airflow_runtime()

        # start the operator execute run with the current airflow context
        airflow_ctx = extract_airflow_context(context)
        task_run = dbnd_airflow_tracking_start(airflow_context=airflow_ctx)
    except Exception as start_error:
        logger.error(
            "exception caught while running on dbnd new execute {}".format(
                start_error),
            exc_info=True,
        )

    from airflow.exceptions import AirflowRescheduleException

    # run the operator's original execute function
    try:
        with af_tracking_context(task_run, context, copied_operator):
            original_execute = get_execute_function(copied_operator)
            execute_result = original_execute(copied_operator, context)
    except AirflowRescheduleException:
        # a sensor task that is retrying: normal behavior and not really an
        # exception, so the run is stopped without being finalized
        dbnd_tracking_stop(finalize_run=False)
        raise
    except Exception as execute_error:
        # the original execute failed: report the failure when we have a task run
        if task_run:
            failure = TaskRunError.build_from_ex(execute_error, task_run)
            task_run.set_task_run_state(state=TaskRunState.FAILED,
                                        error=failure)

        dbnd_tracking_stop()
        raise

    # with a task run at hand we want to log results and xcoms
    if task_run:
        try:
            tracking_conf = AirflowTrackingConfig.from_databand_context()

            if tracking_conf.track_xcom_values:
                # report xcoms as metrics of the task
                log_xcom(context, tracking_conf)

            if tracking_conf.track_airflow_execute_result:
                # report the result of the execute call
                log_operator_result(task_run, execute_result, copied_operator,
                                    tracking_conf.track_xcom_values)
        except Exception as report_error:
            logger.error(
                "exception caught will tracking airflow operator {}".format(
                    report_error),
                exc_info=True,
            )

    # make sure we close tracking and return the original result
    dbnd_tracking_stop()
    return execute_result
Esempio n. 6
0
from airflow.plugins_manager import AirflowPlugin

from dbnd import get_dbnd_project_config
from dbnd_airflow.tracking.airflow_patching import (
    add_tracking_to_policy,
    patch_airflow_context_vars,
)

# Module-level setup: this runs when the plugin module is imported.
config = get_dbnd_project_config()
# Install the tracking hooks only when `airflow_auto_tracking` is enabled
# in the dbnd project config.
if config.airflow_auto_tracking:
    add_tracking_to_policy()
    patch_airflow_context_vars()


class DbndAutoTracking(AirflowPlugin):
    """Airflow plugin declaration for dbnd auto tracking."""
    # plugin name declared to Airflow
    name = "dbnd_airflow_auto_tracking"