Example #1
def dbnd_on_pre_init_context(ctx):
    from mlflow import get_tracking_uri, set_tracking_uri

    if not config.getboolean("mlflow_tracking", "databand_tracking"):
        return

    databand_url = config.get("core", "databand_url")
    if not databand_url:
        logger.info(
            "Although 'databand_tracking' was set in 'mlflow_tracking', "
            "dbnd will not use it since 'core.databand_url' was not set."
        )
        return

    duplicate_tracking_to = config.get("mlflow_tracking", "duplicate_tracking_to")

    if not duplicate_tracking_to:
        duplicate_tracking_to = get_tracking_uri()

        # check if dbnd store uri was already defined with MLFlow config
        if is_composite_uri(duplicate_tracking_to):
            raise DatabandConfigError(
                "Config conflict: MLFlow and DBND configs both define dbnd store uri"
            )

    composite_uri = build_composite_uri(databand_url, duplicate_tracking_to)

    global _original_mlflow_tracking_uri
    _original_mlflow_tracking_uri = get_tracking_uri()
    set_tracking_uri(composite_uri)
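
The composite-URI helpers used above (`is_composite_uri`, `build_composite_uri`) are not shown in this example. Below is a minimal sketch of what they might do, assuming the composite URI simply carries the duplicate tracking target as a query parameter on the databand URL; the `dbnd+` scheme marker is an assumption for illustration, not dbnd's actual implementation.

# Hypothetical composite-URI helpers; the scheme marker and query layout
# are assumptions, not dbnd's real code.
from six.moves.urllib.parse import urlencode, urlparse

COMPOSITE_SCHEME_PREFIX = "dbnd+"


def is_composite_uri(uri):
    # a URI is "composite" if it already carries the marker scheme
    return urlparse(uri).scheme.startswith(COMPOSITE_SCHEME_PREFIX)


def build_composite_uri(databand_url, duplicate_tracking_to):
    # embed the secondary tracking target as a query parameter
    parsed = urlparse(databand_url)
    query = urlencode({"duplicate_tracking_to": duplicate_tracking_to})
    return "%s%s://%s%s?%s" % (
        COMPOSITE_SCHEME_PREFIX,
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        query,
    )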
Example #2
def dbnd_on_exit_context(ctx):
    if not config.getboolean("mlflow_tracking", "databand_tracking"):
        return

    from mlflow import set_tracking_uri

    global _original_mlflow_tracking_uri
    set_tracking_uri(_original_mlflow_tracking_uri)
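
Together, `dbnd_on_pre_init_context` and `dbnd_on_exit_context` form a save/restore pair around MLflow's global tracking URI. The same pattern can be written as a self-contained context manager; the name below is illustrative, not part of dbnd's API.

from contextlib import contextmanager


@contextmanager
def swapped_tracking_uri(new_uri):
    from mlflow import get_tracking_uri, set_tracking_uri

    # save the current global URI, as the pre-init hook does
    original_uri = get_tracking_uri()
    set_tracking_uri(new_uri)
    try:
        yield
    finally:
        # restore it on exit, as dbnd_on_exit_context does
        set_tracking_uri(original_uri)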
Example #3
def cli():
    dbnd_bootstrap()

    from dbnd import config

    # if we are running from the "dbnd" entrypoint, we probably do not need to load scheduled DAGs;
    # this prevents every airflow command from accessing the dbnd web api
    if config.getboolean("airflow", "auto_disable_scheduled_dags_load"):
        os.environ["DBND_DISABLE_SCHEDULED_DAGS_LOAD"] = "True"
Example #4
    def set_context(self, ti):
        """
        Airflow's log handler uses this method to set up the context when running a TaskInstance (=ti).
        We use this method to set up the dbnd context and communicate information to
        the `<airflow_operator>_execute` task that we create in `execute_tracking.py`.
        """
        # we set up tracking only when we are not in our own orchestration dag
        if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
            return

        if config.getboolean("mlflow_tracking", "databand_tracking"):
            self.airflow_logger.warning(
                "dbnd can't track mlflow and airflow together; please disable the "
                "dbnd config `databand_tracking` in section `mlflow_tracking`")
            return

        # we are not tracking SubDagOperator
        if ti.operator == SubDagOperator.__name__:
            return

        task_key = calc_task_run_attempt_key_from_af_ti(ti)
        env_attempt_uid = os.environ.get(task_key)

        # This key is already set which means we are in --raw run
        if env_attempt_uid:
            # no need for further actions inside --raw run
            return

        # communicate the task_run_attempt_uid to inner processes
        # will be used for the task_run of `<airflow_operator>_execute` task
        self.task_run_attempt_uid = get_uuid()
        self.task_env_key = task_key
        os.environ[self.task_env_key] = str(self.task_run_attempt_uid)

        # airflow calculation for the relevant log_file
        log_relative_path = self.log_file_name_factory(ti, ti.try_number)
        self.log_file = os.path.join(self.airflow_base_log_dir,
                                     log_relative_path)

        # make sure we are not polluting the airflow logs
        get_dbnd_project_config().quiet_mode = True

        # tracking msg
        self.airflow_logger.info(
            "Tracked by Databand {version}".format(version=dbnd.__version__))

        # context with disabled logs
        self.dbnd_context_manage = new_dbnd_context(
            conf={"log": {
                "disabled": True
            }})
        self.dbnd_context = self.dbnd_context_manage.__enter__()
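
On the inner `--raw run` side, the attempt UID travels through the environment under the same per-task key computed by `calc_task_run_attempt_key_from_af_ti`. A sketch of the readback; the function name is illustrative.

import os
import uuid


def read_task_run_attempt_uid(task_key):
    # set_context stored str(self.task_run_attempt_uid) under task_key
    env_value = os.environ.get(task_key)
    return uuid.UUID(env_value) if env_value else None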
Example #5
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks use `dbnd show-tasks` (tab completion is available).
    """

    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # we filter out false flags, since otherwise they would always override the config with their false values
    main_switches = dict(
        databand=filter_dict_remove_false_values(
            dict(
                verbose=verbose > 0,
                describe=describe,
                env=env,
                conf_file=conf_file,
                project_name=project_name,
            )
        ),
        run=filter_dict_remove_false_values(
            dict(
                name=name,
                parallel=parallel,
                description=description,
                is_archived=describe,
            )
        ),
    )

    if submit_driver is not None:
        main_switches["run"]["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        main_switches["run"]["submit_tasks"] = bool(submit_tasks)
    if disable_web_tracker:
        main_switches.setdefault("core", {})["tracker_api"] = "disabled"

    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assuming they are for the main task,
    # and add them to root_task_config
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # double-check bootstrap, as we can be invoked from all kinds of locations;
    # usually we are already bootstrapped, since we run from the cli.
    dbnd_bootstrap()
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()

        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

        # modules are loaded, we can load the task
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # now that the configuration is loaded, we can apply the root task's config to its section
        if task_cls is not None:
            if root_task_config:
                # adding root task to configuration
                config.set_values(
                    {task_cls.task_definition.task_config_section: root_task_config},
                    source="--set-root",
                )

        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )
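
The "first level" promotion loop above is easy to get wrong, so here it is as a standalone, testable sketch (the function name is illustrative): a bare `--set param=value` with no section dot is treated as a parameter of the root task.

import six


def promote_first_level_keys(sets, root_task_config):
    # mirrors the loop in run(): move section-less string values from
    # the --set mappings into the root task's config
    for _set in sets:
        for k, v in list(_set.items()):
            # json-like (non-string) values are left where they are
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]
    return root_task_config


# e.g. `--set alpha=0.5 --set run.parallel=True`
sets = [{"alpha": "0.5", "run.parallel": "True"}]
assert promote_first_level_keys(sets, {}) == {"alpha": "0.5"}
assert sets == [{"run.parallel": "True"}]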
Example #6
def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """

    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # we need to do this before looking for the task cls
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # we filter out false flags, since otherwise they would always override the config with their false values
    main_switches = dict(
        databand=dict(
            verbose=verbose > 0,
            print_task_band=print_task_band,
            describe=describe,
            env=env,
            conf_file=conf_file,
            project=project,
        ),
        run=dict(
            name=name,
            parallel=parallel,
            interactive=interactive,
            description=description,
            is_archived=describe,
            open_web_tracker_in_browser=open_web_tab,
            submit_driver=_nullable_flag(submit_driver),
            submit_tasks=_nullable_flag(submit_tasks),
        ),
        kubernetes=dict(docker_build_tag=docker_build_tag),
        task=dict(task_version=task_version),
        task_build=dict(verbose=True if verbose > 1 else None),
        core=dict(tracker_api="disabled" if disable_web_tracker else None),
    )

    main_switches = cleanup_empty_switches(main_switches)

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assuming they are for the main task,
    # and add them to root_task_config
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    cmd_line_config = parse_and_build_config_store(source="cli",
                                                   config_values=main_switches)
    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(
            _parse_cli(_extend, source="--extend-config", extend=True))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            ))

    # --set-root
    if root_task_config:
        task_cls = task_registry.get_task_cls(task_name)
        task_section = task_cls.task_definition.task_config_section
        # adding root task to configuration
        cmd_line_config.update(
            parse_and_build_config_store(
                config_values={task_section: root_task_config},
                source="--set-root"))

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info("CLI config: \n%s",
                        pformat_config_store_as_table(cmd_line_config))
        config.set_values(cmd_line_config, source="cmdline")

    # double-check bootstrap, as we can be invoked from all kinds of locations;
    # usually we are already bootstrapped, since we run from the cli.
    dbnd_bootstrap()

    # initialize basic logging (until we get to the context logging)
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name,
            scheduled_date=scheduled_date)

    # update completer
    if config.getboolean("databand", "completer"):
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

    # bootstrap and modules are loaded, we can load the task
    task_cls = None
    if task_name:
        task_cls = task_registry.get_task_cls(task_name)

    if not task_name:
        print_help(ctx, None)
        return

    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
            name="run") as context:  # type: DatabandContext
        if context.settings.system.describe:
            # we want to print describe without triggering real run
            logger.info("Building main task '%s'", task_name)
            root_task = get_task_registry().build_dbnd_task(task_name)
            root_task.ctrl.describe_dag.describe_dag()
            # currently there is a bug in the click version we use under python 2,
            # so we don't use the click.echo function
            # https://github.com/pallets/click/issues/564
            print("Task %s has been described!" % task_name)
            return root_task
        return context.dbnd_run_task(
            task_or_task_name=task_name,
            force_task_name=alternative_task_name,
            job_name=job_name or alternative_task_name or task_name,
            run_uid=run_driver or override_run_uid,
            existing_run=run_driver is not None,
            scheduled_run_info=scheduled_run_info,
            project=project,
        )
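
`_nullable_flag` and `cleanup_empty_switches` are used but not shown. A minimal sketch consistent with the call sites, assuming the cleanup drops unset (`None`) values and then empty sections so that flags the user didn't pass never mask file-based configuration:

def _nullable_flag(value):
    # keep the tri-state: None means "flag not passed on the cli"
    return None if value is None else bool(value)


def cleanup_empty_switches(switches):
    # drop None values, then drop sections left empty (assumed behavior)
    cleaned = {}
    for section, values in switches.items():
        kept = {k: v for k, v in values.items() if v is not None}
        if kept:
            cleaned[section] = kept
    return cleaned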
Example #7
    def set_context(self, ti):
        """
        Airflow's log handler uses this method to set up the context when running a TaskInstance (=ti).
        We use this method to set up the dbnd context and communicate information to
        the `<airflow_operator>_execute` task that we create in `execute_tracking.py`.
        """
        # we set up tracking only when we are not in our own orchestration dag
        if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
            return

        if not is_dag_eligable_for_tracking(ti.dag_id):
            return

        if config.getboolean("mlflow_tracking", "databand_tracking"):
            self.airflow_logger.warning(
                "dbnd can't track mlflow and airflow together; please disable the "
                "dbnd config `databand_tracking` in section `mlflow_tracking`")
            return

        # we are not tracking SubDagOperator
        if ti.operator is None or ti.operator == SubDagOperator.__name__:
            return

        # Airflow runs with two processes: `run` and `--raw run`,
        # but we want the handler to run only once (idempotency),
        # so we use an environment variable to sync the two processes
        task_key = calc_task_key_from_af_ti(ti)
        if os.environ.get(task_key, False):
            # This key is already set which means we are in `--raw run`
            return
        else:
            # We are in the outer `run`
            self.task_env_key = task_key
            # marking the environment with the current key for the inner `--raw run` process
            environ_utils.set_on(task_key)
            from dbnd_airflow.tracking.dbnd_airflow_conf import (
                set_dbnd_config_from_airflow_connections,
            )

            # In tracking mode, airflow runs the main process for every task
            # in addition to the `--raw run` one, which used to make some
            # features run twice, once in the `worker` process and once in
            # the `main` one, each time with different configurations.
            # They still run twice, but now with the same configuration.
            set_dbnd_config_from_airflow_connections()

        self.task_run_attempt_uid = get_task_run_attempt_uid_from_af_ti(ti)

        # airflow calculation for the relevant log_file
        log_relative_path = self.log_file_name_factory(ti, ti.try_number)
        self.log_file = os.path.join(self.airflow_base_log_dir,
                                     log_relative_path)

        # make sure we are not polluting the airflow logs
        get_dbnd_project_config().quiet_mode = True

        # tracking msg
        self.airflow_logger.info("Databand Tracking Started {version}".format(
            version=dbnd.__version__))

        # context with disabled logs
        self.dbnd_context_manage = new_dbnd_context(
            conf={"log": {
                "disabled": True
            }})
        self.dbnd_context = self.dbnd_context_manage.__enter__()
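
The `run` / `--raw run` synchronization above boils down to "set an environment marker once; skip if it is already set". A self-contained sketch of that pattern; `set_on` mimics dbnd's `environ_utils` helper, and the wrapper name is illustrative.

import os


def set_on(key):
    # environment variables propagate to child processes, so the inner
    # `--raw run` sees the marker set by the outer `run`
    os.environ[key] = "True"


def setup_once(task_key, setup):
    if os.environ.get(task_key):
        return False  # already done by the outer process
    set_on(task_key)
    setup()
    return True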