Beispiel #1
0
def get_value_meta_from_value(name, value, meta_conf):
    """Build the value meta for ``value`` with full value reporting enabled.

    The tracking config is taken from the current databand context and its
    ``value_reporting_strategy`` is set to ``ValueTrackingLevel.ALL`` before
    it is handed to ``get_value_meta``.

    Args:
        name: not used by this function; kept for interface compatibility.
        value: the value to describe.
        meta_conf: meta configuration forwarded to ``get_value_meta``.

    Returns:
        Whatever ``get_value_meta`` returns for the given value.
    """
    from dbnd._core.settings import TrackingConfig
    from dbnd._core.settings.tracking_config import ValueTrackingLevel, get_value_meta

    tracking_config = TrackingConfig.from_databand_context()
    # NOTE(review): this assigns on the object returned by the context lookup;
    # whether that object is shared with other callers is not visible here.
    tracking_config.value_reporting_strategy = ValueTrackingLevel.ALL

    value_meta = get_value_meta(
        value, meta_conf, tracking_config=tracking_config)
    return value_meta
Beispiel #2
0
def log_sql_targets(conn_str, path, sql_query, is_succeed):
    """Log every target operation extracted from ``sql_query``.

    Each operation produced by ``extract_from_sql(path, sql_query)`` is copied
    with ``success=is_succeed`` and then logged either as a snowflake table
    target or as a generic dataset operation. Any exception is logged and
    swallowed, so this function never raises ``Exception`` to its caller.

    Args:
        conn_str: connection string passed to ``log_snowflake_table_targets``.
        path: passed through to ``extract_from_sql``.
        sql_query: the SQL text the targets are extracted from.
        is_succeed: success flag stamped on every extracted operation.
    """
    try:
        for extracted_op in extract_from_sql(path, sql_query):
            op = attr.evolve(extracted_op, success=is_succeed)

            # The three checks are chained with `and` so that the tracking
            # config is only looked up for snowflake paths, exactly as before.
            is_snowflake = op.path.startswith("snowflake")
            reports_all_values = is_snowflake and (
                TrackingConfig.from_databand_context().value_reporting_strategy
                == ValueTrackingLevel.ALL)
            is_trackable_table = (reports_all_values
                                  and not op.name.startswith("@"))

            if not is_trackable_table:
                # The target may be something other than a table, e.g. an S3
                # file that is used as part of the sql query.
                log_dataset_op(
                    op_path=op.path,
                    op_type=op.operation,
                    success=op.success,
                )
                continue

            # 1) Snowflake tables are lazily evaluated, so they are logged
            #    only when the reporting strategy is ALL.
            # 2) Names starting with "@" (staging) are not supported yet.
            log_snowflake_table_targets(table_op=op,
                                        connection_string=conn_str)
    except Exception:
        logger.exception(
            "Caught exception will trying to log target for sql_query")
Beispiel #3
0
def _hide_or_show_by_config(source):
    """Return ``source`` only when source-code tracking is enabled.

    Reads ``track_source_code`` from the tracking config of the current
    databand context; when it is falsy the source is hidden and ``None`` is
    returned instead.
    """
    from dbnd._core.settings import TrackingConfig

    show_source = TrackingConfig.from_databand_context().track_source_code
    return source if show_source else None
def tracking_config_empty():
    """Return a ``TrackingConfig`` built while the "tracking" section is empty.

    The "tracking" config section is forced to an empty mapping for the
    duration of the construction so that changes in config files won't affect
    tests.
    """
    empty_tracking_section = {"tracking": {}}
    merge_settings = _ConfigMergeSettings(replace_section=True)

    with config(
            empty_tracking_section,
            source="dbnd_test_context",
            merge_settings=merge_settings,
    ):
        # Must be instantiated while the override is still active.
        tracking_config = TrackingConfig()
    return tracking_config
Beispiel #5
0
def _get_databand_url():
    """Return the databand URL, preferring the external one.

    Uses ``TrackingConfig().databand_external_url`` when it is truthy and
    falls back to ``CoreConfig().databand_url`` otherwise.

    Returns:
        The selected URL, or ``None`` when reading the configuration raised
        an exception (the lookup is best-effort and never raises
        ``Exception``).
    """
    try:
        return (TrackingConfig().databand_external_url
                or CoreConfig().databand_url)
    except Exception:
        # Best-effort lookup: a failure simply means "no URL available".
        return None
def tracking_config_empty():
    """Return a ``TrackingConfig`` built from a fixed "tracking" section.

    The "tracking" section is supplied via ``replace_section_with`` and
    contains only ``value_reporting_strategy = ValueTrackingLevel.ALL``, so
    that changes in config files won't affect tests.
    """
    # presumably replaces the whole section instead of merging - verify
    # against replace_section_with
    tracking_section = replace_section_with(
        {"value_reporting_strategy": ValueTrackingLevel.ALL})

    with config({"tracking": tracking_section}, source="dbnd_test_context"):
        # Must be instantiated while the override is still active.
        tracking_config = TrackingConfig()
    return tracking_config
def tracking_config_force_true():
    """Return a ``TrackingConfig`` after forcing value tracking options on.

    Sets, with ``ConfigValuePriority.OVERRIDE`` priority, the "tracking"
    options ``value_reporting_strategy`` (to ``ValueTrackingLevel.ALL``),
    ``log_value_stats`` and ``log_histograms`` (both to ``True``), so that
    changes in config files won't affect tests.
    """
    forced_options = (
        ("value_reporting_strategy", ValueTrackingLevel.ALL),
        ("log_value_stats", True),
        ("log_histograms", True),
    )
    for option_name, forced_value in forced_options:
        config.set(
            "tracking",
            option_name,
            forced_value,
            priority=ConfigValuePriority.OVERRIDE,
        )
    return TrackingConfig()
Beispiel #8
0
    def start(self,
              root_task_name=None,
              project_name=None,
              airflow_context=None):
        """Start in-place tracking: create the run and its root task run.

        Does nothing and returns ``None`` when ``self._run`` or
        ``self._active`` is already set, or when ``try_get_databand_run()``
        returns a truthy value.

        Args:
            root_task_name: name of the root task; when falsy, the last path
                component of ``sys.argv[0]`` is used.
            project_name: forwarded to ``new_databand_run``.
            airflow_context: Airflow context to track under; when falsy, the
                result of ``get_dbnd_project_config().airflow_context()`` is
                used instead.

        Returns:
            The root task run (also stored in ``self._task_run``), or ``None``
            on the early exit described above.
        """
        # Already started here, or some databand run already exists: no-op.
        if self._run or self._active or try_get_databand_run():
            return

        # We probably should use only the Airflow context passed via parameter.
        # Also, there are mocks that cover only
        # get_dbnd_project_config().airflow_context
        airflow_context = airflow_context or get_dbnd_project_config(
        ).airflow_context()
        if airflow_context:
            _set_dbnd_config_from_airflow_connections()

        _set_tracking_config_overide(airflow_context=airflow_context)
        # Context managers entered through self._enter_cm are kept open after
        # this method returns.
        # NOTE(review): presumably they are closed on stop - confirm.
        dc = self._enter_cm(
            new_dbnd_context(name="inplace_tracking"))  # type: DatabandContext

        if not root_task_name:
            # extract the name of the script we are running (in Airflow scenario it will be just "airflow")
            root_task_name = sys.argv[0].split(os.path.sep)[-1]

        if airflow_context:
            # Airflow: task, job name, source and run uid all come from the
            # Airflow context.
            root_task, job_name, source, run_uid = build_run_time_airflow_task(
                airflow_context, root_task_name)
            try_number = airflow_context.try_number
        else:
            # Plain script: the job is named after the root task and no run
            # uid is known in advance.
            root_task = _build_inline_root_task(root_task_name)
            job_name = root_task_name
            source = UpdateSource.generic_tracking
            run_uid = None
            try_number = 1

        tracking_source = (
            None  # TODO_CORE build tracking_source -> typeof TrackingSourceSchema
        )
        # `existing_run` is True only when a run uid was provided (Airflow
        # branch above).
        self._run = run = self._enter_cm(
            new_databand_run(
                context=dc,
                job_name=job_name,
                run_uid=run_uid,
                existing_run=run_uid is not None,
                source=source,
                af_context=airflow_context,
                tracking_source=tracking_source,
                project_name=project_name,
            ))  # type: DatabandRun

        self._run.root_task = root_task

        self.update_run_from_airflow_context(airflow_context)

        # Register self.stop as the process-exit handler only once per
        # instance.
        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        # Replaces the interpreter-wide exception hook with
        # self.stop_on_exception.
        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        root_task_run = run._build_and_add_task_run(
            root_task, task_af_id=root_task.task_name, try_number=try_number)

        root_task_run.is_root = True

        run.tracker.init_run()
        run.root_task_run.set_task_run_state(TaskRunState.RUNNING)

        # Log capturing is controlled by the `capture_tracking_log` tracking
        # setting.
        should_capture_log = TrackingConfig.from_databand_context(
        ).capture_tracking_log
        self._enter_cm(
            run.root_task_run.runner.task_run_execution_context(
                capture_log=should_capture_log, handle_sigterm=False))
        self._task_run = run.root_task_run

        return self._task_run
def tracking_config_force_false():
    """Return a ``TrackingConfig`` after forcing value stats options off.

    Sets the "tracking" options ``log_value_stats`` and ``log_histograms`` to
    ``False`` with ``override=True``, so that changes in config files won't
    affect tests.
    """
    for option_name in ("log_value_stats", "log_histograms"):
        config.set("tracking", option_name, False, override=True)
    return TrackingConfig()
Beispiel #10
0
    def tracking_context(self, call_args, call_kwargs):
        """Generator that wraps one call of the tracked callable in a task run.

        Presumably driven as a context manager (the decorator is outside this
        view): the user code runs while the generator is suspended at its
        ``yield``.

        Yields:
            ``func_call.set_result`` when tracking was set up successfully, so
            the caller can report the function result; otherwise
            ``_do_nothing_decorator`` (call limit exceeded, no in-place
            tracking task run available, or tracking initialization failed).

        Args:
            call_args: positional arguments of the tracked call (copied into a
                tuple before use).
            call_kwargs: keyword arguments of the tracked call (copied into a
                dict before use).

        Raises:
            Any exception raised by the user code is re-raised after the task
            run has been marked FAILED. Errors in the tracking code itself are
            logged and not raised.

        NOTE(review): the early ``yield _do_nothing_decorator`` statements are
        inside the outer ``try``. If user code raises while the generator is
        suspended there, the exception is handled as a tracking error
        (``user_code_called`` is still False) and the final ``yield`` is
        reached, i.e. the generator yields a second time. Under
        ``contextlib.contextmanager`` that would surface as a RuntimeError -
        confirm how this generator is driven.
        """
        user_code_called = False  # whether we got to executing of user code
        user_code_finished = False  # whether we passed executing of user code
        func_call = None
        try:
            # 1. check that we don't have too many calls
            if self._call_count_limit_exceeded():
                yield _do_nothing_decorator
                return

            # 2. Start or reuse existing "main tracking task" that is root for tracked tasks
            if not try_get_current_task():
                """
                try to get existing task, and if not exists - try to get/create inplace_task_run
                """
                from dbnd._core.tracking.script_tracking_manager import (
                    try_get_inplace_tracking_task_run, )

                inplace_tacking_task = try_get_inplace_tracking_task_run()
                if not inplace_tacking_task:
                    # we didn't manage to start inplace tracking task run, we will not be able to track
                    yield _do_nothing_decorator
                    return

            tracking_task_definition = self.get_tracking_task_definition()
            callable_spec = tracking_task_definition.task_decorator.get_callable_spec(
            )

            func_call = TrackedFuncCallWithResult(
                callable=self.callable,
                call_args=tuple(
                    call_args),  # prevent original call_args modification
                call_kwargs=dict(
                    call_kwargs),  # prevent original kwargs modification
            )
            # replace any positional argument with a kwarg where possible
            args, kwargs = args_to_kwargs(callable_spec.args,
                                          func_call.call_args,
                                          func_call.call_kwargs)

            # instantiate inline task
            task = TrackingTask.for_func(tracking_task_definition, args,
                                         kwargs)

            # update upstream/downstream relations - needed for correct tracking.
            # the task may already be registered as an upstream if it was
            # executed before, so add it only when it is missing
            parent_task = current_task_run().task
            if not parent_task.task_dag.has_upstream(task):
                parent_task.set_upstream(task)

            # checking if any of the inputs are the outputs of previous task.
            # we can add that task as upstream.
            dbnd_run = get_databand_run()
            call_kwargs_as_targets = dbnd_run.target_origin.get_for_map(kwargs)
            for value_origin in call_kwargs_as_targets.values():
                up_task = value_origin.origin_target.task
                task.set_upstream(up_task)

            # creating task_run as a task we found mid-run
            task_run = dbnd_run.create_task_run_at_execution_time(
                task, task_engine=current_task_run().task_engine)

            # log capturing is controlled by the `capture_tracking_log`
            # tracking setting
            should_capture_log = (
                TrackingConfig.from_databand_context().capture_tracking_log)
            with task_run.runner.task_run_execution_context(
                    handle_sigterm=True, capture_log=should_capture_log):
                task_run.set_task_run_state(state=TaskRunState.RUNNING)

                _log_inputs(task_run)

                # if we reached this line, then all tracking initialization is
                # finished successfully, and we're going to execute user code
                user_code_called = True

                try:
                    # tracking_context is context manager - user code will run on yield
                    yield func_call.set_result

                    # if we reached this line, this means that user code finished
                    # successfully without any exceptions
                    user_code_finished = True
                # We catch BaseException since we want to catch KeyboardInterrupts as well
                except BaseException as ex:
                    task_run.finished_time = utcnow()

                    error = TaskRunError.build_from_ex(ex, task_run)
                    task_run.set_task_run_state(TaskRunState.FAILED,
                                                error=error)
                    raise

                else:
                    task_run.finished_time = utcnow()

                    # func_call.result should contain result, log it
                    _log_result(task_run, func_call.result)

                    task_run.set_task_run_state(TaskRunState.SUCCESS)
        except BaseException:
            if user_code_called and not user_code_finished:
                # if we started to call the user code and did not get to the
                # user_code_finished line - it means there was a user code
                # exception - so just re-raise it
                raise
            # otherwise we either did not reach the user code call, or already
            # passed it - so it is some dbnd tracking error - just log it
            if func_call:
                _handle_tracking_error("tracking-init", func_call)
            else:
                log_exception_to_server()
        # if we didn't reach the user_code_called=True line - there was an error
        # during dbnd tracking initialization, so nothing is done - the user
        # function wasn't called yet
        if not user_code_called:
            # tracking_context is context manager - user code will run on yield
            yield _do_nothing_decorator
            return