def register_dbnd_plugins():
    """Load the ``dbnd`` setuptools entrypoints into the plugin manager once.

    The call is a no-op when the project config has ``is_no_plugins`` set, or
    when an earlier call already flipped the module-level
    ``_dbnd_plugins_registered`` flag.
    """
    global _dbnd_plugins_registered

    if get_dbnd_project_config().is_no_plugins or _dbnd_plugins_registered:
        return

    # flip the flag before doing any work, so later calls return immediately
    _dbnd_plugins_registered = True
    fix_sys_path_str()

    entrypoints_disabled = (
        get_dbnd_project_config().disable_pluggy_entrypoint_loading
    )
    if entrypoints_disabled:
        return

    pm.load_setuptools_entrypoints("dbnd")
    pm.check_pending()
def _default_configuration_paths():
    """Yield the default configuration file paths, in load order.

    Order of the generated paths:
      1. the library config (always yielded, without an existence check)
      2. ``/etc/databand.cfg``
      3. system / project / custom / environment configs
      4. ``~/.dbnd/databand.cfg``
      5. the test config, only in unit test mode

    Everything except the library config is yielded only if it exists on disk.
    """
    # the "library config" is always part of the result
    yield databand_config_path("databand-core.cfg")

    machine_wide_cfg = os.path.expanduser("/etc/databand.cfg")
    if os.path.isfile(machine_wide_cfg):
        yield machine_wide_cfg

    # from here on we look for project level configs
    project_config = get_dbnd_project_config()
    candidates = [
        databand_system_path("databand-system.cfg"),
        project_config.dbnd_project_path("conf", "databand.cfg"),  # deprecated
        project_config.dbnd_project_path("databand.cfg"),  # deprecated
        get_dbnd_custom_config(),
        project_config.dbnd_project_path("project.cfg"),
    ]
    from_environ = get_dbnd_environ_config_file()
    if from_environ:
        candidates.append(from_environ)

    for raw_path in candidates:
        resolved = expand_env_var(raw_path)
        if os.path.isfile(resolved):
            yield resolved

    per_user_cfg = expand_env_var("~/.dbnd/databand.cfg")
    if os.path.isfile(per_user_cfg):
        yield per_user_cfg

    if not is_unit_test_mode():
        return

    # unit tests get one extra config on top of everything else
    tests_cfg = databand_system_path("databand-test.cfg")
    if os.path.exists(tests_cfg):
        yield tests_cfg
def build_init_args(self):
    # type: () -> InitRunArgs
    """Build the ``InitRunArgs`` describing ``self.run`` and its task runs.

    ``new_run_info`` is attached when the run is not an existing one or when
    the project config asks for a resubmit. ``scheduled_run_info`` is attached
    when the run has one. If the run has a root task run uid, the
    (root task run, driver task run) pair is added to both relation maps of
    the task runs info.
    """
    run = self.run
    runs_info = self.build_task_runs_info(run.task_runs)

    init_args = InitRunArgs(
        run_uid=run.run_uid,
        root_run_uid=run.root_run_info.root_run_uid,
        task_runs_info=runs_info,
        driver_task_uid=run.driver_task_run.task_run_uid,
        task_run_env=run.context.task_run_env,
        source=run.source,
        af_context=run.af_context,
    )

    # even an "existing" run may really be a fresh submission: when running
    # from Airflow the root airflow job has no info, so we capture the
    # "real" info of the run here
    needs_run_info = (
        not run.existing_run or get_dbnd_project_config().resubmit_run
    )
    if needs_run_info:
        init_args.new_run_info = self._run_to_run_info()

    scheduled = run.scheduled_run_info
    if scheduled:
        init_args.scheduled_run_info = scheduled

    root_task_run_uid = run.root_run_info.root_task_run_uid
    if root_task_run_uid:
        # link the root task run to the driver task run in both maps
        edge = (root_task_run_uid, init_args.driver_task_uid)
        runs_info.parent_child_map.add(edge)
        runs_info.upstreams_map.add(edge)

    return init_args
def tracking_start_base(job_name, project_name=None, airflow_context=None):
    """
    Start the handler that tracks the currently running script.

    A new handler is not started if a script manager already exists.

    Returns the root task run of the active script manager. Returns None when
    dbnd is disabled, when starting the manager failed, or when no manager is
    active.
    """
    global _dbnd_script_manager

    project_config = get_dbnd_project_config()
    if project_config.disabled:
        # nothing is tracked while dbnd is disabled
        return None

    if not _dbnd_script_manager:
        # mark the context as "tracking" to prevent conflicts with dbnd orchestration
        project_config._dbnd_tracking = True

        manager = _DbndScriptTrackingManager()
        try:
            # the job name serves as both job_name and root_task_name of the run
            manager.start(job_name, project_name, airflow_context)
            if manager._active:
                _dbnd_script_manager = manager
        except Exception:
            _handle_tracking_error("dbnd-tracking-start")

            # disable the project so no new handler is started in this execution
            project_config.disabled = True
            return None

    current_manager = _dbnd_script_manager
    if not (current_manager and current_manager._active):
        return None

    # the root task run of the tracking; it represents the script context
    return current_manager._task_run
def __init__(self, task_cls):
    """Wrap ``task_cls`` and initialise the call bookkeeping.

    The function stored in the task class' decorator spec is kept as
    ``task_func``, and its metadata is copied onto this object.
    """
    self.task_cls = task_cls  # type: Type[Task]

    # copying the function metadata makes this object look like the original function
    wrapped_func = task_cls._conf__decorator_spec.item
    self.task_func = wrapped_func
    functools.update_wrapper(self, wrapped_func)

    # set after update_wrapper, so these win over anything it copied over
    self._call_count, self._call_as_func = 0, False
    self._max_call_count = get_dbnd_project_config().max_calls_per_run
def dbnd_run_start(name=None):
    """Start the inplace run manager if needed and return its task run.

    Returns None when dbnd is disabled, when starting the manager raised
    (dbnd is then disabled for the rest of the execution), or when no manager
    is active.
    """
    global _dbnd_inline_manager

    if get_dbnd_project_config().disabled:
        return None

    if not _dbnd_inline_manager:
        manager = _DbndInplaceRunManager()
        try:
            manager.start(root_task_name=name)
            if manager._active:
                _dbnd_inline_manager = manager
        except Exception:
            _handle_inline_run_error("inline-start")
            # do not attempt another start during this execution
            get_dbnd_project_config().disabled = True
            return None

    current_manager = _dbnd_inline_manager
    if not (current_manager and current_manager._active):
        return None
    return current_manager._task_run
def is_verbose():
    """Return True when the current context or the project config is verbose."""
    context = try_get_databand_context()
    settings = getattr(context, "system_settings", None) if context else None

    if settings and settings.verbose:
        # only a True on the context is final; otherwise ask the project config too
        return True

    return get_dbnd_project_config().is_verbose()
def __init__(self, task_decorator):
    # type: (CallableTrackingManager, TaskDecorator) -> None
    """Store the task decorator and reset the call bookkeeping."""
    self.task_decorator = task_decorator
    self._tracking_task_definition = None

    # call bookkeeping; the limit comes from the project config
    self._call_count, self._call_as_func = 0, False
    self._max_call_count = get_dbnd_project_config().max_calls_per_run
def dbnd_run_start(name=None, airflow_context=None):
    """Start the inplace run manager if needed and return its task run.

    Returns None when dbnd is disabled, when starting the manager raised
    (the error is logged and dbnd is disabled for the rest of the execution),
    or when no manager is active.
    """
    global _dbnd_inline_manager

    if get_dbnd_project_config().disabled:
        return None

    if not _dbnd_inline_manager:
        manager = _DbndInplaceRunManager()
        try:
            manager.start(name, airflow_context)
            if manager._active:
                _dbnd_inline_manager = manager
        except Exception as start_error:
            logger.error(start_error, exc_info=True)
            _handle_inline_run_error("inline-start")
            # do not attempt another start during this execution
            get_dbnd_project_config().disabled = True
            return None

    current_manager = _dbnd_inline_manager
    if not (current_manager and current_manager._active):
        return None
    return current_manager._task_run
def _is_airflow_enabled():
    """Return True when the dbnd-airflow integration can be used.

    False when the project config has ``is_no_modules`` set. Otherwise True if
    the ``dbnd-airflow`` plugin is registered or ``dbnd_airflow`` can be
    imported.
    """
    if get_dbnd_project_config().is_no_modules:
        return False

    if pm.has_plugin("dbnd-airflow"):
        return True

    # TODO: make decision based on plugin only
    try:
        import dbnd_airflow  # noqa: F401
    except ImportError:
        return False
    return True
def start(self, root_task_name=None, airflow_context=None):
    """Create the inplace tracking context and run, and return the root task run.

    Does nothing (returns None) when this manager already has a run, is
    already active, or a databand run already exists.

    :param root_task_name: name passed to ``_build_inline_root_task`` when
        there is no airflow context.
    :param airflow_context: optional airflow context; when falsy, the one
        provided by the dbnd project config is used instead.
    :return: the root task run of the newly created run.
    """
    if self._run or self._active or try_get_databand_run():
        return

    # we probably should use only airflow context via parameter.
    # also, there are mocks that cover only get_dbnd_project_config().airflow_context
    airflow_context = airflow_context or get_dbnd_project_config().airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    # contexts entered through _enter_cm are kept by the manager
    # (presumably exited on stop - not visible in this block)
    dc = self._enter_cm(
        new_dbnd_context(name="inplace_tracking")
    )  # type: DatabandContext

    if airflow_context:
        # the root task, job name and source are all derived from the airflow context
        root_task, job_name, source = build_run_time_airflow_task(airflow_context)
    else:
        root_task = _build_inline_root_task(root_task_name)
        job_name = root_task.task_name
        source = UpdateSource.dbnd

    self._run = run = self._enter_cm(
        new_databand_run(
            context=dc,
            job_name=job_name,
            existing_run=False,
            source=source,
            af_context=airflow_context,
        )
    )  # type: DatabandRun
    self._run.root_task = root_task

    # register the exit handler only once per manager
    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True

    # replaces the process-wide excepthook with this manager's handler
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    root_task_run = run._build_and_add_task_run(root_task)
    root_task_run.is_root = True

    # No need to track the state because we track in init_run
    run.root_task_run.set_task_run_state(TaskRunState.RUNNING, track=False)
    run.tracker.init_run()

    self._enter_cm(run.root_task_run.runner.task_run_execution_context())
    self._task_run = run.root_task_run

    return self._task_run
def __init__(self, func_spec, task_type, task_defaults):
    # type: (TaskClsBuilder, _TaskDecoratorSpec, Type[_DecoratedTask], Any) -> None
    """Store the decorator inputs and initialise the lazy and call state.

    Both task class slots start empty, and the metadata of ``self.func`` is
    copied onto this object.
    """
    self.func_spec = func_spec
    self.task_type = task_type
    self.task_defaults = task_defaults

    # no task class has been built yet
    self._normal_task_cls = self._tracking_task_cls = None

    # this makes the object look like the original function
    # (``self.func`` is not defined in this block - presumably a property on the class)
    functools.update_wrapper(self, self.func)

    # set after update_wrapper, so these win over anything it copied over
    self._call_count, self._call_as_func = 0, False
    self._max_call_count = get_dbnd_project_config().max_calls_per_run
    self._callable_item = None
def dbnd_run_start(name=None, airflow_context=None):
    """Start the script tracking manager if needed and return its task run.

    Returns None when dbnd is disabled, when starting the manager raised
    (the error is logged and dbnd is disabled for the rest of the execution),
    or when no manager is active.
    """
    global _dbnd_script_manager

    project_config = get_dbnd_project_config()
    if project_config.disabled:
        return None

    if not _dbnd_script_manager:
        project_config._dbnd_tracking = True

        manager = _DbndScriptTrackingManager()
        try:
            manager.start(name, airflow_context)
            if manager._active:
                _dbnd_script_manager = manager
        except Exception as start_error:
            logger.error(start_error, exc_info=True)
            _handle_tracking_error("dbnd-tracking-start")
            # do not attempt another start during this execution
            project_config.disabled = True
            return None

    current_manager = _dbnd_script_manager
    if not (current_manager and current_manager._active):
        return None
    return current_manager._task_run
def _is_verbose():
    """Return the verbosity reported by the dbnd project config."""
    return get_dbnd_project_config().is_verbose()
def try_get_inplace_tracking_task_run():
    # type: ()->Optional[TaskRun]
    """Return the result of ``dbnd_tracking_start()`` in tracking mode, else None."""
    if not get_dbnd_project_config().is_tracking_mode():
        return None
    return dbnd_tracking_start()
def start(self, root_task_name=None, project_name=None, airflow_context=None):
    """Create the inplace tracking context and run, and return the root task run.

    Does nothing (returns None) when this manager already has a run, is
    already active, or a databand run already exists.

    :param root_task_name: name of the root task; defaults to the last path
        component of ``sys.argv[0]``.
    :param project_name: forwarded to ``new_databand_run``.
    :param airflow_context: optional airflow context; when falsy, the one
        provided by the dbnd project config is used instead.
    :return: the root task run of the newly created run.
    """
    if self._run or self._active or try_get_databand_run():
        return

    # we probably should use only airflow context via parameter.
    # also, there are mocks that cover only get_dbnd_project_config().airflow_context
    airflow_context = airflow_context or get_dbnd_project_config(
    ).airflow_context()
    if airflow_context:
        _set_dbnd_config_from_airflow_connections()

    _set_tracking_config_overide(airflow_context=airflow_context)
    # contexts entered through _enter_cm are kept by the manager
    # (presumably exited on stop - not visible in this block)
    dc = self._enter_cm(
        new_dbnd_context(name="inplace_tracking"))  # type: DatabandContext

    if not root_task_name:
        # extract the name of the script we are running (in Airflow scenario it will be just "airflow")
        root_task_name = sys.argv[0].split(os.path.sep)[-1]

    if airflow_context:
        # with an airflow context the run uid and try number come from airflow
        root_task, job_name, source, run_uid = build_run_time_airflow_task(
            airflow_context, root_task_name)
        try_number = airflow_context.try_number
    else:
        root_task = _build_inline_root_task(root_task_name)
        job_name = root_task_name
        source = UpdateSource.generic_tracking
        run_uid = None
        try_number = 1

    tracking_source = (
        None  # TODO_CORE build tracking_source -> typeof TrackingSourceSchema
    )
    # the run counts as "existing" only when a run uid was provided (airflow branch)
    self._run = run = self._enter_cm(
        new_databand_run(
            context=dc,
            job_name=job_name,
            run_uid=run_uid,
            existing_run=run_uid is not None,
            source=source,
            af_context=airflow_context,
            tracking_source=tracking_source,
            project_name=project_name,
        ))  # type: DatabandRun
    self._run.root_task = root_task

    self.update_run_from_airflow_context(airflow_context)

    # register the exit handler only once per manager
    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True

    # replaces the process-wide excepthook with this manager's handler
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    root_task_run = run._build_and_add_task_run(
        root_task, task_af_id=root_task.task_name, try_number=try_number)

    root_task_run.is_root = True

    run.tracker.init_run()
    run.root_task_run.set_task_run_state(TaskRunState.RUNNING)

    # log capturing is controlled by the tracking config of the current context
    should_capture_log = TrackingConfig.from_databand_context(
    ).capture_tracking_log
    self._enter_cm(
        run.root_task_run.runner.task_run_execution_context(
            capture_log=should_capture_log, handle_sigterm=False))
    self._task_run = run.root_task_run

    return self._task_run
def _call_handler(task_cls, call_user_code, call_args, call_kwargs):
    """
    -= Use "Step into My Code" to get back from Databand code! =-

    Handle the call/creation of a decorated object ( my_func(), MyDecoratedTask() ).

    The call is dispatched by the first matching rule:

    1. ``__force_invoke`` was passed or dbnd is disabled -> call the user code directly
    2. we are in Airflow DAG build context -> build the task for the DAG
    3. there is no current task (not even from inplace tracking) -> plain invoke
    4. current phase is BUILD -> create the task; return its result
       (single result output) or the task object itself
    5. current phase is RUN and the current task has dynamic tasks enabled
       and supports them -> create and run a dynamic task
    6. anything else -> plain invoke

    Note: ``__force_invoke`` is popped from ``call_kwargs``, i.e. the dict
    passed by the caller is modified.

    :param task_cls: the task class behind the decorated object.
    :param call_user_code: callable running the original user code.
    :param call_args: positional arguments of the call.
    :param call_kwargs: keyword arguments of the call.
    """
    force_invoke = call_kwargs.pop("__force_invoke", False)
    dbnd_project_config = get_dbnd_project_config()

    if force_invoke or dbnd_project_config.disabled:
        # 1. Databand is not enabled
        # 2. we have this call coming from Task.run / Task.band direct invocation
        return call_user_code(*call_args, **call_kwargs)
    func_call = FuncCall(
        task_cls=task_cls,
        call_args=call_args,
        call_kwargs=call_kwargs,
        call_user_code=call_user_code,
    )

    if is_in_airflow_dag_build_context(
    ):  # we are in Airflow DAG building mode
        return build_task_at_airflow_dag_context(task_cls=task_cls,
                                                 call_args=call_args,
                                                 call_kwargs=call_kwargs)

    current = try_get_current_task()
    if not current:
        # imported inside the function rather than at module level
        # (presumably to avoid a circular import - TODO confirm)
        from dbnd._core.tracking.script_tracking_manager import (
            try_get_inplace_tracking_task_run,
        )

        # no current task: fall back to the task of the inplace tracking run, if any
        task_run = try_get_inplace_tracking_task_run()
        if task_run:
            current = task_run.task

    if not current:  # direct call to the function
        return func_call.invoke()

    ######
    # current is not None, and we are not in tracking/airflow/luigi
    # DBND Orchestration mode
    # we can be in the context of .run() or in .band()
    # called from user code using some_func()  or SomeTask()
    # this call path is not coming from _invoke_func
    phase = current_phase()
    if phase is TaskContextPhase.BUILD:
        # we are in the @pipeline context, we are building execution plan
        t = task_cls(*call_args, **call_kwargs)

        # we are in inline debug mode -> we are going to execute the task
        # we are in the band
        # and want to return result of the object
        if t.task_definition.single_result_output:
            return t.result

        # we have multiple outputs ( result, another output.. )
        # -> just return task object
        return t

    if phase is TaskContextPhase.RUN:
        # we are in the run function!
        if (current.settings.dynamic_task.enabled
                and current.task_supports_dynamic_tasks):
            # isinstance() check required to prevent infinite recursion when @task is on
            # class and not on func (example: see test_task_decorated_class.py)
            # and the current task supports inline calls
            # that's extra mechanism in addition to __force_invoke
            # on pickle/unpickle isinstance fails to run.
            return create_and_run_dynamic_task_safe(func_call=func_call,
                                                    parent_task_run=current)

    # we can not call it in "databand" way, fallback to normal execution
    return func_call.invoke()
def _is_tracking_mode():
    """Return whether the dbnd project config reports tracking mode."""
    project_config = get_dbnd_project_config()
    return project_config.is_tracking_mode()