def _sig(task):
    with new_databand_run(context=get_databand_context(), job_name=task.task_name):
        for child in [task] + list(task.descendants.get_children()):
            name = "signature %s" % child.task_name
            logger.info(child.ctrl.banner(name))

    return task.task_signature
def _enter_databand_run(self, scheduled_tasks=None):
    self.init_context()
    if scheduled_tasks and not self.root_dbnd_task:
        # This code is reached only when an orphaned task is executed
        self._root_dbnd_task = self.get_dbnd_task(first(scheduled_tasks.values()))

    self._databand_run = self._enter_cm(
        new_databand_run(
            context=self._databand_context, task_or_task_name=self.root_dbnd_task
        )
    )

    # Start the tracking process
    self._databand_run._init_without_run()
    self._driver_task_run = self._databand_run.driver_task_run

    print_tasks_tree(
        self._databand_run.root_task_run.task, self._databand_run.task_runs
    )

    self._enter_cm(self._driver_task_run.runner.task_run_execution_context())

    for task in self._missing_dep:
        task.current_task_run.set_task_run_state(state=TaskRunState.UPSTREAM_FAILED)

    if not self.get_non_finished_sub_tasks():
        self.finish_run(TaskRunState.FAILED)
    else:
        self._driver_task_run.set_task_run_state(state=TaskRunState.RUNNING)
def dbnd_run_task(
    self,
    task_or_task_name,
    run_uid=None,
    scheduled_run_info=None,
    send_heartbeat=True,
):
    # type: (Optional[Task,str], Optional[UUID], ScheduledRunInfo, bool) -> DatabandRun
    """
    This is the main entry point to run a task in "dbnd orchestration" mode.
    Called from `dbnd run`, we create a new Run + RunExecutor and trigger the execution.

    :param task_or_task_name: task name to run, or an already built task object
    :return: DatabandRun
    """
    job_name = get_task_name_safe(task_or_task_name)
    with new_databand_run(
        context=self,
        job_name=job_name,
        run_uid=run_uid,
        scheduled_run_info=scheduled_run_info,
        is_orchestration=True,
    ) as run:  # type: DatabandRun
        # this is the main entry point to run some task in "orchestration" mode
        run.run_executor = RunExecutor(
            run=run,
            root_task_or_task_name=task_or_task_name,
            send_heartbeat=send_heartbeat,
        )
        run.run_executor.run_execute()
        return run
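
# --- usage sketch (not from the source) -----------------------------------
# A minimal illustration of how the orchestration entry point above might be
# driven programmatically rather than through the `dbnd run` CLI.
# Assumptions: get_databand_context() (seen in other snippets here) returns
# the object this method is defined on, and MyTask is a hypothetical
# user-defined dbnd task.
def _run_my_task_sketch():
    context = get_databand_context()
    # job_name is derived from the task, a new Run + RunExecutor is created,
    # and run_execute() drives the task graph to completion
    run = context.dbnd_run_task(task_or_task_name=MyTask(some_param=1))
    return run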
def _sig(task):
    name = "signature %s" % task.task_name
    with new_databand_run(context=get_databand_context(), task_or_task_name=task):
        logger.info(task.ctrl.banner(name))
        for child in task.task_meta.get_children():
            logger.info(child.ctrl.banner(name))

    return task.task_signature
def start(
    self,
    root_task_name,
    in_memory=True,
    run_uid=None,
    airflow_context=False,
    job_name=None,
):
    if try_get_databand_context():
        return

    if not airflow_context and not self._atexit_registered:
        atexit.register(self.stop)
        if is_airflow_enabled():
            from airflow.settings import dispose_orm

            atexit.unregister(dispose_orm)

    c = {
        # we don't want to "check" as script is task_version="now"
        "run": {"skip_completed": False},
        # do not save any outputs
        "task": {"task_in_memory_outputs": in_memory},
    }
    config.set_values(config_values=c, override=True, source="dbnd_start")

    context_kwargs = {"name": "airflow"} if airflow_context else {}
    # create databand context
    dc = self._enter_cm(new_dbnd_context(**context_kwargs))  # type: DatabandContext

    root_task = _build_inline_root_task(root_task_name, airflow_context=airflow_context)

    # create databand run
    dr = self._enter_cm(
        new_databand_run(
            context=dc,
            task_or_task_name=root_task,
            run_uid=run_uid,
            existing_run=False,
            job_name=job_name,
        )
    )  # type: DatabandRun

    if run_uid:
        root_task_run_uid = get_task_run_uid(run_uid, root_task_name)
    else:
        root_task_run_uid = None
    dr._init_without_run(root_task_run_uid=root_task_run_uid)

    self._start_taskrun(dr.driver_task_run)
    self._start_taskrun(dr.root_task_run)
    return dr
def start(self, root_task_name=None, airflow_context=None):
    if self._run or self._active or try_get_databand_run():
        return

    # we should probably use only the airflow context passed via parameter.
    # also, there are mocks that cover only get_dbnd_project_config().airflow_context
    airflow_context = airflow_context or get_dbnd_project_config().airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    dc = self._enter_cm(
        new_dbnd_context(name="inplace_tracking")
    )  # type: DatabandContext

    if airflow_context:
        root_task, job_name, source = build_run_time_airflow_task(airflow_context)
    else:
        root_task = _build_inline_root_task(root_task_name)
        job_name = root_task.task_name
        source = UpdateSource.dbnd

    self._run = run = self._enter_cm(
        new_databand_run(
            context=dc,
            job_name=job_name,
            existing_run=False,
            source=source,
            af_context=airflow_context,
        )
    )  # type: DatabandRun
    self._run.root_task = root_task

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    root_task_run = run._build_and_add_task_run(root_task)
    root_task_run.is_root = True

    # No need to track the state because we track in init_run
    run.root_task_run.set_task_run_state(TaskRunState.RUNNING, track=False)
    run.tracker.init_run()

    self._enter_cm(run.root_task_run.runner.task_run_execution_context())
    self._task_run = run.root_task_run
    return self._task_run
def start(self, root_task_name, job_name=None):
    if self._run or self._active or try_get_databand_run():
        return

    airflow_context = try_get_airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    # 1. create a proper DatabandContext so we can create other objects
    dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

    if airflow_context:
        root_task_or_task_name = AirflowOperatorRuntimeTask.build_from_airflow_context(
            airflow_context
        )
        source = UpdateSource.airflow_tracking
        job_name = "{}.{}".format(airflow_context.dag_id, airflow_context.task_id)
    else:
        root_task_or_task_name = _build_inline_root_task(root_task_name)
        source = UpdateSource.dbnd

    # create the databand run: this will create a run with driver and root tasks
    self._run = self._enter_cm(
        new_databand_run(
            context=dc,
            task_or_task_name=root_task_or_task_name,
            job_name=job_name,
            existing_run=False,
            source=source,
            af_context=airflow_context,
        )
    )  # type: DatabandRun

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    self._run._init_without_run()
    self._start_taskrun(self._run.driver_task_run)
    self._start_taskrun(self._run.root_task_run)
    self._task_run = self._run.root_task_run
    return self._task_run
def dbnd_run_task(
    self,
    task_or_task_name,
    run_uid=None,
    scheduled_run_info=None,
    send_heartbeat=True,
):
    # type: (Optional[Task,str], Optional[UUID], ScheduledRunInfo, bool) -> DatabandRun
    with new_databand_run(
        context=self,
        task_or_task_name=task_or_task_name,
        run_uid=run_uid,
        scheduled_run_info=scheduled_run_info,
        send_heartbeat=send_heartbeat,
    ) as run:  # type: DatabandRun
        run.run_driver()
        return run
def start(self, root_task_name, job_name=None):
    if self._run:
        return
    if self._started or self._disabled:  # started or failed
        return

    try:
        if try_get_databand_run():
            return

        self._started = True

        # 1. create a proper DatabandContext so we can create other objects
        set_tracking_config_overide(use_dbnd_log=True)
        # create databand context
        dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

        root_task = _build_inline_root_task(root_task_name)

        # create databand run
        self._run = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                existing_run=False,
                job_name=job_name,
            )
        )  # type: DatabandRun

        self._run._init_without_run()

        if not self._atexit_registered:
            atexit.register(self.stop)
        sys.excepthook = self.stop_on_exception

        self._start_taskrun(self._run.driver_task_run)
        self._start_taskrun(self._run.root_task_run)
        self._task_run = self._run.root_task_run
        return self._task_run
    except Exception:
        _handle_inline_error("inline-start")
        self._disabled = True
        return
    finally:
        self._started = True
def start(self, root_task_name=None, airflow_context=None):
    if self._run or self._active or try_get_databand_run():
        return

    airflow_context = airflow_context or try_get_airflow_context()
    set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

    dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

    if airflow_context:
        root_task, job_name, source = build_run_time_airflow_task(airflow_context)
    else:
        root_task = _build_inline_root_task(root_task_name)
        job_name = None
        source = UpdateSource.dbnd

    self._run = self._enter_cm(
        new_databand_run(
            context=dc,
            task_or_task_name=root_task,
            job_name=job_name,
            existing_run=False,
            source=source,
            af_context=airflow_context,
            send_heartbeat=False,
        )
    )  # type: DatabandRun

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    self._run._init_without_run()
    self._start_taskrun(self._run.driver_task_run)
    self._start_taskrun(self._run.root_task_run)
    self._task_run = self._run.root_task_run
    return self._task_run
def dbnd_run_task(
    self,
    task_or_task_name,  # type: Union[Task, str]
    job_name=None,  # type: Optional[str]
    force_task_name=None,  # type: Optional[str]
    project=None,  # type: Optional[str]
    run_uid=None,  # type: Optional[UUID]
    existing_run=None,  # type: Optional[bool]
    scheduled_run_info=None,  # type: Optional[ScheduledRunInfo]
    send_heartbeat=True,  # type: bool
):
    # type: (...) -> DatabandRun
    """
    This is the main entry point to run a task in "dbnd orchestration" mode.
    Called from `dbnd run`, we create a new Run + RunExecutor and trigger the execution.

    :param task_or_task_name: task name to run, or an already built task object
    :param force_task_name:
    :param project: project name for the run
    :return: DatabandRun
    """
    job_name = get_task_name_safe(job_name or task_or_task_name)
    project_name = self._get_project_name(project, task_or_task_name)

    with new_databand_run(
        context=self,
        job_name=job_name,
        run_uid=run_uid,
        existing_run=existing_run,
        scheduled_run_info=scheduled_run_info,
        is_orchestration=True,
        project_name=project_name,
    ) as run:  # type: DatabandRun
        # this is the main entry point to run some task in "orchestration" mode
        run.run_executor = RunExecutor(
            run=run,
            root_task_or_task_name=task_or_task_name,
            force_task_name=force_task_name,
            send_heartbeat=send_heartbeat,
        )
        run.run_executor.run_execute()
        return run
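
# --- usage sketch (not from the source) -----------------------------------
# The extended signature above lets the caller override the reported job and
# project while still running an arbitrary task. Hypothetical example; the
# string task name is presumably resolved to a registered task by RunExecutor,
# and all argument values below are made up for illustration:
def _run_named_sketch(context):
    return context.dbnd_run_task(
        task_or_task_name="my_pipeline",  # assumed registered task name
        job_name="nightly_my_pipeline",  # reported job name, overrides the task name
        project="analytics",  # reported project for the run
    )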
def _enter_databand_run(self, scheduled_tasks=None):
    self.init_context()
    if scheduled_tasks and not self.root_dbnd_task:
        # This code is reached only when an orphaned task is executed
        self.root_dbnd_task = self.get_dbnd_task(first(scheduled_tasks.values()))

    self._databand_run = run = self._enter_cm(
        new_databand_run(
            context=self._databand_context, job_name=self.root_dbnd_task.task_name
        )
    )  # type: DatabandRun

    self._driver_task_run = run.build_and_set_driver_task_run(
        driver_task=Task(task_name=SystemTaskName.driver, task_is_system=True)
    )
    self._driver_task_run.task.descendants.add_child(self.root_dbnd_task.task_id)

    # assign root task
    run.root_task = self.root_dbnd_task

    # Start the tracking process
    for task in run.root_task.task_dag.subdag_tasks():
        run._build_and_add_task_run(task)

    # we only update states, not submitting data
    self._driver_task_run.set_task_run_state(state=TaskRunState.RUNNING, track=False)
    for task in self._missing_dep:
        task.current_task_run.set_task_run_state(
            state=TaskRunState.UPSTREAM_FAILED, track=False
        )
    run.tracker.init_run()

    self._enter_cm(self._driver_task_run.runner.task_run_execution_context())

    print_tasks_tree(run.root_task_run.task, run.task_runs)

    if not self.get_non_finished_sub_tasks():
        # we have no more tasks to run... probably it's a failure
        self.finish_run(TaskRunState.FAILED)
    return
# assumes `from contextlib import contextmanager`; the decorator is implied by the yield below
@contextmanager
def initialized_run(task_or_task_name):
    with new_databand_run(
        context=dbnd_context(), task_or_task_name=task_or_task_name
    ) as r:
        r._init_without_run()
        yield r
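
# --- usage sketch (not from the source) -----------------------------------
# Because initialized_run() only calls _init_without_run(), the run is fully
# built (task runs, tracker, signatures) but nothing is executed, which makes
# it convenient for inspection in tests. Hypothetical caller, with MyTask
# standing in for a user-defined dbnd task:
#
#   with initialized_run(MyTask()) as run:
#       for task_run in run.task_runs:
#           print(task_run.task.task_id)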
def execute(self, context):
    logger.debug("Running dbnd dbnd_task from airflow operator %s", self.task_id)

    dag = context["dag"]
    execution_date = context["execution_date"]
    dag_id = dag.dag_id
    run_uid = get_job_run_uid(dag_id=dag_id, execution_date=execution_date)

    # Airflow has updated all relevant fields in the Operator definition with XCom values;
    # now we can create a real dbnd_task with real references to dbnd_task
    new_kwargs = {}
    for p_name in self.dbnd_task_params_fields:
        # this is the real input value after the XCom update above
        new_kwargs[p_name] = getattr(self, p_name, None)
        if p_name in self.dbnd_xcom_inputs:
            new_kwargs[p_name] = target(new_kwargs[p_name])

    new_kwargs["_dbnd_disable_airflow_inplace"] = True
    dag_ctrl = self.get_dbnd_dag_ctrl()
    with DatabandContext.context(_context=dag_ctrl.dbnd_context) as dc:
        logger.debug("Running %s with kwargs=%s ", self.task_id, new_kwargs)
        dbnd_task = dc.task_instance_cache.get_task_by_id(self.dbnd_task_id)

        # rebuild the task with new values coming from xcom -> operator
        with dbnd_task.ctrl.task_context(phase=TaskContextPhase.BUILD):
            dbnd_task = dbnd_task.clone(
                output_params_to_clone=self.dbnd_overridden_output_params, **new_kwargs
            )

        logger.debug("Creating inplace databand run for driver dump")
        dag_task = Task(task_name=dag.dag_id, task_target_date=execution_date)
        dag_task.set_upstream(dbnd_task)

        # create databand run
        with new_databand_run(
            context=dc,
            task_or_task_name=dag_task,
            run_uid=run_uid,
            existing_run=False,
            job_name=dag.dag_id,
        ) as dr:  # type: DatabandRun
            dr._init_without_run()

            # dr.driver_task_run.set_task_run_state(state=TaskRunState.RUNNING)
            # "make dag run"
            # dr.root_task_run.set_task_run_state(state=TaskRunState.RUNNING)
            dbnd_task_run = dr.get_task_run_by_id(dbnd_task.task_id)

            needs_databand_run_save = dbnd_task._conf__require_run_dump_file
            if needs_databand_run_save:
                dr.save_run()

            logger.info(
                dbnd_task.ctrl.banner(
                    "Running task '%s'." % dbnd_task.task_name, color="cyan"
                )
            )
            # should be replaced with tr._execute call
            dbnd_task_run.runner.execute(airflow_context=context, handle_sigterm=False)

        logger.debug("Finished to run %s", self)
        result = {
            output_name: convert_to_safe_types(getattr(dbnd_task, output_name))
            for output_name in self.dbnd_xcom_outputs
        }
        return result
def start(self, root_task_name=None, project_name=None, airflow_context=None):
    if self._run or self._active or try_get_databand_run():
        return

    # we should probably use only the airflow context passed via parameter.
    # also, there are mocks that cover only get_dbnd_project_config().airflow_context
    airflow_context = airflow_context or get_dbnd_project_config().airflow_context()
    if airflow_context:
        _set_dbnd_config_from_airflow_connections()

    _set_tracking_config_overide(airflow_context=airflow_context)
    dc = self._enter_cm(
        new_dbnd_context(name="inplace_tracking")
    )  # type: DatabandContext

    if not root_task_name:
        # extract the name of the script we are running (in the Airflow scenario it will be just "airflow")
        root_task_name = sys.argv[0].split(os.path.sep)[-1]

    if airflow_context:
        root_task, job_name, source, run_uid = build_run_time_airflow_task(
            airflow_context, root_task_name
        )
        try_number = airflow_context.try_number
    else:
        root_task = _build_inline_root_task(root_task_name)
        job_name = root_task_name
        source = UpdateSource.generic_tracking
        run_uid = None
        try_number = 1

    tracking_source = (
        None  # TODO_CORE build tracking_source -> typeof TrackingSourceSchema
    )
    self._run = run = self._enter_cm(
        new_databand_run(
            context=dc,
            job_name=job_name,
            run_uid=run_uid,
            existing_run=run_uid is not None,
            source=source,
            af_context=airflow_context,
            tracking_source=tracking_source,
            project_name=project_name,
        )
    )  # type: DatabandRun
    self._run.root_task = root_task

    self.update_run_from_airflow_context(airflow_context)

    if not self._atexit_registered:
        _set_process_exit_handler(self.stop)
        self._atexit_registered = True
    sys.excepthook = self.stop_on_exception
    self._active = True

    # now we send data to DB
    root_task_run = run._build_and_add_task_run(
        root_task, task_af_id=root_task.task_name, try_number=try_number
    )
    root_task_run.is_root = True

    run.tracker.init_run()
    run.root_task_run.set_task_run_state(TaskRunState.RUNNING)

    should_capture_log = TrackingConfig.from_databand_context().capture_tracking_log
    self._enter_cm(
        run.root_task_run.runner.task_run_execution_context(
            capture_log=should_capture_log, handle_sigterm=False
        )
    )
    self._task_run = run.root_task_run
    return self._task_run
def __init__(self, af_context):
    # type: (AirflowTaskContext) -> None
    self.run_uid = get_job_run_uid(
        dag_id=af_context.root_dag_id, execution_date=af_context.execution_date
    )
    self.dag_id = af_context.dag_id
    # this is the real operator uid; we need to connect our "tracked" task to it,
    # so the moment the monitor is on -> we can sync
    af_runtime_op_task_id = af_context.task_id
    self.af_operator_sync__task_run_uid = get_task_run_uid(
        self.run_uid, af_context.dag_id, af_runtime_op_task_id
    )

    # 1. create a proper DatabandContext so we can create other objects
    set_tracking_config_overide(
        use_dbnd_log=override_airflow_log_system_for_tracking()
    )
    # create databand context
    with new_dbnd_context(name="airflow") as dc:  # type: DatabandContext
        # now create an "operator" task for the current task_id.
        # we can't actually run it, we don't even know when it's going to finish;
        # the current execution is inside the operator, this is the only thing we know

        # STATE AFTER INIT:
        # AirflowOperator__runtime -> DAG__runtime
        task_target_date = pendulum.parse(
            af_context.execution_date, tz=pytz.UTC
        ).date()

        # AIRFLOW OPERATOR RUNTIME
        af_runtime_op = AirflowOperatorRuntimeTask(
            task_family=task_name_for_runtime(af_runtime_op_task_id),
            dag_id=af_context.dag_id,
            execution_date=af_context.execution_date,
            task_target_date=task_target_date,
            task_version="%s:%s" % (af_runtime_op_task_id, af_context.execution_date),
        )

        # this is the real operator uid; we need to connect our "tracked" task to it,
        # so the moment the monitor is on -> we can sync
        af_db_op_task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, af_runtime_op_task_id
        )
        af_runtime_op.task_meta.extra_parents_task_run_uids.add(af_db_op_task_run_uid)
        af_runtime_op.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
            af_context.dag_id
        )
        self.af_operator_runtime__task = af_runtime_op

        # AIRFLOW DAG RUNTIME
        self.af_dag_runtime__task = AirflowDagRuntimeTask(
            task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
            dag_id=af_context.root_dag_id,  # <- ROOT DAG!
            execution_date=af_context.execution_date,
            task_target_date=task_target_date,
        )
        _add_child(self.af_dag_runtime__task, self.af_operator_runtime__task)

        # create the databand run: this will create a run with driver and root tasks.
        # we need the "root" task to be the same between different airflow task invocations;
        # since in dbnd we must have a single root task, we create a "dummy" task named after the dag_id
        with new_databand_run(
            context=dc,
            task_or_task_name=self.af_dag_runtime__task,
            run_uid=self.run_uid,
            existing_run=False,
            job_name=af_context.root_dag_id,
            send_heartbeat=False,  # we don't send heartbeat in tracking
            source=UpdateSource.airflow_tracking,
        ) as dr:
            self.dr = dr
            dr._init_without_run()
            self.airflow_operator__task_run = dr.get_task_run_by_id(
                af_runtime_op.task_id
            )