def update_run_from_airflow_context(self, airflow_context):
    """Copy DAG-level details from an Airflow context onto ``self._run``.

    Nothing happens when ``airflow_context`` is falsy, when its ``context``
    is falsy, or when that context holds no truthy ``"dag"`` entry.

    Otherwise:

    * the project name resolved from the DAG tags (when truthy) is stored in
      ``self._run.project_name``;
    * the DAG owner (when truthy) is stored in
      ``self._run.context.task_run_env.user``;
    * for a sub-DAG, ``self._run.root_run_info`` is pointed at the run of the
      root DAG and at the root-DAG task named by the last dot-separated
      segment of ``airflow_context.dag_id``.

    :param airflow_context: object exposing ``context``, ``is_subdag``,
        ``airflow_instance_uid``, ``root_dag_id``, ``dag_id`` and
        ``execution_date``
    """
    if not airflow_context:
        return
    task_context = airflow_context.context
    if not task_context:
        return

    dag = task_context.get("dag", None)
    if not dag:
        return

    # A DAG object without a ``tags`` attribute is treated as untagged.
    project_name = get_project_name_from_airflow_tags(getattr(dag, "tags", []))
    airflow_user = task_context["dag"].owner

    run = self._run
    if project_name:
        run.project_name = project_name
    if airflow_user:
        run.context.task_run_env.user = airflow_user

    if not airflow_context.is_subdag:
        return

    # Sub-DAG: link this run to the run of the root DAG.
    root_dag_id = airflow_context.root_dag_id
    root_run_uid = get_job_run_uid(
        airflow_instance_uid=airflow_context.airflow_instance_uid,
        dag_id=root_dag_id,
        execution_date=airflow_context.execution_date,
    )
    # The last segment of the dotted dag id is used as the task id in the root DAG.
    parent_task_id = airflow_context.dag_id.split(".")[-1]
    root_task_run_uid = get_task_run_uid(
        run_uid=root_run_uid, dag_id=root_dag_id, task_id=parent_task_id
    )
    run.root_run_info = RootRunInfo(
        root_run_uid=root_run_uid, root_task_run_uid=root_task_run_uid
    )
def dbnd_run_start_airflow_dag_task(dag_id, execution_date, task_id):
    """Start an in-place databand run for one Airflow task invocation.

    The run uid is derived from ``dag_id`` and ``execution_date``. The run is
    started with a placeholder root task named ``"DAG"``, and a dynamic task
    run named after ``task_id`` is then created and started inside it.

    :param dag_id: Airflow DAG id, also used as the job name of the run
    :param execution_date: Airflow execution date of the DAG run
    :param task_id: Airflow task id of the operator currently executing
    :return: the object returned by the in-place run manager's ``start``
    """
    run_uid = get_job_run_uid(dag_id=dag_id, execution_date=execution_date)

    # Creates the databand run with its driver and root tasks. dbnd needs a
    # single root task, and it has to be the same across the separate Airflow
    # task invocations, hence the placeholder "DAG" root.
    run_manager = get_dbnd_inplace_run_manager()
    databand_run = run_manager.start(
        root_task_name="DAG",
        run_uid=run_uid,
        job_name=dag_id,
        airflow_context=True,
    )
    # NOTE(review): the attribute accesses below assume ``start`` returned a
    # run object — confirm it cannot return None on this path.

    # Stand-in task for the current operator. It is never really executed by
    # dbnd: we are already running inside the operator and do not know when it
    # is going to finish.
    class InplaceAirflowOperatorTask(Task):
        _conf__task_family = task_id
        execution_date = databand_run.execution_date

    operator_task = InplaceAirflowOperatorTask(task_version="now", task_name=task_id)
    operator_task_run = databand_run.create_dynamic_task_run(
        operator_task,
        databand_run.local_engine,
        _uuid=get_task_run_uid(run_uid, task_id),
    )
    run_manager._start_taskrun(operator_task_run)
    return databand_run
def start(
    self,
    root_task_name,
    in_memory=True,
    run_uid=None,
    airflow_context=False,
    job_name=None,
):
    """Enter a databand context and run, then start the driver and root task runs.

    Returns ``None`` without doing anything when a databand context already
    exists.

    :param root_task_name: name used to build the inline root task
    :param in_memory: value written to the ``task_in_memory_outputs`` setting
    :param run_uid: optional uid for the run; when truthy, the root task run
        uid is derived from it and ``root_task_name``
    :param airflow_context: truthy when started from Airflow; the context is
        then named ``"airflow"`` and no ``atexit`` hook is registered
    :param job_name: job name passed to the new run
    :return: the created run, or ``None`` if a context already existed
    """
    if try_get_databand_context():
        return

    # NOTE(review): ``_atexit_registered`` is only read here, never set —
    # confirm it is maintained elsewhere in the class.
    if not (airflow_context or self._atexit_registered):
        atexit.register(self.stop)
        if is_airflow_enabled():
            from airflow.settings import dispose_orm

            atexit.unregister(dispose_orm)

    start_config = {
        # we don't want to "check" as script is task_version="now"
        "run": {"skip_completed": False},
        # do not save any outputs
        "task": {"task_in_memory_outputs": in_memory},
    }
    config.set_values(config_values=start_config, override=True, source="dbnd_start")

    if airflow_context:
        context_kwargs = {"name": "airflow"}
    else:
        context_kwargs = {}

    # create databand context
    dbnd_context = self._enter_cm(
        new_dbnd_context(**context_kwargs)
    )  # type: DatabandContext

    root_task = _build_inline_root_task(
        root_task_name, airflow_context=airflow_context
    )

    # create databand run
    databand_run = self._enter_cm(
        new_databand_run(
            context=dbnd_context,
            task_or_task_name=root_task,
            run_uid=run_uid,
            existing_run=False,
            job_name=job_name,
        )
    )  # type: DatabandRun

    root_task_run_uid = (
        get_task_run_uid(run_uid, root_task_name) if run_uid else None
    )
    databand_run._init_without_run(root_task_run_uid=root_task_run_uid)

    for task_run in (databand_run.driver_task_run, databand_run.root_task_run):
        self._start_taskrun(task_run)
    return databand_run
def get_task_run_uid_for_inline_script(tracking_env, script_name):
    # type: (Dict[str, str], str) -> (str, UUID, UUID)
    """
    Calculate the task id, task run uid and task run attempt uid for an
    inline script execution.

    :param tracking_env: dict with airflow context env; the keys read are
        ``AIRFLOW_CTX_TASK_ID``, ``DBND_ROOT_RUN_UID`` (by constant),
        ``AIRFLOW_CTX_DAG_ID`` and ``AIRFLOW_CTX_TRY_NUMBER``
    :param script_name: inline script name
    :return: tuple ``(task_id, task_run_uid, task_run_attempt_uid)``
    """
    airflow_task_id = tracking_env["AIRFLOW_CTX_TASK_ID"]
    inline_task_id = get_task_family_for_inline_script(airflow_task_id, script_name)

    # The root run uid is carried in the environment as a string.
    root_run_uid = uuid.UUID(tracking_env[DBND_ROOT_RUN_UID])
    airflow_dag_id = tracking_env["AIRFLOW_CTX_DAG_ID"]

    run_uid_for_task = get_task_run_uid(root_run_uid, airflow_dag_id, inline_task_id)
    attempt_uid_for_task = get_task_run_attempt_uid(
        root_run_uid,
        airflow_dag_id,
        inline_task_id,
        tracking_env["AIRFLOW_CTX_TRY_NUMBER"],
    )
    return inline_task_id, run_uid_for_task, attempt_uid_for_task
def generate_task_run_uid(self, run, task, task_af_id):
    """Return the task run uid for ``task_af_id`` prefixed by the current operator's id.

    The task id handed to ``get_task_run_uid`` is
    ``"<operator task_af_id>.<task_af_id>"``, where the operator part is read
    from the operator task run of the current Airflow tracking manager.

    :param run: run object providing ``run_uid``
    :param task: not used by this implementation
    :param task_af_id: task id to append to the operator's ``task_af_id``
    :return: result of ``get_task_run_uid`` for the combined id
    """
    operator_task_run = _CURRENT_AIRFLOW_TRACKING_MANAGER.airflow_operator__task_run
    operator_af_id = operator_task_run.task_af_id
    combined_af_id = "%s.%s" % (operator_af_id, task_af_id)
    return get_task_run_uid(run.run_uid, self.dag_id, combined_af_id)
def generate_task_run_uid(self, run, task, task_af_id):
    """Return the task run uid for ``task_af_id`` within this generator's DAG.

    :param run: run object providing ``run_uid``
    :param task: not used by this implementation
    :param task_af_id: task id passed through to ``get_task_run_uid``
    :return: result of ``get_task_run_uid(run.run_uid, self.dag_id, task_af_id)``
    """
    run_uid = run.run_uid
    dag_id = self.dag_id
    return get_task_run_uid(run_uid, dag_id, task_af_id)
def __init__(self, af_context):
    # type: (AirflowTaskContext) -> None
    """Create the databand context, runtime tasks and run for one Airflow task.

    State after init: an operator runtime task attached as a child of a DAG
    runtime task, both wrapped in a databand run whose uid is derived from
    the root DAG id and the execution date.

    Attributes set: ``run_uid``, ``dag_id``,
    ``af_operator_sync__task_run_uid``, ``af_operator_runtime__task``,
    ``af_dag_runtime__task``, ``dr`` and ``airflow_operator__task_run``.

    :param af_context: Airflow task context providing ``root_dag_id``,
        ``dag_id``, ``task_id`` and ``execution_date``
    """
    self.run_uid = get_job_run_uid(
        dag_id=af_context.root_dag_id, execution_date=af_context.execution_date
    )
    self.dag_id = af_context.dag_id

    # Uid of the real operator task run. Our "tracked" task has to connect to
    # it, so that once the monitor is on the two can be synced.
    operator_task_id = af_context.task_id
    self.af_operator_sync__task_run_uid = get_task_run_uid(
        self.run_uid, af_context.dag_id, operator_task_id
    )

    # 1. a proper DatabandContext is needed before other objects can be created
    set_tracking_config_overide(
        use_dbnd_log=override_airflow_log_system_for_tracking()
    )

    with new_dbnd_context(name="airflow") as dbnd_context:  # type: DatabandContext
        # The "operator" task for the current task id cannot actually be run
        # by us, and we do not know when it will finish: all we know is that
        # the current execution is inside the operator.
        target_date = pendulum.parse(
            af_context.execution_date, tz=pytz.UTC
        ).date()

        # AIRFLOW OPERATOR RUNTIME
        operator_task = AirflowOperatorRuntimeTask(
            task_family=task_name_for_runtime(operator_task_id),
            dag_id=af_context.dag_id,
            execution_date=af_context.execution_date,
            task_target_date=target_date,
            task_version="%s:%s" % (operator_task_id, af_context.execution_date),
        )

        # Register the real operator task run as an extra parent (same inputs
        # as ``af_operator_sync__task_run_uid`` above).
        operator_db_task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, operator_task_id
        )
        operator_task.task_meta.extra_parents_task_run_uids.add(
            operator_db_task_run_uid
        )
        operator_task.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
            af_context.dag_id
        )
        self.af_operator_runtime__task = operator_task

        # AIRFLOW DAG RUNTIME -- note that this one is keyed by the ROOT dag id
        dag_task = AirflowDagRuntimeTask(
            task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
            dag_id=af_context.root_dag_id,
            execution_date=af_context.execution_date,
            task_target_date=target_date,
        )
        self.af_dag_runtime__task = dag_task
        _add_child(dag_task, operator_task)

        # Creates the databand run with driver and root tasks. dbnd needs a
        # single root task that is the same across the different Airflow task
        # invocations, so the DAG runtime task plays that role.
        with new_databand_run(
            context=dbnd_context,
            task_or_task_name=dag_task,
            run_uid=self.run_uid,
            existing_run=False,
            job_name=af_context.root_dag_id,
            send_heartbeat=False,  # we don't send heartbeat in tracking
            source=UpdateSource.airflow_tracking,
        ) as databand_run:
            self.dr = databand_run
            databand_run._init_without_run()
            self.airflow_operator__task_run = databand_run.get_task_run_by_id(
                operator_task.task_id
            )
def execute(self, context):
    """Rebuild the dbnd task from operator fields, run it, and return its outputs.

    The task parameters are read back from this operator's attributes, with
    XCom-fed parameters wrapped by ``target``. The cached dbnd task is cloned
    with those values, placed under a new databand run whose root is a task
    named after the DAG, and then executed.

    :param context: Airflow execution context; ``"dag"`` and
        ``"execution_date"`` are read from it
    :return: dict mapping each name in ``self.dbnd_xcom_outputs`` to the
        corresponding task attribute passed through ``convert_to_safe_types``
    """
    logger.debug("Running dbnd dbnd_task from airflow operator %s", self.task_id)

    dag = context["dag"]
    execution_date = context["execution_date"]
    dag_id = dag.dag_id

    run_uid = get_job_run_uid(dag_id=dag_id, execution_date=execution_date)
    dag_task_run_uid = get_task_run_uid(run_uid, dag_id)
    # NOTE(review): computed but not read anywhere below.
    task_run_uid = get_task_run_uid(run_uid, self.task_id)

    # Airflow has already filled the operator fields with XCom values, so the
    # real parameter values for the dbnd task can be collected from ``self``.
    task_kwargs = {}
    for field_name in self.dbnd_task_params_fields:
        field_value = getattr(self, field_name, None)
        if field_name in self.dbnd_xcom_inputs:
            # this is the real input value coming from XCom
            field_value = target(field_value)
        task_kwargs[field_name] = field_value
    task_kwargs["_dbnd_disable_airflow_inplace"] = True

    dag_ctrl = self.get_dbnd_dag_ctrl()
    with DatabandContext.context(_context=dag_ctrl.dbnd_context) as dbnd_context:
        logger.debug("Running %s with kwargs=%s ", self.task_id, task_kwargs)
        cached_task = dbnd_context.task_instance_cache.get_task_by_id(
            self.dbnd_task_id
        )

        # rebuild task with new values coming from xcom->operator
        with cached_task.ctrl.task_context(phase=TaskContextPhase.BUILD):
            dbnd_task = cached_task.clone(**task_kwargs)

        logger.debug("Creating inplace databand run for driver dump")
        dag_task = Task(task_name=dag.dag_id, task_target_date=execution_date)
        dag_task.set_upstream(dbnd_task)

        # create databand run
        with new_databand_run(
            context=dbnd_context,
            task_or_task_name=dag_task,
            run_uid=run_uid,
            existing_run=False,
            job_name=dag.dag_id,
        ) as databand_run:  # type: DatabandRun
            databand_run._init_without_run(root_task_run_uid=dag_task_run_uid)

            dbnd_task_run = databand_run.get_task_run_by_id(dbnd_task.task_id)

            if dbnd_task._conf__require_run_dump_file:
                databand_run.save_run()

            banner = dbnd_task.ctrl.banner(
                "Running task '%s'." % dbnd_task.task_name, color="cyan"
            )
            logger.info(banner)

            # should be replaced with tr._execute call
            dbnd_task_run.runner.execute(
                airflow_context=context, handle_sigterm=False
            )

        logger.debug("Finished to run %s", self)

        xcom_result = {}
        for output_name in self.dbnd_xcom_outputs:
            xcom_result[output_name] = convert_to_safe_types(
                getattr(dbnd_task, output_name)
            )
    return xcom_result
def build_run_time_airflow_task(af_context, root_task_name):
    # type: (AirflowTaskContext, Optional[str]) -> Tuple[TrackingTask, str, UpdateSource, UUID]
    """Build the tracking task that represents the current Airflow task at run time.

    With an Airflow execution context available, the task is named after the
    Airflow task id and its parameters come from the operator; for a
    ``PythonOperator`` the callable's name and source are recorded as well.
    Without one, the task is named via ``get_task_family_for_inline_script``
    and its source is taken from the call stack.

    :param af_context: Airflow task context
    :param root_task_name: name used to derive the task family of an inline
        script
    :return: tuple ``(root_task, job_name, source, root_run_uid)``
    """
    dag_id = af_context.dag_id

    if af_context.context:
        # Execute entry point: the tracked task is named after the Airflow task.
        task_family = af_context.task_id
        operator = af_context.context["task_instance"].task

        # find the template fields of the operators
        user_params = get_flatten_operator_params(operator)

        if is_instance_by_class_name(operator, "PythonOperator"):
            python_callable = operator.python_callable
            user_params["function_name"] = python_callable.__name__
            source_code = TaskSourceCode.from_callable(python_callable)
        else:
            source_code = NO_SOURCE_CODE
    else:
        # Inline run-time task: it is named after the script which ran it.
        # If we ever get here, root_task_name will be just "airflow", since
        # that is what we pass.
        task_family = get_task_family_for_inline_script(
            af_context.task_id, root_task_name
        )
        source_code = TaskSourceCode.from_callstack()
        user_params = {}

    user_params.update(
        dag_id=dag_id,
        execution_date=af_context.execution_date,
        task_version="%s:%s" % (af_context.task_id, af_context.execution_date),
    )

    # just a placeholder name
    task_passport = TaskPassport.from_module(task_family)

    # The definition uid covers both the task source and the module source.
    task_source_md5 = source_md5(source_code.task_source_code)
    module_source_md5 = source_md5(source_code.task_module_code)
    task_definition_uid = get_task_def_uid(
        dag_id, task_family, "{}{}".format(task_source_md5, module_source_md5)
    )

    tracking_task = TrackingTask.for_user_params(
        task_definition_uid=task_definition_uid,
        task_name=task_family,
        task_passport=task_passport,
        source_code=source_code,
        user_params=user_params,
    )  # type: TrackingTask

    task_repr = tracking_task.ctrl.task_repr
    task_repr.task_functional_call = ""
    task_repr.task_command_line = generate_airflow_cmd(
        dag_id=dag_id,
        task_id=af_context.task_id,
        execution_date=af_context.execution_date,
        is_root_task=False,
    )

    root_run_uid = get_job_run_uid(
        airflow_instance_uid=af_context.airflow_instance_uid,
        dag_id=dag_id,
        execution_date=af_context.execution_date,
    )
    tracking_task.ctrl.force_task_run_uid = get_task_run_uid(
        run_uid=root_run_uid, dag_id=dag_id, task_id=task_family
    )

    return tracking_task, dag_id, UpdateSource.airflow_tracking, root_run_uid