Code Example #1
    def update_run_from_airflow_context(self, airflow_context):
        if not airflow_context or not airflow_context.context:
            return

        dag = airflow_context.context.get("dag", None)
        if not dag:
            return

        dag_tags = getattr(dag, "tags", [])
        project_name = get_project_name_from_airflow_tags(dag_tags)
        airflow_user = dag.owner

        if project_name:
            self._run.project_name = project_name

        if airflow_user:
            self._run.context.task_run_env.user = airflow_user

        if airflow_context.is_subdag:
            root_run_uid = get_job_run_uid(
                airflow_instance_uid=airflow_context.airflow_instance_uid,
                dag_id=airflow_context.root_dag_id,
                execution_date=airflow_context.execution_date,
            )
            self._run.root_run_info = RootRunInfo(
                root_run_uid=root_run_uid,
                root_task_run_uid=get_task_run_uid(
                    run_uid=root_run_uid,
                    dag_id=airflow_context.root_dag_id,
                    task_id=airflow_context.dag_id.split(".")[-1],
                ),
            )
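
All of these examples rely on get_job_run_uid and get_task_run_uid being deterministic: the tracked operator, the DAG-level tracker, and the monitor run in separate processes, and they must derive identical uids from the same dag_id, execution_date, and task_id. Note that the examples come from different dbnd revisions, so both the get_task_run_uid(run_uid, task_id) and get_task_run_uid(run_uid, dag_id, task_id) signatures appear below. A minimal sketch of the deterministic idea, assuming a uuid5-style derivation with a made-up namespace (this is not dbnd's actual implementation):

import uuid

# Hypothetical namespace constant; dbnd's real value differs.
NAMESPACE_DBND = uuid.uuid5(uuid.NAMESPACE_DNS, "dbnd.example")

def get_job_run_uid(dag_id, execution_date, airflow_instance_uid=None):
    # same inputs -> same run uid, in any process on any machine
    seed = "{}:{}:{}".format(airflow_instance_uid, dag_id, execution_date)
    return uuid.uuid5(NAMESPACE_DBND, seed)

def get_task_run_uid(run_uid, dag_id, task_id):
    # namespaced under the run uid, so task uids cannot collide across runs
    return uuid.uuid5(run_uid, "{}.{}".format(dag_id, task_id))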
Code Example #2
def dbnd_run_start_airflow_dag_task(dag_id, execution_date, task_id):
    run_uid = get_job_run_uid(dag_id=dag_id, execution_date=execution_date)
    # root_task_uid = get_task_run_uid(run_uid=run_uid, task_id="DAG")
    # task_uid = get_task_run_uid(run_uid=run_uid, task_id=task_id)

    # This will create a databand run with driver and root tasks.
    # The "root" task must be the same across different airflow task invocations,
    # since dbnd requires a single root task, so we create a "dummy" task named after the dag_id.

    inplace_run_manager = get_dbnd_inplace_run_manager()
    dr = inplace_run_manager.start(root_task_name="DAG",
                                   run_uid=run_uid,
                                   job_name=dag_id,
                                   airflow_context=True)

    # Now create an "operator" task for the current task_id.
    # We can't actually run it, and we don't even know when it will finish;
    # the current execution happens inside the operator, which is all we know.
    class InplaceAirflowOperatorTask(Task):
        _conf__task_family = task_id
        execution_date = dr.execution_date

    task = InplaceAirflowOperatorTask(task_version="now", task_name=task_id)
    tr = dr.create_dynamic_task_run(task,
                                    dr.local_engine,
                                    _uuid=get_task_run_uid(run_uid, task_id))
    inplace_run_manager._start_taskrun(tr)
    return dr
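
A hypothetical call site for the function above, from inside an Airflow operator's execute() (the context keys are standard Airflow; the surrounding operator class is assumed):

    def execute(self, context):
        # start (or attach to) the databand run for this DAG execution
        dr = dbnd_run_start_airflow_dag_task(
            dag_id=context["dag"].dag_id,
            execution_date=context["execution_date"],
            task_id=context["task_instance"].task_id,
        )
        # ... the operator body runs here, tracked under dr ...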
Code Example #3
File: inplace_run_manager.py    Project: turbaszek/dbnd
    def start(
        self,
        root_task_name,
        in_memory=True,
        run_uid=None,
        airflow_context=False,
        job_name=None,
    ):
        if try_get_databand_context():
            # already inside a databand context; nothing to start
            return

        if not airflow_context and not self._atexit_registered:
            atexit.register(self.stop)
            if is_airflow_enabled():
                from airflow.settings import dispose_orm

                # drop airflow's own dispose_orm atexit hook
                atexit.unregister(dispose_orm)
        c = {
            "run": {
                "skip_completed": False
            },  # don't check for completed runs: the script executes with task_version="now"
            "task": {
                "task_in_memory_outputs": in_memory
            },  # do not save any outputs
        }
        config.set_values(config_values=c, override=True, source="dbnd_start")
        context_kwargs = {"name": "airflow"} if airflow_context else {}
        # create databand context
        dc = self._enter_cm(
            new_dbnd_context(**context_kwargs))  # type: DatabandContext

        root_task = _build_inline_root_task(root_task_name,
                                            airflow_context=airflow_context)
        # create databand run
        dr = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                run_uid=run_uid,
                existing_run=False,
                job_name=job_name,
            ))  # type: DatabandRun

        if run_uid:
            root_task_run_uid = get_task_run_uid(run_uid, root_task_name)
        else:
            root_task_run_uid = None
        dr._init_without_run(root_task_run_uid=root_task_run_uid)

        self._start_taskrun(dr.driver_task_run)
        self._start_taskrun(dr.root_task_run)
        return dr
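
For completeness, a minimal sketch of calling start() directly from a standalone script, assuming the get_dbnd_inplace_run_manager() accessor from Example #2; the atexit hook registered above calls stop() when the process exits:

manager = get_dbnd_inplace_run_manager()
dr = manager.start(root_task_name="my_script", job_name="my_job")
# ... script body; outputs stay in memory because in_memory=True by default ...
# stop() runs automatically at process exit via the registered atexit hook.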
Code Example #4
def get_task_run_uid_for_inline_script(tracking_env, script_name):
    # type: (Dict[str, str], str) -> Tuple[str, UUID, UUID]
    """
    Calculate the task run uid and task run attempt uid for an inline script execution.
    :param tracking_env: dict with the airflow context env variables
    :param script_name: inline script name
    :return: tuple of (task_id, task_run_uid, task_run_attempt_uid)
    """
    task_id = get_task_family_for_inline_script(
        tracking_env["AIRFLOW_CTX_TASK_ID"], script_name)
    run_uid = uuid.UUID(tracking_env[DBND_ROOT_RUN_UID])
    dag_id = tracking_env["AIRFLOW_CTX_DAG_ID"]
    task_run_uid = get_task_run_uid(run_uid, dag_id, task_id)
    task_run_attempt_uid = get_task_run_attempt_uid(
        run_uid, dag_id, task_id, tracking_env["AIRFLOW_CTX_TRY_NUMBER"])
    return task_id, task_run_uid, task_run_attempt_uid
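
A hypothetical tracking_env illustrating the keys the function reads (DBND_ROOT_RUN_UID is an imported constant, assumed to be an env-var-style string key; all values are placeholders):

tracking_env = {
    "AIRFLOW_CTX_DAG_ID": "my_dag",
    "AIRFLOW_CTX_TASK_ID": "my_task",
    "AIRFLOW_CTX_TRY_NUMBER": "1",
    DBND_ROOT_RUN_UID: "00000000-0000-0000-0000-000000000000",
}
task_id, task_run_uid, attempt_uid = get_task_run_uid_for_inline_script(
    tracking_env, "train_model.py")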
Code Example #5
 def generate_task_run_uid(self, run, task, task_af_id):
     # prefix with the currently tracked operator's task_af_id to build a nested uid
     runtime_af = (_CURRENT_AIRFLOW_TRACKING_MANAGER.
                   airflow_operator__task_run.task_af_id)
     return get_task_run_uid(run.run_uid, self.dag_id,
                             "%s.%s" % (runtime_af, task_af_id))
Code Example #6
 def generate_task_run_uid(self, run, task, task_af_id):
     return get_task_run_uid(run.run_uid, self.dag_id, task_af_id)
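
Examples #5 and #6 are two variants of the same uid-generator hook; the flat variant in Example #6 is presumably the TaskRunUidGen_TaskAfId that Example #7 installs via force_task_run_uid. The difference is scoping: with the tracked operator's task_af_id being "my_op" and task_af_id being "sub_task", Example #5 derives the uid from the nested id "my_op.sub_task", while Example #6 uses "sub_task" as-is.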
Code Example #7
    def __init__(self, af_context):
        # type: (AirflowTaskContext) -> None
        self.run_uid = get_job_run_uid(
            dag_id=af_context.root_dag_id,
            execution_date=af_context.execution_date)
        self.dag_id = af_context.dag_id
        # this is the real operator's uid; we need to connect our "tracked" task to it,
        # so that the moment the monitor is on we can sync
        af_runtime_op_task_id = af_context.task_id
        self.af_operator_sync__task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, af_runtime_op_task_id)
        # 1. create proper DatabandContext so we can create other objects
        set_tracking_config_overide(
            use_dbnd_log=override_airflow_log_system_for_tracking())

        # create databand context
        with new_dbnd_context(name="airflow") as dc:  # type: DatabandContext

            # Now create an "operator" task for the current task_id.
            # We can't actually run it, and we don't even know when it will finish;
            # the current execution happens inside the operator, which is all we know.
            # STATE AFTER INIT:
            # AirflowOperator__runtime -> DAG__runtime
            task_target_date = pendulum.parse(af_context.execution_date,
                                              tz=pytz.UTC).date()
            # AIRFLOW OPERATOR RUNTIME

            af_runtime_op = AirflowOperatorRuntimeTask(
                task_family=task_name_for_runtime(af_runtime_op_task_id),
                dag_id=af_context.dag_id,
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
                task_version="%s:%s" %
                (af_runtime_op_task_id, af_context.execution_date),
            )

            # this is the real operator's uid; we need to connect our "tracked" task to it,
            # so that the moment the monitor is on we can sync
            af_db_op_task_run_uid = get_task_run_uid(self.run_uid,
                                                     af_context.dag_id,
                                                     af_runtime_op_task_id)
            af_runtime_op.task_meta.extra_parents_task_run_uids.add(
                af_db_op_task_run_uid)
            af_runtime_op.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
                af_context.dag_id)

            self.af_operator_runtime__task = af_runtime_op
            # AIRFLOW DAG RUNTIME
            self.af_dag_runtime__task = AirflowDagRuntimeTask(
                task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
                dag_id=af_context.root_dag_id,  # <- ROOT DAG!
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
            )
            _add_child(self.af_dag_runtime__task,
                       self.af_operator_runtime__task)

            # This will create a databand run with driver and root tasks.
            # The "root" task must be the same across different airflow task invocations,
            # since dbnd requires a single root task, so we create a "dummy" task named after the dag_id.

            # create databand run
            # we will want to preserve
            with new_databand_run(
                    context=dc,
                    task_or_task_name=self.af_dag_runtime__task,
                    run_uid=self.run_uid,
                    existing_run=False,
                    job_name=af_context.root_dag_id,
                    send_heartbeat=False,  # we don't send heartbeat in tracking
                    source=UpdateSource.airflow_tracking,
            ) as dr:
                self.dr = dr
                dr._init_without_run()
                self.airflow_operator__task_run = dr.get_task_run_by_id(
                    af_runtime_op.task_id)
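
After this constructor finishes, the run's task graph is the DAG-level AirflowDagRuntimeTask (named after the root dag) with the AirflowOperatorRuntimeTask as its child, and self.airflow_operator__task_run holds the operator's task run. This appears to be the handle that Example #5 reads through _CURRENT_AIRFLOW_TRACKING_MANAGER when namespacing nested task uids.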
Code Example #8
    def execute(self, context):
        logger.debug("Running dbnd task from airflow operator %s",
                     self.task_id)

        dag = context["dag"]
        execution_date = context["execution_date"]
        dag_id = dag.dag_id
        run_uid = get_job_run_uid(dag_id=dag_id, execution_date=execution_date)

        dag_task_run_uid = get_task_run_uid(run_uid, dag_id)
        task_run_uid = get_task_run_uid(run_uid, self.task_id)

        # Airflow has updated all relevant fields in the operator definition with XCom values;
        # now we can create a real dbnd task with real references to its inputs
        new_kwargs = {}
        for p_name in self.dbnd_task_params_fields:
            new_kwargs[p_name] = getattr(self, p_name, None)
            # this is the real input value after xcom resolution
            if p_name in self.dbnd_xcom_inputs:
                new_kwargs[p_name] = target(new_kwargs[p_name])

        new_kwargs["_dbnd_disable_airflow_inplace"] = True
        dag_ctrl = self.get_dbnd_dag_ctrl()
        with DatabandContext.context(_context=dag_ctrl.dbnd_context) as dc:
            logger.debug("Running %s with kwargs=%s ", self.task_id,
                         new_kwargs)
            dbnd_task = dc.task_instance_cache.get_task_by_id(
                self.dbnd_task_id)
            # rebuild task with new values coming from xcom->operator
            with dbnd_task.ctrl.task_context(phase=TaskContextPhase.BUILD):
                dbnd_task = dbnd_task.clone(**new_kwargs)

            logger.debug("Creating inplace databand run for driver dump")
            dag_task = Task(task_name=dag.dag_id,
                            task_target_date=execution_date)
            dag_task.set_upstream(dbnd_task)

            # create databand run
            with new_databand_run(
                    context=dc,
                    task_or_task_name=dag_task,
                    run_uid=run_uid,
                    existing_run=False,
                    job_name=dag.dag_id,
            ) as dr:  # type: DatabandRun
                dr._init_without_run(root_task_run_uid=dag_task_run_uid)

                # dr.driver_task_run.set_task_run_state(state=TaskRunState.RUNNING)
                # "make dag run"
                # dr.root_task_run.set_task_run_state(state=TaskRunState.RUNNING)
                dbnd_task_run = dr.get_task_run_by_id(dbnd_task.task_id)

                needs_databand_run_save = dbnd_task._conf__require_run_dump_file
                if needs_databand_run_save:
                    dr.save_run()

                logger.info(
                    dbnd_task.ctrl.banner("Running task '%s'." %
                                          dbnd_task.task_name,
                                          color="cyan"))
                # should be replaced with a tr._execute call
                dbnd_task_run.runner.execute(airflow_context=context,
                                             handle_sigterm=False)

        logger.debug("Finished to run %s", self)
        result = {
            output_name: convert_to_safe_types(getattr(dbnd_task, output_name))
            for output_name in self.dbnd_xcom_outputs
        }
        return result
Code Example #9
def build_run_time_airflow_task(af_context, root_task_name):
    # type: (AirflowTaskContext, Optional[str]) -> Tuple[TrackingTask, str, UpdateSource, UUID]
    if af_context.context:
        # we are in the execute entry point, so the task name is <task>__execute
        task_family = af_context.task_id

        airflow_operator = af_context.context["task_instance"].task

        # find the template fields of the operators
        user_params = get_flatten_operator_params(airflow_operator)

        source_code = NO_SOURCE_CODE
        if is_instance_by_class_name(airflow_operator, "PythonOperator"):
            tracked_function = airflow_operator.python_callable
            user_params["function_name"] = tracked_function.__name__
            source_code = TaskSourceCode.from_callable(tracked_function)
    else:
        # if this is an inline run-time task, we name it after the script that ran it.
        # If we ever get here, root_task_name will be just "airflow", since that is what we pass.
        task_family = get_task_family_for_inline_script(
            af_context.task_id, root_task_name)
        source_code = TaskSourceCode.from_callstack()
        user_params = {}

    user_params.update(
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
        task_version="%s:%s" % (af_context.task_id, af_context.execution_date),
    )

    # just a placeholder name
    task_passport = TaskPassport.from_module(task_family)
    task_definition_uid = get_task_def_uid(
        af_context.dag_id,
        task_family,
        "{}{}".format(
            source_md5(source_code.task_source_code),
            source_md5(source_code.task_module_code),
        ),
    )
    root_task = TrackingTask.for_user_params(
        task_definition_uid=task_definition_uid,
        task_name=task_family,
        task_passport=task_passport,
        source_code=source_code,
        user_params=user_params,
    )  # type: TrackingTask

    root_task.ctrl.task_repr.task_functional_call = ""
    root_task.ctrl.task_repr.task_command_line = generate_airflow_cmd(
        dag_id=af_context.dag_id,
        task_id=af_context.task_id,
        execution_date=af_context.execution_date,
        is_root_task=False,
    )

    root_run_uid = get_job_run_uid(
        airflow_instance_uid=af_context.airflow_instance_uid,
        dag_id=af_context.dag_id,
        execution_date=af_context.execution_date,
    )
    root_task.ctrl.force_task_run_uid = get_task_run_uid(
        run_uid=root_run_uid, dag_id=af_context.dag_id, task_id=task_family)

    job_name = af_context.dag_id
    source = UpdateSource.airflow_tracking
    return root_task, job_name, source, root_run_uid
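
A hypothetical call site, assuming an AirflowTaskContext populated from the standard AIRFLOW_CTX_* values (the constructor shown here is illustrative; its real signature is not part of this example):

af_context = AirflowTaskContext(
    dag_id="my_dag",
    task_id="my_task",
    execution_date="2021-01-01T00:00:00+00:00",
    airflow_instance_uid="<airflow-instance-uid>",  # placeholder
)
root_task, job_name, source, root_run_uid = build_run_time_airflow_task(
    af_context, root_task_name="airflow")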