def _sig(task):
    """Log a signature banner for ``task`` and its children, then return its signature.

    The banners are emitted inside a new databand run whose job name is
    ``task.task_name``.

    :param task: task object providing ``task_name``, ``descendants``,
        ``ctrl`` and ``task_signature``
    :return: ``task.task_signature``
    """
    run_cm = new_databand_run(
        context=get_databand_context(), job_name=task.task_name
    )
    with run_cm:
        # the task itself is reported first, then whatever get_children() returns
        tasks_to_report = [task]
        tasks_to_report.extend(task.descendants.get_children())
        for current in tasks_to_report:
            banner_title = "signature %s" % current.task_name
            logger.info(current.ctrl.banner(banner_title))

    return task.task_signature
    def _enter_databand_run(self, scheduled_tasks=None):
        """Enter a databand run for the root dbnd task and initialise its state.

        :param scheduled_tasks: optional mapping; when it is non-empty and no
            root dbnd task is set yet, its first value is resolved through
            ``get_dbnd_task`` and stored as the root task
        """
        self.init_context()

        if scheduled_tasks:
            if not self.root_dbnd_task:
                # This code is reached only when an orphaned task is executed
                orphan = first(scheduled_tasks.values())
                self._root_dbnd_task = self.get_dbnd_task(orphan)

        run_cm = new_databand_run(
            context=self._databand_context,
            task_or_task_name=self.root_dbnd_task,
        )
        databand_run = self._enter_cm(run_cm)
        self._databand_run = databand_run

        # Start the tracking process
        databand_run._init_without_run()
        driver_task_run = databand_run.driver_task_run
        self._driver_task_run = driver_task_run
        print_tasks_tree(databand_run.root_task_run.task, databand_run.task_runs)

        self._enter_cm(driver_task_run.runner.task_run_execution_context())

        # tasks with missing dependencies are marked as failed upstream
        for blocked_task in self._missing_dep:
            blocked_task.current_task_run.set_task_run_state(
                state=TaskRunState.UPSTREAM_FAILED
            )

        if self.get_non_finished_sub_tasks():
            driver_task_run.set_task_run_state(state=TaskRunState.RUNNING)
        else:
            # nothing is left to run, so the run is finished as FAILED
            self.finish_run(TaskRunState.FAILED)
Exemple #3
0
    def dbnd_run_task(
        self,
        task_or_task_name,
        run_uid=None,
        scheduled_run_info=None,
        send_heartbeat=True,
    ):  # type: (Optional[Task,str], Optional[UUID], ScheduledRunInfo, bool) -> DatabandRun
        """
        Main entry point for running a task in "dbnd orchestration" mode
        (called from `dbnd run`).

        A new Run is created with this object as its context, a RunExecutor is
        attached to it and the execution is triggered.

        :param task_or_task_name task name to run or already built task object
        :param run_uid passed through to the new run
        :param scheduled_run_info passed through to the new run
        :param send_heartbeat passed through to the RunExecutor
        :return DatabandRun
        """
        run_cm = new_databand_run(
            context=self,
            job_name=get_task_name_safe(task_or_task_name),
            run_uid=run_uid,
            scheduled_run_info=scheduled_run_info,
            is_orchestration=True,
        )
        with run_cm as databand_run:  # type: DatabandRun
            # attach the executor to the run, then let it drive the execution
            databand_run.run_executor = RunExecutor(
                run=databand_run,
                root_task_or_task_name=task_or_task_name,
                send_heartbeat=send_heartbeat,
            )
            databand_run.run_executor.run_execute()
            return databand_run
Exemple #4
0
def _sig(task):
    """Log a signature banner for ``task`` and its children, then return its signature.

    The banners are emitted inside a new databand run created for ``task``.
    Every banner is titled with the name of the task it describes.

    :param task: task object providing ``task_name``, ``task_meta``, ``ctrl``
        and ``task_signature``
    :return: ``task.task_signature``
    """
    with new_databand_run(context=get_databand_context(),
                          task_or_task_name=task):
        for current in [task] + list(task.task_meta.get_children()):
            # build the title per task; previously every child banner reused
            # the parent's name
            name = "signature %s" % current.task_name
            logger.info(current.ctrl.banner(name))

    return task.task_signature
    def start(
        self,
        root_task_name,
        in_memory=True,
        run_uid=None,
        airflow_context=False,
        job_name=None,
    ):
        """Start an inline databand run unless a databand context already exists.

        :param root_task_name: name used to build the inline root task
        :param in_memory: value written to the ``task_in_memory_outputs`` config
        :param run_uid: optional run uid; when given it is also used to derive
            the root task run uid
        :param airflow_context: when truthy the context is named "airflow" and
            no atexit handler is registered
        :param job_name: optional job name passed to the new run
        :return: the new DatabandRun, or None when a databand context already exists
        """
        if try_get_databand_context():
            return

        if not airflow_context and not self._atexit_registered:
            atexit.register(self.stop)
            # remember the registration so that a later start() does not
            # register self.stop a second time
            self._atexit_registered = True
            if is_airflow_enabled():
                from airflow.settings import dispose_orm

                atexit.unregister(dispose_orm)
        c = {
            "run": {
                "skip_completed": False
            },  # we don't want to "check" as script is task_version="now"
            "task": {
                "task_in_memory_outputs": in_memory
            },  # do not save any outputs
        }
        config.set_values(config_values=c, override=True, source="dbnd_start")
        context_kwargs = {"name": "airflow"} if airflow_context else {}
        # create databand context
        dc = self._enter_cm(
            new_dbnd_context(**context_kwargs))  # type: DatabandContext

        root_task = _build_inline_root_task(root_task_name,
                                            airflow_context=airflow_context)
        # create databand run
        dr = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                run_uid=run_uid,
                existing_run=False,
                job_name=job_name,
            ))  # type: DatabandRun

        # the root task run uid can only be derived when a run uid was supplied
        if run_uid:
            root_task_run_uid = get_task_run_uid(run_uid, root_task_name)
        else:
            root_task_run_uid = None
        dr._init_without_run(root_task_run_uid=root_task_run_uid)

        self._start_taskrun(dr.driver_task_run)
        self._start_taskrun(dr.root_task_run)
        return dr
    def start(self, root_task_name=None, airflow_context=None):
        """Start in-place tracking and return the root task run.

        Returns None without doing anything when this object already holds a
        run, is already active, or a databand run already exists.

        :param root_task_name: name for the inline root task (used when there
            is no Airflow context)
        :param airflow_context: optional Airflow context; when falsy, the one
            from the dbnd project config is used
        :return: the root task run of the new databand run
        """
        already_tracking = self._run or self._active or try_get_databand_run()
        if already_tracking:
            return

        # we probably should use only airflow context via parameter.
        # also, there are mocks that cover only get_dbnd_project_config().airflow_context
        if not airflow_context:
            airflow_context = get_dbnd_project_config().airflow_context()
        set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

        databand_context = self._enter_cm(
            new_dbnd_context(name="inplace_tracking")
        )  # type: DatabandContext

        if not airflow_context:
            root_task = _build_inline_root_task(root_task_name)
            job_name = root_task.task_name
            update_source = UpdateSource.dbnd
        else:
            root_task, job_name, update_source = build_run_time_airflow_task(
                airflow_context
            )

        run_cm = new_databand_run(
            context=databand_context,
            job_name=job_name,
            existing_run=False,
            source=update_source,
            af_context=airflow_context,
        )
        databand_run = self._enter_cm(run_cm)  # type: DatabandRun
        self._run = databand_run
        self._run.root_task = root_task

        # the exit handler is installed once per object
        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        new_root_task_run = databand_run._build_and_add_task_run(root_task)
        new_root_task_run.is_root = True

        # No need to track the state because we track in init_run
        databand_run.root_task_run.set_task_run_state(
            TaskRunState.RUNNING, track=False
        )
        databand_run.tracker.init_run()

        execution_cm = databand_run.root_task_run.runner.task_run_execution_context()
        self._enter_cm(execution_cm)
        self._task_run = databand_run.root_task_run

        return self._task_run
    def start(self, root_task_name, job_name=None):
        """Start in-place tracking and return the root task run.

        Returns None without doing anything when this object already holds a
        run, is already active, or a databand run already exists.

        :param root_task_name: name for the inline root task (used when no
            Airflow context is found)
        :param job_name: job name for the run; replaced by "<dag_id>.<task_id>"
            when an Airflow context is found
        :return: the root task run of the new databand run
        """
        already_tracking = self._run or self._active or try_get_databand_run()
        if already_tracking:
            return

        af_context = try_get_airflow_context()
        set_tracking_config_overide(use_dbnd_log=True, airflow_context=af_context)

        # 1. create proper DatabandContext so we can create other objects
        databand_context = self._enter_cm(new_dbnd_context())  # type: DatabandContext

        if not af_context:
            root = _build_inline_root_task(root_task_name)
            update_source = UpdateSource.dbnd
        else:
            root = AirflowOperatorRuntimeTask.build_from_airflow_context(af_context)
            update_source = UpdateSource.airflow_tracking
            job_name = "{}.{}".format(af_context.dag_id, af_context.task_id)

        # 2. create databand run
        # this will create databand run with driver and root tasks.
        run_cm = new_databand_run(
            context=databand_context,
            task_or_task_name=root,
            job_name=job_name,
            existing_run=False,
            source=update_source,
            af_context=af_context,
        )
        self._run = self._enter_cm(run_cm)  # type: DatabandRun

        # the exit handler is installed once per object
        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        self._run._init_without_run()

        self._start_taskrun(self._run.driver_task_run)
        self._start_taskrun(self._run.root_task_run)
        self._task_run = self._run.root_task_run
        return self._task_run
 def dbnd_run_task(
     self,
     task_or_task_name,
     run_uid=None,
     scheduled_run_info=None,
     send_heartbeat=True,
 ):
     # type: (Optional[Task,str], Optional[UUID], ScheduledRunInfo, bool) -> DatabandRun
     """Create a databand run for the task, run its driver and return the run.

     :param task_or_task_name: task name to run or an already built task object
     :param run_uid: passed through to the new run
     :param scheduled_run_info: passed through to the new run
     :param send_heartbeat: passed through to the new run
     :return: the DatabandRun that was created
     """
     run_cm = new_databand_run(
         context=self,
         task_or_task_name=task_or_task_name,
         run_uid=run_uid,
         scheduled_run_info=scheduled_run_info,
         send_heartbeat=send_heartbeat,
     )
     with run_cm as databand_run:  # type: DatabandRun
         databand_run.run_driver()
         return databand_run
Exemple #9
0
    def start(self, root_task_name, job_name=None):
        """Start inline tracking and return the root task run.

        Returns None when a run is already held, when a start was already
        attempted, when tracking is disabled, or when a databand run already
        exists. An exception raised while starting is passed to
        ``_handle_inline_error`` and tracking is marked as disabled instead of
        propagating the error.

        :param root_task_name: name used to build the inline root task
        :param job_name: optional job name passed to the new run
        :return: the root task run, or None (see above)
        """
        if self._run:
            return
        if self._started or self._disabled:  # started or failed
            return

        try:
            if try_get_databand_run():
                return

            self._started = True

            # 1. create proper DatabandContext so we can create other objects
            set_tracking_config_overide(use_dbnd_log=True)
            # create databand context
            dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

            root_task = _build_inline_root_task(root_task_name)

            # create databand run
            self._run = self._enter_cm(
                new_databand_run(
                    context=dc,
                    task_or_task_name=root_task,
                    existing_run=False,
                    job_name=job_name,
                ))  # type: DatabandRun

            self._run._init_without_run()

            if not self._atexit_registered:
                atexit.register(self.stop)
                # remember the registration so that self.stop is never
                # registered with atexit more than once
                self._atexit_registered = True
            sys.excepthook = self.stop_on_exception

            self._start_taskrun(self._run.driver_task_run)
            self._start_taskrun(self._run.root_task_run)
            self._task_run = self._run.root_task_run
            return self._task_run
        except Exception:
            # tracking is best effort: report the error and disable further starts
            _handle_inline_error("inline-start")
            self._disabled = True
            return
        finally:
            # whatever happened, do not attempt to start again
            self._started = True
Exemple #10
0
    def start(self, root_task_name=None, airflow_context=None):
        """Start in-place tracking and return the root task run.

        Returns None without doing anything when this object already holds a
        run, is already active, or a databand run already exists.

        :param root_task_name: name for the inline root task (used when there
            is no Airflow context)
        :param airflow_context: optional Airflow context; when falsy,
            ``try_get_airflow_context()`` is consulted
        :return: the root task run of the new databand run
        """
        already_tracking = self._run or self._active or try_get_databand_run()
        if already_tracking:
            return

        if not airflow_context:
            airflow_context = try_get_airflow_context()
        set_tracking_config_overide(
            use_dbnd_log=True, airflow_context=airflow_context
        )

        databand_context = self._enter_cm(new_dbnd_context())  # type: DatabandContext

        if not airflow_context:
            root_task = _build_inline_root_task(root_task_name)
            job_name = None
            update_source = UpdateSource.dbnd
        else:
            root_task, job_name, update_source = build_run_time_airflow_task(
                airflow_context
            )

        run_cm = new_databand_run(
            context=databand_context,
            task_or_task_name=root_task,
            job_name=job_name,
            existing_run=False,
            source=update_source,
            af_context=airflow_context,
            send_heartbeat=False,
        )
        self._run = self._enter_cm(run_cm)  # type: DatabandRun

        # the exit handler is installed once per object
        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        self._run._init_without_run()
        self._start_taskrun(self._run.driver_task_run)
        self._start_taskrun(self._run.root_task_run)
        self._task_run = self._run.root_task_run

        return self._task_run
Exemple #11
0
    def dbnd_run_task(
            self,
            task_or_task_name,  # type: Union[Task, str]
            job_name=None,  # type: Optional[str]
            force_task_name=None,  # type: Optional[str]
            project=None,  # type: Optional[str]
            run_uid=None,  # type: Optional[UUID]
            existing_run=None,  # type: Optional[bool]
            scheduled_run_info=None,  # type: Optional[ScheduledRunInfo]
            send_heartbeat=True,  # type: bool
    ):  # type: (...) -> DatabandRun
        """
        Main entry point for running a task in "dbnd orchestration" mode
        (called from `dbnd run`).

        A new Run is created with this object as its context, a RunExecutor is
        attached to it and the execution is triggered.

        :param task_or_task_name task name to run or already built task object
        :param job_name job name for the run; falls back to task_or_task_name
        :param force_task_name passed through to the RunExecutor
        :param project Project name for the run
        :param run_uid passed through to the new run
        :param existing_run passed through to the new run
        :param scheduled_run_info passed through to the new run
        :param send_heartbeat passed through to the RunExecutor
        :return DatabandRun
        """
        resolved_job_name = get_task_name_safe(job_name or task_or_task_name)
        resolved_project_name = self._get_project_name(project, task_or_task_name)

        run_cm = new_databand_run(
            context=self,
            job_name=resolved_job_name,
            run_uid=run_uid,
            existing_run=existing_run,
            scheduled_run_info=scheduled_run_info,
            is_orchestration=True,
            project_name=resolved_project_name,
        )
        with run_cm as databand_run:  # type: DatabandRun
            # attach the executor to the run, then let it drive the execution
            databand_run.run_executor = RunExecutor(
                run=databand_run,
                root_task_or_task_name=task_or_task_name,
                force_task_name=force_task_name,
                send_heartbeat=send_heartbeat,
            )
            databand_run.run_executor.run_execute()
            return databand_run
Exemple #12
0
    def _enter_databand_run(self, scheduled_tasks=None):
        """Enter a databand run for the root dbnd task and register its task runs.

        :param scheduled_tasks: optional mapping; when it is non-empty and no
            root dbnd task is set yet, its first value is resolved through
            ``get_dbnd_task`` and stored as the root task
        """
        self.init_context()

        if scheduled_tasks:
            if not self.root_dbnd_task:
                # This code is reached only when an orphaned task is executed
                orphan = first(scheduled_tasks.values())
                self.root_dbnd_task = self.get_dbnd_task(orphan)

        run_cm = new_databand_run(
            context=self._databand_context, job_name=self.root_dbnd_task.task_name
        )
        databand_run = self._enter_cm(run_cm)  # type: DatabandRun
        self._databand_run = databand_run

        # the driver is a system task that gets the root task as its child
        driver_task = Task(task_name=SystemTaskName.driver, task_is_system=True)
        self._driver_task_run = databand_run.build_and_set_driver_task_run(
            driver_task=driver_task
        )
        self._driver_task_run.task.descendants.add_child(self.root_dbnd_task.task_id)

        # assign root task
        databand_run.root_task = self.root_dbnd_task
        # Start the tracking process
        for dag_task in databand_run.root_task.task_dag.subdag_tasks():
            databand_run._build_and_add_task_run(dag_task)

        # we only update states not submitting data
        self._driver_task_run.set_task_run_state(
            state=TaskRunState.RUNNING, track=False
        )
        for blocked_task in self._missing_dep:
            blocked_task.current_task_run.set_task_run_state(
                state=TaskRunState.UPSTREAM_FAILED, track=False
            )
        databand_run.tracker.init_run()

        self._enter_cm(self._driver_task_run.runner.task_run_execution_context())
        print_tasks_tree(databand_run.root_task_run.task, databand_run.task_runs)

        if not self.get_non_finished_sub_tasks():
            # we have no more tasks to run.. probably it's a failure
            self.finish_run(TaskRunState.FAILED)
Exemple #13
0
def initialized_run(task_or_task_name):
    """Yield a databand run for ``task_or_task_name`` after ``_init_without_run()``.

    The run is created in the context returned by ``dbnd_context()`` and is
    yielded from inside the ``with`` block.

    :param task_or_task_name: task object or task name passed to the new run
    """
    run_cm = new_databand_run(
        context=dbnd_context(), task_or_task_name=task_or_task_name
    )
    with run_cm as databand_run:
        databand_run._init_without_run()
        yield databand_run
    def execute(self, context):
        """Run the wrapped dbnd task from inside this Airflow operator.

        The dbnd task is re-built from the operator's current field values,
        executed inside an in-place databand run keyed by the DAG id and
        execution date, and its outputs are returned as converted values.

        :param context: Airflow context; the "dag" and "execution_date" keys
            are read here and the whole mapping is passed to the task runner
        :return: dict mapping each name in ``self.dbnd_xcom_outputs`` to the
            ``convert_to_safe_types`` result of that task attribute
        """
        logger.debug("Running dbnd dbnd_task from airflow operator %s", self.task_id)

        dag = context["dag"]
        execution_date = context["execution_date"]
        dag_id = dag.dag_id
        run_uid = get_job_run_uid(dag_id=dag_id, execution_date=execution_date)

        # Airflow has updated all relevant fields in Operator definition with XCom values
        # now we can create a real dbnd dbnd_task with real references to dbnd_task
        new_kwargs = {}
        for p_name in self.dbnd_task_params_fields:
            new_kwargs[p_name] = getattr(self, p_name, None)
            # this is the real input value after
            # values that arrived through xcom are wrapped with target()
            if p_name in self.dbnd_xcom_inputs:
                new_kwargs[p_name] = target(new_kwargs[p_name])

        new_kwargs["_dbnd_disable_airflow_inplace"] = True
        dag_ctrl = self.get_dbnd_dag_ctrl()
        with DatabandContext.context(_context=dag_ctrl.dbnd_context) as dc:
            logger.debug("Running %s with kwargs=%s ", self.task_id, new_kwargs)
            dbnd_task = dc.task_instance_cache.get_task_by_id(self.dbnd_task_id)
            # rebuild task with new values coming from xcom->operator
            with dbnd_task.ctrl.task_context(phase=TaskContextPhase.BUILD):
                dbnd_task = dbnd_task.clone(
                    output_params_to_clone=self.dbnd_overridden_output_params,
                    **new_kwargs
                )

            logger.debug("Creating inplace databand run for driver dump")
            # a task named after the DAG becomes the root of the run,
            # with the rebuilt dbnd task as its upstream
            dag_task = Task(task_name=dag.dag_id, task_target_date=execution_date)
            dag_task.set_upstream(dbnd_task)

            # create databand run
            with new_databand_run(
                context=dc,
                task_or_task_name=dag_task,
                run_uid=run_uid,
                existing_run=False,
                job_name=dag.dag_id,
            ) as dr:  # type: DatabandRun
                dr._init_without_run()

                # dr.driver_task_run.set_task_run_state(state=TaskRunState.RUNNING)
                # "make dag run"
                # dr.root_task_run.set_task_run_state(state=TaskRunState.RUNNING)
                dbnd_task_run = dr.get_task_run_by_id(dbnd_task.task_id)

                # the run is saved only when the task asks for a run dump file
                needs_databand_run_save = dbnd_task._conf__require_run_dump_file
                if needs_databand_run_save:
                    dr.save_run()

                logger.info(
                    dbnd_task.ctrl.banner(
                        "Running task '%s'." % dbnd_task.task_name, color="cyan"
                    )
                )
                # should be replaced with  tr._execute call
                dbnd_task_run.runner.execute(
                    airflow_context=context, handle_sigterm=False
                )

            logger.debug("Finished to run %s", self)
            # collect the declared xcom outputs from the executed task
            result = {
                output_name: convert_to_safe_types(getattr(dbnd_task, output_name))
                for output_name in self.dbnd_xcom_outputs
            }
        return result
Exemple #15
0
    def start(self, root_task_name=None, project_name=None, airflow_context=None):
        """Start in-place tracking and return the root task run.

        Returns None without doing anything when this object already holds a
        run, is already active, or a databand run already exists.

        :param root_task_name: name for the root task; defaults to the last
            path component of ``sys.argv[0]``
        :param project_name: project name passed to the new run
        :param airflow_context: optional Airflow context; when falsy, the one
            from the dbnd project config is used
        :return: the root task run of the new databand run
        """
        already_tracking = self._run or self._active or try_get_databand_run()
        if already_tracking:
            return

        # we probably should use only airflow context via parameter.
        # also, there are mocks that cover only get_dbnd_project_config().airflow_context
        if not airflow_context:
            airflow_context = get_dbnd_project_config().airflow_context()
        if airflow_context:
            _set_dbnd_config_from_airflow_connections()

        _set_tracking_config_overide(airflow_context=airflow_context)
        databand_context = self._enter_cm(
            new_dbnd_context(name="inplace_tracking")
        )  # type: DatabandContext

        if not root_task_name:
            # extract the name of the script we are running (in Airflow scenario it will be just "airflow")
            script_path = sys.argv[0]
            root_task_name = script_path.split(os.path.sep)[-1]

        if not airflow_context:
            root_task = _build_inline_root_task(root_task_name)
            job_name = root_task_name
            update_source = UpdateSource.generic_tracking
            run_uid = None
            try_number = 1
        else:
            root_task, job_name, update_source, run_uid = build_run_time_airflow_task(
                airflow_context, root_task_name
            )
            try_number = airflow_context.try_number

        run_cm = new_databand_run(
            context=databand_context,
            job_name=job_name,
            run_uid=run_uid,
            # a run uid is only known when it came from the Airflow context
            existing_run=run_uid is not None,
            source=update_source,
            af_context=airflow_context,
            # TODO_CORE build tracking_source -> typeof TrackingSourceSchema
            tracking_source=None,
            project_name=project_name,
        )
        databand_run = self._enter_cm(run_cm)  # type: DatabandRun
        self._run = databand_run

        self._run.root_task = root_task

        self.update_run_from_airflow_context(airflow_context)

        # the exit handler is installed once per object
        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        new_root_task_run = databand_run._build_and_add_task_run(
            root_task, task_af_id=root_task.task_name, try_number=try_number
        )
        new_root_task_run.is_root = True

        databand_run.tracker.init_run()
        databand_run.root_task_run.set_task_run_state(TaskRunState.RUNNING)

        capture_log = TrackingConfig.from_databand_context().capture_tracking_log
        execution_cm = databand_run.root_task_run.runner.task_run_execution_context(
            capture_log=capture_log, handle_sigterm=False
        )
        self._enter_cm(execution_cm)
        self._task_run = databand_run.root_task_run

        return self._task_run
    def __init__(self, af_context):
        # type: (AirflowTaskContext) -> None
        """Build the runtime operator/DAG tasks and a tracking run for an Airflow task.

        :param af_context: Airflow task context; ``root_dag_id``, ``dag_id``,
            ``task_id`` and ``execution_date`` are read from it
        """
        # the run uid is derived from the ROOT dag id and the execution date
        self.run_uid = get_job_run_uid(
            dag_id=af_context.root_dag_id,
            execution_date=af_context.execution_date)
        self.dag_id = af_context.dag_id
        # this is the real operator uid, we need to connect to it with our "tracked" task,
        # so the moment monitor is on -> we can sync
        af_runtime_op_task_id = af_context.task_id
        self.af_operator_sync__task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, af_runtime_op_task_id)
        # 1. create proper DatabandContext so we can create other objects
        set_tracking_config_overide(
            use_dbnd_log=override_airflow_log_system_for_tracking())

        # create databand context
        with new_dbnd_context(name="airflow") as dc:  # type: DatabandContext

            # now create "operator" task for current task_id,
            # we can't actually run it, we even don't know when it's going to finish
            # current execution is inside the operator, this is the only thing we know
            # STATE AFTER INIT:
            # AirflowOperator__runtime ->  DAG__runtime
            task_target_date = pendulum.parse(af_context.execution_date,
                                              tz=pytz.UTC).date()
            # AIRFLOW OPERATOR RUNTIME

            af_runtime_op = AirflowOperatorRuntimeTask(
                task_family=task_name_for_runtime(af_runtime_op_task_id),
                dag_id=af_context.dag_id,
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
                task_version="%s:%s" %
                (af_runtime_op_task_id, af_context.execution_date),
            )

            # this is the real operator uid, we need to connect to it with our "tracked" task,
            # so the moment monitor is on -> we can sync
            # NOTE(review): computed with the same arguments as
            # self.af_operator_sync__task_run_uid above
            af_db_op_task_run_uid = get_task_run_uid(self.run_uid,
                                                     af_context.dag_id,
                                                     af_runtime_op_task_id)
            af_runtime_op.task_meta.extra_parents_task_run_uids.add(
                af_db_op_task_run_uid)
            af_runtime_op.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
                af_context.dag_id)

            self.af_operator_runtime__task = af_runtime_op
            # AIRFLOW DAG RUNTIME
            self.af_dag_runtime__task = AirflowDagRuntimeTask(
                task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
                dag_id=af_context.root_dag_id,  # <- ROOT DAG!
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
            )
            # the operator runtime task is attached under the DAG runtime task
            _add_child(self.af_dag_runtime__task,
                       self.af_operator_runtime__task)

            # this will create databand run with driver and root tasks.
            # we need the "root" task to be the same between different airflow tasks invocations
            # since in dbnd we must have single root task, so we create "dummy" task with dag_id name

            # create databand run
            # we will want to preserve
            with new_databand_run(
                    context=dc,
                    task_or_task_name=self.af_dag_runtime__task,
                    run_uid=self.run_uid,
                    existing_run=False,
                    job_name=af_context.root_dag_id,
                    send_heartbeat=False,  # we don't send heartbeat in tracking
                    source=UpdateSource.airflow_tracking,
            ) as dr:
                # keep references to the run and to the operator's task run on self
                self.dr = dr
                dr._init_without_run()
                self.airflow_operator__task_run = dr.get_task_run_by_id(
                    af_runtime_op.task_id)