Example #1
0
    def _build_driver_task(self):
        """
        Build the system driver task of this run and register a TaskRun for it.

        Two modes are handled:
          * submitter - ``submit_driver`` is set and this is not an existing run:
            the task is ``SystemTaskName.driver_submit`` and both host and target
            engines are clones of the local engine
          * driver - any other case: the task is ``SystemTaskName.driver`` and it
            targets the remote engine

        :return: TaskRun created for the driver task (already added to this run)
        """
        if not self.submit_driver or self.existing_run:
            # driver mode
            task_name = SystemTaskName.driver
            is_driver = True
            is_submitter = not self.existing_run or self.resubmit_run
            task_executor_type = self.task_executor_type

            # submit_driver is true only when we are in an existing run here:
            # we are after the jump from submit to driver execution (to remote engine).
            # otherwise we are running at the local engine already
            engine_we_run_at = (
                self.remote_engine if self.submit_driver else self.local_engine
            )
            host_engine = engine_we_run_at.clone(require_submit=False)

            target_engine = self.remote_engine
            drop_submit_requirement = (
                not self.submit_tasks or task_executor_type == "airflow_kubernetes"
            )
            if drop_submit_requirement:
                target_engine = target_engine.clone(require_submit=False)
        else:
            # submitter mode
            logger.info("Submitting job to remote execution")
            task_name = SystemTaskName.driver_submit
            is_driver = False
            is_submitter = True
            host_engine = self.local_engine.clone(require_submit=False)
            target_engine = self.local_engine.clone(require_submit=False)
            task_executor_type = TaskExecutorType.local

        # host engine local root wins, environment value is the fallback
        local_root = host_engine.dbnd_local_root or self.env.dbnd_local_root
        run_folder = self.run_folder_prefix

        local_driver_root = local_root.folder(run_folder)
        local_driver_log = local_driver_root.partition("%s.log" % task_name)

        remote_driver_root = self.env.dbnd_root.folder(run_folder)
        driver_dump = remote_driver_root.file("%s.pickle" % task_name)

        driver_task = _DbndDriverTask(
            task_name=task_name,
            task_version=self.run_uid,
            execution_date=self.execution_date,
            is_submitter=is_submitter,
            is_driver=is_driver,
            host_engine=host_engine,
            target_engine=target_engine,
            task_executor_type=task_executor_type,
            local_driver_root=local_driver_root,
            local_driver_log=local_driver_log,
            remote_driver_root=remote_driver_root,
            driver_dump=driver_dump,
            sends_heartbeat=self.sends_heartbeat,
        )

        # the task run uses the engine as it is exposed by the task itself
        driver_task_run = TaskRun(
            task=driver_task,
            run=self,
            task_engine=driver_task.host_engine,
        )
        self._add_task_run(driver_task_run)
        return driver_task_run
Example #2
0
    def _build_and_add_task_run(self, task, task_engine=None, task_af_id=None):
        """
        Create a TaskRun for ``task`` and add it to this run.

        :param task: task to wrap
        :param task_engine: engine for the task run, ``self.local_engine`` is used if falsy
        :param task_af_id: explicit id, ``self.next_af_task_name(task)`` is used if None
        :return: the new TaskRun
        """
        af_id = task_af_id if task_af_id is not None else self.next_af_task_name(task)
        engine = task_engine or self.local_engine

        new_task_run = TaskRun(
            task=task, run=self, task_engine=engine, task_af_id=af_id
        )
        self._add_task_run(new_task_run)
        return new_task_run
Example #3
0
    def create_dynamic_task_run(self, task, task_engine, task_af_id=None):
        """
        Create a TaskRun marked as dynamic and pass it to ``add_task_runs_and_track``.

        :param task: task to wrap
        :param task_engine: engine for the task run
        :param task_af_id: explicit id, ``self.next_af_task_name(task)`` is used if None
        :return: the new TaskRun
        """
        af_id = task_af_id if task_af_id is not None else self.next_af_task_name(task)

        dynamic_task_run = TaskRun(
            task=task,
            run=self,
            is_dynamic=True,
            task_engine=task_engine,
            task_af_id=af_id,
        )
        self.add_task_runs_and_track([dynamic_task_run])
        return dynamic_task_run
Example #4
0
    def build_and_set_driver_task_run(self, driver_task, driver_engine=None):
        """
        Create the driver task run, the one used to "track" the main execution flow,
        store it as ``self._driver_task_run`` and add it to this run.

        Tracking: it tracks pipeline progress
        Orchestration: it tracks and "executes" the pipeline

        DatabandRun.init_run has to be called as well, otherwise nothing is tracked.

        :param driver_task: task to wrap
        :param driver_engine: engine for the task run, ``self.local_engine`` is used if falsy
        :return: the driver TaskRun
        """
        engine = driver_engine or self.local_engine
        driver_task_run = TaskRun(task=driver_task, run=self, task_engine=engine)

        self._driver_task_run = driver_task_run
        self._add_task_run(driver_task_run)
        return driver_task_run
Example #5
0
 def create_dynamic_task_run(self,
                             task,
                             task_engine,
                             _uuid=None,
                             task_af_id=None):
     """
     Create a TaskRun marked as dynamic and pass it to ``add_task_runs``.

     ``_uuid`` and ``task_af_id`` are forwarded to TaskRun as given,
     no default value is calculated for them here.

     :return: the new TaskRun
     """
     task_run_kwargs = dict(
         task=task,
         run=self,
         is_dynamic=True,
         task_engine=task_engine,
         _uuid=_uuid,
         task_af_id=task_af_id,
     )
     dynamic_task_run = TaskRun(**task_run_kwargs)
     self.add_task_runs([dynamic_task_run])
     return dynamic_task_run
Example #6
0
    def create_task_run_at_execution_time(self,
                                          task,
                                          task_engine,
                                          task_af_id=None):
        """
        Create a TaskRun marked as dynamic and pass it to ``add_task_runs_and_track``.

        The id is ``task_af_id`` (``self.next_af_task_name(task)`` if None);
        when ``self.af_context`` is set, the id is prefixed with
        ``af_context.task_id`` and "_".

        :return: the new TaskRun
        """
        af_id = task_af_id if task_af_id is not None else self.next_af_task_name(task)

        af_context = self.af_context
        if af_context:
            af_id = "_".join((af_context.task_id, af_id))

        dynamic_task_run = TaskRun(
            task=task,
            run=self,
            is_dynamic=True,
            task_engine=task_engine,
            task_af_id=af_id,
        )
        self.add_task_runs_and_track([dynamic_task_run])
        return dynamic_task_run
Example #7
0
    def create_dynamic_task_run(self, task, task_engine, task_af_id=None):
        """
        Create a TaskRun marked as dynamic and pass it to ``add_task_runs``.

        If ``task_af_id`` is None it is derived from ``task.friendly_task_name``
        with the help of the ``self.dynamic_af_tasks_count`` counters:
        a name that has no counter yet is used as is (counter is set to 1),
        otherwise the counter is incremented and "<name>_<counter>" is used.

        :return: the new TaskRun
        """
        if task_af_id is None:
            counters = self.dynamic_af_tasks_count
            name = task.friendly_task_name
            if name not in counters:
                # first time we see this name
                counters[name] = 1
                task_af_id = name
            else:
                counters[name] += 1
                task_af_id = "{}_{}".format(name, counters[name])

        dynamic_task_run = TaskRun(
            task=task,
            run=self,
            is_dynamic=True,
            task_engine=task_engine,
            task_af_id=task_af_id,
        )
        self.add_task_runs([dynamic_task_run])
        return dynamic_task_run
Example #8
0
    def build_task_runs(self,
                        run,
                        root_task,
                        remote_engine,
                        root_task_run_uid=None):
        # type: (DatabandRun, Task, EngineConfig, UUID) -> List[TaskRun]
        """
        Build a TaskRun for every task that is left in the graph of ``root_task``
        once the tasks reported as disabled are removed.

        Every task run gets its own engine, built from config inside the task
        BUILD context, and is flagged as reused / skipped / root where applicable.
        ``root_task_run_uid`` is passed as ``_uuid`` to the task run of the root task only.

        :return: list of the created task runs
        """
        run_config = run.context.settings.run  # type: RunConfig

        # tasks explicitly marked as disabled by the user are dropped first
        tasks_to_run, tasks_disabled = self.get_tasks_without_disabled(root_task)
        if tasks_disabled:
            logger.info(
                "Tasks were removed from the task graph as they are marked as not to run: %s",
                tasks_summary(tasks_disabled),
            )

        # when only a part of the graph is requested (by task name or by task id),
        # everything outside of the selected roots is marked as skipped
        roots = [root_task]
        tasks_skipped = set()
        if run_config.task or run_config.id:
            task_dag = root_task.ctrl.task_dag  # type: _TaskDagNode
            if run_config.task:
                roots = task_dag.select_by_task_names(
                    run_config.task, tasks=tasks_to_run
                )
            elif run_config.id:
                roots = task_dag.select_by_task_ids(
                    run_config.id, tasks=tasks_to_run
                )

            tasks_skipped = tasks_to_run.difference(all_subdags(roots))

        enabled_tasks = tasks_to_run.difference(tasks_skipped)

        if run_config.skip_completed:
            tasks_completed, tasks_not_required = find_tasks_to_skip_complete(
                roots, enabled_tasks
            )
        else:
            tasks_completed, tasks_not_required = set(), set()

        friendly_ids = calculate_friendly_task_ids(tasks_to_run)

        completed_ids = tasks_to_ids_set(tasks_completed)
        not_required_ids = tasks_to_ids_set(tasks_not_required)
        skipped_ids = tasks_to_ids_set(tasks_skipped)

        root_task_id = root_task.task_id

        task_runs = []
        for task in tasks_to_run:
            task_id = task.task_id
            is_root = task_id == root_task_id

            with task.ctrl.task_context(phase=TaskContextPhase.BUILD):
                # we want to have configuration with task overrides
                task_engine = build_task_from_config(
                    task_name=remote_engine.task_name
                )
                task_engine.require_submit = remote_engine.require_submit
                task_run = TaskRun(
                    run=run,
                    task=task,
                    task_af_id=friendly_ids[task_id],
                    task_engine=task_engine,
                    _uuid=root_task_run_uid if is_root else None,
                )

            if task_id in completed_ids:
                task_run.is_reused = True

            if task_id in not_required_ids:
                task_run.is_reused = True
                task_run.is_skipped_as_not_required = True

            if task_id in skipped_ids:
                task_run.is_skipped = True

            if is_root:
                task_run.is_root = True

            task_runs.append(task_run)

        return task_runs