def _build_driver_task(self):
    """Create and register the system driver/submitter task run for this run.

    Two modes, chosen from ``submit_driver`` / ``existing_run``:
      * submit mode (``submit_driver`` set and a brand-new run): build a
        ``driver_submit`` task that runs on the local engine and only submits
        the job to remote execution.
      * driver mode: build the actual ``driver`` task that executes the
        pipeline — on the remote engine if we already jumped there after a
        submit, otherwise on the local engine.

    Returns the newly created :class:`TaskRun` wrapping the driver task.
    """
    if self.submit_driver and not self.existing_run:
        logger.info("Submitting job to remote execution")
        task_name = SystemTaskName.driver_submit
        is_submitter = True
        is_driver = False
        # submission itself always runs locally and is never re-submitted
        host_engine = self.local_engine.clone(require_submit=False)
        target_engine = self.local_engine.clone(require_submit=False)
        task_executor_type = TaskExecutorType.local
    else:
        task_name = SystemTaskName.driver
        # a resubmitted or brand-new run still acts as its own submitter
        is_submitter = not self.existing_run or self.resubmit_run
        is_driver = True
        task_executor_type = self.task_executor_type
        if self.submit_driver:
            # submit drive is true, but we are in existing run:
            # we are after the jump from submit to driver execution (to remote engine)
            host_engine = self.remote_engine.clone(require_submit=False)
        else:
            host_engine = self.local_engine.clone(
                require_submit=False
            )  # we are running at this engine already
        target_engine = self.remote_engine
        if not self.submit_tasks or task_executor_type == "airflow_kubernetes":
            # tasks won't be submitted individually, so the target engine
            # must not require submission either
            target_engine = target_engine.clone(require_submit=False)

    # driver file layout: log stays on the host's local root, pickled
    # driver dump goes under the environment's (possibly remote) dbnd root
    dbnd_local_root = host_engine.dbnd_local_root or self.env.dbnd_local_root
    run_folder_prefix = self.run_folder_prefix
    local_driver_root = dbnd_local_root.folder(run_folder_prefix)
    local_driver_log = local_driver_root.partition("%s.log" % task_name)

    remote_driver_root = self.env.dbnd_root.folder(run_folder_prefix)
    driver_dump = remote_driver_root.file("%s.pickle" % task_name)

    driver_task = _DbndDriverTask(
        task_name=task_name,
        task_version=self.run_uid,
        execution_date=self.execution_date,
        is_submitter=is_submitter,
        is_driver=is_driver,
        host_engine=host_engine,
        target_engine=target_engine,
        task_executor_type=task_executor_type,
        local_driver_root=local_driver_root,
        local_driver_log=local_driver_log,
        remote_driver_root=remote_driver_root,
        driver_dump=driver_dump,
        sends_heartbeat=self.sends_heartbeat,
    )

    tr = TaskRun(task=driver_task, run=self, task_engine=driver_task.host_engine)
    self._add_task_run(tr)
    return tr
def _build_and_add_task_run(self, task, task_engine=None, task_af_id=None):
    """Build a TaskRun for ``task``, register it on this run and return it.

    Falls back to an auto-generated airflow task name and to the local
    engine when ``task_af_id`` / ``task_engine`` are not provided.
    """
    af_id = task_af_id if task_af_id is not None else self.next_af_task_name(task)
    engine = task_engine or self.local_engine
    task_run = TaskRun(task=task, run=self, task_engine=engine, task_af_id=af_id)
    self._add_task_run(task_run)
    return task_run
def create_dynamic_task_run(self, task, task_engine, task_af_id=None):
    """Create a dynamic TaskRun for ``task``, track it and return it."""
    # auto-generate the airflow task name when the caller didn't supply one
    af_id = self.next_af_task_name(task) if task_af_id is None else task_af_id
    task_run = TaskRun(
        task=task,
        run=self,
        is_dynamic=True,
        task_engine=task_engine,
        task_af_id=af_id,
    )
    self.add_task_runs_and_track([task_run])
    return task_run
def build_and_set_driver_task_run(self, driver_task, driver_engine=None):
    """
    set driver task run which is used to "track" main execution flow
    Tracking: will track pipeline progress
    Orchestration: will track and "execute" pipeline
    you need to run DatabandRun.init_run otherwise it will be not tracked
    """
    engine = driver_engine or self.local_engine
    driver_run = TaskRun(task=driver_task, run=self, task_engine=engine)
    self._driver_task_run = driver_run
    self._add_task_run(driver_run)
    return driver_run
def create_dynamic_task_run(self, task, task_engine, _uuid=None, task_af_id=None):
    """Create a dynamic TaskRun for ``task``, register it and return it.

    ``_uuid`` and ``task_af_id`` are forwarded to the TaskRun as-is
    (no auto-generation happens here).
    """
    run_kwargs = dict(
        task=task,
        run=self,
        is_dynamic=True,
        task_engine=task_engine,
        _uuid=_uuid,
        task_af_id=task_af_id,
    )
    task_run = TaskRun(**run_kwargs)
    self.add_task_runs([task_run])
    return task_run
def create_task_run_at_execution_time(self, task, task_engine, task_af_id=None):
    """Create and track a dynamic TaskRun spawned while the run is executing.

    When no explicit ``task_af_id`` is given, one is generated and — if an
    airflow context is attached to this run — prefixed with the surrounding
    airflow task id to keep it unique.
    """
    if task_af_id is None:
        task_af_id = self.next_af_task_name(task)
        if self.af_context:
            # NOTE(review): prefixing assumed to apply only to
            # auto-generated ids — confirm against original nesting
            task_af_id = "_".join([self.af_context.task_id, task_af_id])
    task_run = TaskRun(
        task=task,
        run=self,
        is_dynamic=True,
        task_engine=task_engine,
        task_af_id=task_af_id,
    )
    self.add_task_runs_and_track([task_run])
    return task_run
def create_dynamic_task_run(self, task, task_engine, task_af_id=None):
    """Create and register a dynamic TaskRun for ``task``.

    When ``task_af_id`` is not supplied, one is derived from the task's
    friendly name with a per-name occurrence counter: the first occurrence
    is just ``name``, later ones become ``name_2``, ``name_3``, ...
    """
    if task_af_id is None:
        name = task.friendly_task_name
        if name in self.dynamic_af_tasks_count:
            # repeated name: bump the counter and suffix it
            self.dynamic_af_tasks_count[name] += 1
            task_af_id = "{}_{}".format(name, self.dynamic_af_tasks_count[name])
        else:
            # first occurrence keeps the bare name
            self.dynamic_af_tasks_count[name] = 1
            task_af_id = name
    task_run = TaskRun(
        task=task,
        run=self,
        is_dynamic=True,
        task_engine=task_engine,
        task_af_id=task_af_id,
    )
    self.add_task_runs([task_run])
    return task_run
def build_task_runs(self, run, root_task, remote_engine, root_task_run_uid=None):
    # type: (DatabandRun, Task, EngineConfig, UUID) -> List[TaskRun]
    """Build TaskRun objects for every enabled task in the run's task graph.

    Applies the run configuration to the graph before building:
      * user-disabled tasks are dropped entirely;
      * when ``run_config.task`` / ``run_config.id`` select a sub-graph,
        tasks outside it are flagged ``is_skipped``;
      * when ``run_config.skip_completed`` is on, completed tasks are
        flagged ``is_reused`` (and ``is_skipped_as_not_required`` where
        applicable).

    The root task's run gets ``is_root`` and, if provided,
    ``root_task_run_uid`` as its uuid.
    """
    run_config = run.context.settings.run  # type: RunConfig

    # first, let remove all tasks explicitly marked as disabled by user
    tasks_to_run, tasks_disabled = self.get_tasks_without_disabled(root_task)
    if tasks_disabled:
        logger.info(
            "Tasks were removed from the task graph as they are marked as not to run: %s",
            tasks_summary(tasks_disabled),
        )

    roots = [root_task]
    tasks_skipped = set()
    # in case we need to run only part of the graph we mark all other tasks as skipped
    if run_config.task or run_config.id:
        task_dag = root_task.ctrl.task_dag  # type: _TaskDagNode
        if run_config.task:
            roots = task_dag.select_by_task_names(run_config.task, tasks=tasks_to_run)
        elif run_config.id:
            roots = task_dag.select_by_task_ids(run_config.id, tasks=tasks_to_run)

        # everything not reachable from the selected roots is skipped
        tasks_skipped = tasks_to_run.difference(all_subdags(roots))

    enabled_tasks = tasks_to_run.difference(tasks_skipped)

    tasks_completed = set()
    task_skipped_as_not_required = set()
    if run_config.skip_completed:
        tasks_completed, task_skipped_as_not_required = find_tasks_to_skip_complete(
            roots, enabled_tasks
        )

    # # if any of the tasks is spark add policy
    # from dbnd._core.task.spark import _BaseSparkTask
    # for t in tasks_to_run:
    #     if isinstance(t, _BaseSparkTask):
    #         t.spark.apply_spark_cluster_policy(t)
    # bash_op = BashOperator(task_id="echo", bash_command="echo hi")
    # self.root_task.set_upstream(bash_op.task)

    friendly_ids = calculate_friendly_task_ids(tasks_to_run)

    # precompute id sets so the loop below does O(1) membership checks
    completed_ids = tasks_to_ids_set(tasks_completed)
    task_skipped_as_not_required_ids = tasks_to_ids_set(task_skipped_as_not_required)
    skipped_ids = tasks_to_ids_set(tasks_skipped)

    task_runs = []
    for task in tasks_to_run:
        with task.ctrl.task_context(phase=TaskContextPhase.BUILD):
            # we want to have configuration with task overrides
            task_engine = build_task_from_config(task_name=remote_engine.task_name)
            task_engine.require_submit = remote_engine.require_submit

            task_run = TaskRun(
                run=run,
                task=task,
                task_af_id=friendly_ids[task.task_id],
                task_engine=task_engine,
                _uuid=root_task_run_uid if task.task_id == root_task.task_id else None,
            )
            if task.task_id in completed_ids:
                task_run.is_reused = True

            if task.task_id in task_skipped_as_not_required_ids:
                task_run.is_reused = True
                task_run.is_skipped_as_not_required = True

            if task.task_id in skipped_ids:
                task_run.is_skipped = True

            if task.task_id == root_task.task_id:
                task_run.is_root = True

            task_runs.append(task_run)

    return task_runs