def __init__(self, local_task_job):
    # Pass task instance context into log handlers to setup the logger.
    super().__init__(local_task_job.task_instance)
    self._task_instance = local_task_job.task_instance

    popen_prepend = []
    if self._task_instance.run_as_user:
        self.run_as_user = self._task_instance.run_as_user
    else:
        try:
            self.run_as_user = conf.get('core', 'default_impersonation')
        except AirflowConfigException:
            self.run_as_user = None

    self._error_file = NamedTemporaryFile(delete=True)

    # Add sudo commands to change user if we need to. Needed to handle SubDagOperator
    # case using a SequentialExecutor.
    self.log.debug("Planning to run as the %s user", self.run_as_user)
    if self.run_as_user and (self.run_as_user != getuser()):
        # We want to include any environment variables now, as we won't
        # want to have to specify them in the sudo call - they would show
        # up in `ps` that way! And run commands now, as the other user
        # might not be able to run the cmds to get credentials
        cfg_path = tmp_configuration_copy(chmod=0o600, include_env=True, include_cmds=True)

        # Give ownership of file to user; only they can read and write
        subprocess.check_call(
            ['sudo', 'chown', self.run_as_user, cfg_path, self._error_file.name], close_fds=True
        )

        # propagate PYTHONPATH environment variable
        pythonpath_value = os.environ.get(PYTHONPATH_VAR, '')
        popen_prepend = ['sudo', '-E', '-H', '-u', self.run_as_user]

        if pythonpath_value:
            popen_prepend.append(f'{PYTHONPATH_VAR}={pythonpath_value}')

    else:
        # Always provide a copy of the configuration file settings. Since
        # we are running as the same user, and can pass through environment
        # variables then we don't need to include those in the config copy
        # - the runner can read/execute those values as it needs
        cfg_path = tmp_configuration_copy(chmod=0o600, include_env=False, include_cmds=False)

    self._cfg_path = cfg_path
    self._command = (
        popen_prepend
        + self._task_instance.command_as_list(
            raw=True,
            pickle_id=local_task_job.pickle_id,
            mark_success=local_task_job.mark_success,
            job_id=local_task_job.id,
            pool=local_task_job.pool,
            cfg_path=cfg_path,
        )
        + ["--error-file", self._error_file.name]
    )
    self.process = None
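# A hedged sketch (not the actual Airflow API) of how a prefix like
# popen_prepend above is consumed: the runner launches the assembled argv with
# subprocess.Popen. launch_task_process and task_argv are illustrative names.
import os
import subprocess

PYTHONPATH_VAR = 'PYTHONPATH'


def launch_task_process(run_as_user, task_argv):
    prefix = []
    if run_as_user:
        # -E asks sudo to preserve the caller's environment and -H sets HOME
        # for the target user; sudo still strips PYTHONPATH by default, hence
        # re-injecting it explicitly as VAR=value ahead of the command.
        prefix = ['sudo', '-E', '-H', '-u', run_as_user]
        pythonpath_value = os.environ.get(PYTHONPATH_VAR, '')
        if pythonpath_value:
            prefix.append(f'{PYTHONPATH_VAR}={pythonpath_value}')
    return subprocess.Popen(prefix + task_argv, close_fds=True)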
def __init__(self, local_task_job):
    """
    :param local_task_job: The local task job associated with running the
        associated task instance.
    :type local_task_job: airflow.jobs.LocalTaskJob
    """
    # Pass task instance context into log handlers to setup the logger.
    super(BaseTaskRunner, self).__init__(local_task_job.task_instance)
    self._task_instance = local_task_job.task_instance

    popen_prepend = []
    if self._task_instance.run_as_user:
        self.run_as_user = self._task_instance.run_as_user
    else:
        try:
            self.run_as_user = conf.get('core', 'default_impersonation')
        except conf.AirflowConfigException:
            self.run_as_user = None

    # Always provide a copy of the configuration file settings
    cfg_path = tmp_configuration_copy()

    # The following command should always work since the user doing chmod is the same
    # as the one who just created the file.
    subprocess.call(
        ['chmod', '600', cfg_path],
        close_fds=True
    )

    # Add sudo commands to change user if we need to. Needed to handle SubDagOperator
    # case using a SequentialExecutor.
    self.log.debug("Planning to run as the %s user", self.run_as_user)
    if self.run_as_user and (self.run_as_user != getpass.getuser()):
        # Give ownership of file to user; only they can read and write
        subprocess.call(
            ['sudo', 'chown', self.run_as_user, cfg_path],
            close_fds=True
        )

        # propagate PYTHONPATH environment variable
        pythonpath_value = os.environ.get(PYTHONPATH_VAR, '')
        popen_prepend = ['sudo', '-E', '-H', '-u', self.run_as_user]

        if pythonpath_value:
            popen_prepend.append('{}={}'.format(PYTHONPATH_VAR, pythonpath_value))

    self._cfg_path = cfg_path
    self._command = popen_prepend + self._task_instance.command_as_list(
        raw=True,
        pickle_id=local_task_job.pickle_id,
        mark_success=local_task_job.mark_success,
        job_id=local_task_job.id,
        pool=local_task_job.pool,
        cfg_path=cfg_path,
    )
    self.process = None
def __init__(self, local_task_job):
    """
    :param local_task_job: The local task job associated with running the
        associated task instance.
    :type local_task_job: airflow.jobs.LocalTaskJob
    """
    # Pass task instance context into log handlers to setup the logger.
    super(BaseTaskRunner, self).__init__(local_task_job.task_instance)
    self._task_instance = local_task_job.task_instance

    popen_prepend = []
    if self._task_instance.run_as_user:
        self.run_as_user = self._task_instance.run_as_user
    else:
        try:
            self.run_as_user = conf.get('core', 'default_impersonation')
        except conf.AirflowConfigException:
            self.run_as_user = None

    # Always provide a copy of the configuration file settings
    cfg_path = tmp_configuration_copy()

    # The following command should always work since the user doing chmod is the same
    # as the one who just created the file.
    subprocess.call(
        ['chmod', '600', cfg_path],
        close_fds=True
    )

    # Add sudo commands to change user if we need to. Needed to handle SubDagOperator
    # case using a SequentialExecutor.
    self.log.debug("Planning to run as the %s user", self.run_as_user)
    if self.run_as_user and (self.run_as_user != getpass.getuser()):
        # Give ownership of file to user; only they can read and write
        subprocess.call(
            ['sudo', 'chown', self.run_as_user, cfg_path],
            close_fds=True
        )

        # propagate PYTHONPATH environment variable
        pythonpath_value = os.environ.get(PYTHONPATH_VAR, '')
        popen_prepend = ['sudo', '-H', '-u', self.run_as_user]

        if pythonpath_value:
            popen_prepend.append('{}={}'.format(PYTHONPATH_VAR, pythonpath_value))

    self._cfg_path = cfg_path
    self._command = popen_prepend + self._task_instance.command_as_list(
        raw=True,
        pickle_id=local_task_job.pickle_id,
        mark_success=local_task_job.mark_success,
        job_id=local_task_job.id,
        pool=local_task_job.pool,
        cfg_path=cfg_path,
    )
    self.process = None
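# A minimal sketch of the config-handoff pattern shared by the constructors
# above: write a temporary copy of the settings, restrict it to the owner,
# then (only when impersonating) hand ownership to the target user.
# make_cfg_copy is a hypothetical stand-in for Airflow's tmp_configuration_copy;
# the chown step assumes a passwordless-sudo rule for the scheduler's user.
import json
import os
import subprocess
from getpass import getuser
from tempfile import mkstemp


def make_cfg_copy(settings):
    fd, path = mkstemp(suffix='.json')
    with os.fdopen(fd, 'w') as handle:
        json.dump(settings, handle)
    os.chmod(path, 0o600)  # owner read/write only, same effect as `chmod 600`
    return path


cfg_path = make_cfg_copy({'core': {'parallelism': '32'}})
run_as_user = 'etl_user'  # assumed impersonation target
if run_as_user != getuser():
    subprocess.call(['sudo', 'chown', run_as_user, cfg_path], close_fds=True)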
def _per_task_process(task, key, ti, session=None):
    ti.refresh_from_db()

    task = self.dag.get_task(ti.task_id)
    ti.task = task

    ignore_depends_on_past = (
        self.ignore_first_depends_on_past and
        ti.execution_date == (start_date or ti.start_date))
    self.log.debug("Task instance to run %s state %s", ti, ti.state)

    # The task was already marked successful or skipped by a
    # different Job. Don't rerun it.
    if ti.state == State.SUCCESS:
        ti_status.succeeded.add(key)
        self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return
    elif ti.state == State.SKIPPED:
        ti_status.skipped.add(key)
        self.log.debug("Task instance %s skipped. Don't rerun.", ti)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return

    # guard against externally modified tasks instances or
    # in case max concurrency has been reached at task runtime
    elif ti.state == State.NONE:
        self.log.warning(
            "FIXME: task instance %s state was set to None "
            "externally. This should not happen", ti)
        ti.set_state(State.SCHEDULED, session=session)

    if self.rerun_failed_tasks:
        # Rerun failed tasks or upstreamed failed tasks
        if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
            self.log.error("Task instance {ti} "
                           "with state {state}".format(ti=ti, state=ti.state))
            if key in ti_status.running:
                ti_status.running.pop(key)
            # Reset the failed task in backfill to scheduled state
            ti.set_state(State.SCHEDULED, session=session)
    else:
        # Default behaviour which works for subdag.
        if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
            self.log.error("Task instance {ti} "
                           "with {state} state".format(ti=ti, state=ti.state))
            ti_status.failed.add(key)
            ti_status.to_run.pop(key)
            if key in ti_status.running:
                ti_status.running.pop(key)
            return

    backfill_context = DepContext(
        deps=BACKFILL_QUEUED_DEPS,
        ignore_depends_on_past=ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        flag_upstream_failed=True)

    ti.refresh_from_db(lock_for_update=True, session=session)
    # Is the task runnable? -- then run it
    # the dependency checker can change states of tis
    if ti.are_dependencies_met(dep_context=backfill_context,
                               session=session,
                               verbose=self.verbose):
        if executor.has_task(ti):
            self.log.debug(
                "Task Instance %s already in executor "
                "waiting for queue to clear", ti)
        else:
            self.log.debug('Sending %s to executor', ti)
            # Skip scheduled state, we are executing immediately
            ti.state = State.QUEUED
            ti.queued_dttm = timezone.utcnow() if not ti.queued_dttm else ti.queued_dttm
            session.merge(ti)

            cfg_path = None
            if executor.__class__ in (executors.LocalExecutor,
                                      executors.SequentialExecutor):
                cfg_path = tmp_configuration_copy()

            executor.queue_task_instance(
                ti,
                mark_success=self.mark_success,
                pickle_id=pickle_id,
                ignore_task_deps=self.ignore_task_deps,
                ignore_depends_on_past=ignore_depends_on_past,
                pool=self.pool,
                cfg_path=cfg_path)
            ti_status.running[key] = ti
            ti_status.to_run.pop(key)
        session.commit()
        return

    if ti.state == State.UPSTREAM_FAILED:
        self.log.error("Task instance %s upstream failed", ti)
        ti_status.failed.add(key)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return

    # special case
    if ti.state == State.UP_FOR_RETRY:
        self.log.debug("Task instance %s retry period not "
                       "expired yet", ti)
        if key in ti_status.running:
            ti_status.running.pop(key)
        ti_status.to_run[key] = ti
        return

    # special case
    if ti.state == State.UP_FOR_RESCHEDULE:
        self.log.debug("Task instance %s reschedule period not "
                       "expired yet", ti)
        if key in ti_status.running:
            ti_status.running.pop(key)
        ti_status.to_run[key] = ti
        return

    # all remaining tasks
    self.log.debug('Adding %s to not_ready', ti)
    ti_status.not_ready.add(key)
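# The branches above repeat one bookkeeping move: a key either leaves
# to_run/running for a terminal set, or is re-parked in to_run for a later
# pass. TiStatus below is a simplified, assumed stand-in for
# BackfillJob._DagRunTaskStatus that makes that state machine explicit.
from dataclasses import dataclass, field


@dataclass
class TiStatus:
    to_run: dict = field(default_factory=dict)
    running: dict = field(default_factory=dict)
    succeeded: set = field(default_factory=set)
    skipped: set = field(default_factory=set)
    failed: set = field(default_factory=set)
    not_ready: set = field(default_factory=set)

    def finish(self, key, bucket):
        # Terminal outcome: record it and stop tracking the key entirely.
        bucket.add(key)
        self.to_run.pop(key, None)
        self.running.pop(key, None)

    def park(self, key, ti):
        # Retry/reschedule case: keep the task for a later scheduling pass.
        self.running.pop(key, None)
        self.to_run[key] = ti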
def _per_task_process(key, ti: TaskInstance, session=None):
    ti.refresh_from_db(lock_for_update=True, session=session)

    task = self.dag.get_task(ti.task_id, include_subdags=True)
    ti.task = task

    self.log.debug("Task instance to run %s state %s", ti, ti.state)

    # The task was already marked successful or skipped by a
    # different Job. Don't rerun it.
    if ti.state == TaskInstanceState.SUCCESS:
        ti_status.succeeded.add(key)
        self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return
    elif ti.state == TaskInstanceState.SKIPPED:
        ti_status.skipped.add(key)
        self.log.debug("Task instance %s skipped. Don't rerun.", ti)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return

    # guard against externally modified tasks instances or
    # in case max concurrency has been reached at task runtime
    elif ti.state == State.NONE:
        self.log.warning(
            "FIXME: Task instance %s state was set to None externally. This should not happen", ti
        )
        ti.set_state(TaskInstanceState.SCHEDULED, session=session)

    if self.rerun_failed_tasks:
        # Rerun failed tasks or upstreamed failed tasks
        if ti.state in (TaskInstanceState.FAILED, TaskInstanceState.UPSTREAM_FAILED):
            self.log.error("Task instance %s with state %s", ti, ti.state)
            if key in ti_status.running:
                ti_status.running.pop(key)
            # Reset the failed task in backfill to scheduled state
            ti.set_state(TaskInstanceState.SCHEDULED, session=session)
    else:
        # Default behaviour which works for subdag.
        if ti.state in (TaskInstanceState.FAILED, TaskInstanceState.UPSTREAM_FAILED):
            self.log.error("Task instance %s with state %s", ti, ti.state)
            ti_status.failed.add(key)
            ti_status.to_run.pop(key)
            if key in ti_status.running:
                ti_status.running.pop(key)
            return

    if self.ignore_first_depends_on_past:
        dagrun = ti.get_dagrun(session=session)
        ignore_depends_on_past = dagrun.execution_date == (start_date or ti.start_date)
    else:
        ignore_depends_on_past = False

    backfill_context = DepContext(
        deps=BACKFILL_QUEUED_DEPS,
        ignore_depends_on_past=ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        flag_upstream_failed=True,
    )
    # Is the task runnable? -- then run it
    # the dependency checker can change states of tis
    if ti.are_dependencies_met(
        dep_context=backfill_context, session=session, verbose=self.verbose
    ):
        if executor.has_task(ti):
            self.log.debug("Task Instance %s already in executor waiting for queue to clear", ti)
        else:
            self.log.debug('Sending %s to executor', ti)
            # Skip scheduled state, we are executing immediately
            ti.state = TaskInstanceState.QUEUED
            ti.queued_by_job_id = self.id
            ti.queued_dttm = timezone.utcnow()
            session.merge(ti)
            try:
                session.commit()
            except OperationalError:
                self.log.exception("Failed to commit task state change due to operational error")
                session.rollback()
                # early exit so the outer loop can retry
                return

            cfg_path = None
            if self.executor_class in (
                executor_constants.LOCAL_EXECUTOR,
                executor_constants.SEQUENTIAL_EXECUTOR,
            ):
                cfg_path = tmp_configuration_copy()

            executor.queue_task_instance(
                ti,
                mark_success=self.mark_success,
                pickle_id=pickle_id,
                ignore_task_deps=self.ignore_task_deps,
                ignore_depends_on_past=ignore_depends_on_past,
                pool=self.pool,
                cfg_path=cfg_path,
            )
            ti_status.running[key] = ti
            ti_status.to_run.pop(key)
        return

    if ti.state == TaskInstanceState.UPSTREAM_FAILED:
        self.log.error("Task instance %s upstream failed", ti)
        ti_status.failed.add(key)
        ti_status.to_run.pop(key)
        if key in ti_status.running:
            ti_status.running.pop(key)
        return

    # special case
    if ti.state == TaskInstanceState.UP_FOR_RETRY:
        self.log.debug("Task instance %s retry period not expired yet", ti)
        if key in ti_status.running:
            ti_status.running.pop(key)
        ti_status.to_run[key] = ti
        return

    # special case
    if ti.state == TaskInstanceState.UP_FOR_RESCHEDULE:
        self.log.debug("Task instance %s reschedule period not expired yet", ti)
        if key in ti_status.running:
            ti_status.running.pop(key)
        ti_status.to_run[key] = ti
        return

    # all remaining tasks
    self.log.debug('Adding %s to not_ready', ti)
    ti_status.not_ready.add(key)
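# A sketch of the commit guard introduced in the newer version above, assuming
# a SQLAlchemy session: the QUEUED transition is committed immediately, and a
# transient OperationalError triggers rollback plus an early return so the
# outer loop can retry the task on a later pass.
from sqlalchemy.exc import OperationalError


def try_commit(session, log):
    try:
        session.commit()
        return True
    except OperationalError:
        log.exception("Failed to commit task state change due to operational error")
        session.rollback()
        return False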
def _process_dag_task_instances(self, ti_status, executor, pickle_id, session=None):
    """
    Process a set of task instances from a set of dag runs. Special handling is done
    to account for different task instance states that could be present when running
    them in a backfill process.

    :param ti_status: the internal status of the job
    :type ti_status: DagRunJob._DagRunTaskStatus
    :param executor: the executor to run the task instances
    :type executor: BaseExecutor
    :param pickle_id: the pickle_id if dag is pickled, None otherwise
    :type pickle_id: int
    :param session: the current session object
    :type session: Session
    :return: the list of execution_dates for the finished dag runs
    :rtype: list
    """
    executed_run_dates = []

    # values() returns a view so we copy to maintain a full list of the TIs to run
    all_ti = list(ti_status.to_run.values())
    waiting_for_executor_result = {}

    while (len(ti_status.to_run) > 0 or len(ti_status.running) > 0) and len(
        ti_status.deadlocked
    ) == 0:
        if current.is_killed():
            raise friendly_error.task_execution.databand_context_killed(
                "SingleDagRunJob scheduling main loop"
            )

        self.log.debug("*** Clearing out not_ready list ***")
        ti_status.not_ready.clear()

        self.ti_state_manager.refresh_task_instances_state(
            all_ti, self.dag.dag_id, self.execution_date, session=session
        )

        # we need to execute the tasks bottom to top
        # or leaf to root, as otherwise tasks might be
        # determined deadlocked while they are actually
        # waiting for their upstream to finish
        for task in self.dag.topological_sort():
            # TODO: too complicated mechanism,
            # it's not possible that we have multiple tasks with the same id in to_run
            for key, ti in list(ti_status.to_run.items()):
                if task.task_id != ti.task_id:
                    continue
                if not self._optimize:
                    ti.refresh_from_db()

                task = self.dag.get_task(ti.task_id)
                ti.task = task

                # TODO: do we need that?
                # ignore_depends_on_past = (
                #     self.ignore_first_depends_on_past and
                #     ti.execution_date == (start_date or ti.start_date))
                ignore_depends_on_past = False
                self.log.debug("Task instance to run %s state %s", ti, ti.state)

                # guard against externally modified tasks instances or
                # in case max concurrency has been reached at task runtime
                if ti.state == State.NONE:
                    self.log.warning(
                        "FIXME: task instance %s state was set to None "
                        "externally. This should not happen",
                        ti,
                    )
                    ti.set_state(State.SCHEDULED, session=session)

                # The task was already marked successful or skipped by a
                # different Job. Don't rerun it.
                if ti.state == State.SUCCESS:
                    ti_status.succeeded.add(key)
                    self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    continue
                elif ti.state == State.SKIPPED:
                    ti_status.skipped.add(key)
Don't rerun.", ti) ti_status.to_run.pop(key) if key in ti_status.running: ti_status.running.pop(key) continue elif ti.state == State.FAILED: self.log.error("Task instance %s failed", ti) ti_status.failed.add(key) ti_status.to_run.pop(key) if key in ti_status.running: ti_status.running.pop(key) continue elif ti.state == State.UPSTREAM_FAILED: self.log.error("Task instance %s upstream failed", ti) ti_status.failed.add(key) ti_status.to_run.pop(key) if key in ti_status.running: ti_status.running.pop(key) continue runtime_deps = [] if self.airflow_config.disable_dag_concurrency_rules: # RUN Deps validate dag and task concurrency # It's less relevant when we run in stand along mode with SingleDagRunJob # from airflow.ti_deps.deps.runnable_exec_date_dep import RunnableExecDateDep from airflow.ti_deps.deps.valid_state_dep import ValidStateDep # from airflow.ti_deps.deps.dag_ti_slots_available_dep import DagTISlotsAvailableDep # from airflow.ti_deps.deps.task_concurrency_dep import TaskConcurrencyDep # from airflow.ti_deps.deps.pool_slots_available_dep import PoolSlotsAvailableDep runtime_deps = { # RunnableExecDateDep(), ValidStateDep(SCHEDUALED_OR_RUNNABLE), # DagTISlotsAvailableDep(), # TaskConcurrencyDep(), # PoolSlotsAvailableDep(), } else: runtime_deps = RUNNING_DEPS dagrun_dep_context = DepContext( deps=runtime_deps, ignore_depends_on_past=ignore_depends_on_past, ignore_task_deps=self.ignore_task_deps, flag_upstream_failed=True, ) # Is the task runnable? -- then run it # the dependency checker can change states of tis if ti.are_dependencies_met( dep_context=dagrun_dep_context, session=session, verbose=self.verbose, ): ti.refresh_from_db(lock_for_update=True, session=session) if ( ti.state == State.SCHEDULED or ti.state == State.UP_FOR_RETRY ): if executor.has_task(ti): self.log.debug( "Task Instance %s already in executor " "waiting for queue to clear", ti, ) else: self.log.debug("Sending %s to executor", ti) # if ti.state == State.UP_FOR_RETRY: # ti._try_number += 1 # Skip scheduled state, we are executing immediately ti.state = State.QUEUED session.merge(ti) cfg_path = None if executor.__class__ in ( executors.LocalExecutor, executors.SequentialExecutor, ): cfg_path = tmp_configuration_copy() executor.queue_task_instance( ti, mark_success=self.mark_success, pickle_id=pickle_id, ignore_task_deps=self.ignore_task_deps, ignore_depends_on_past=ignore_depends_on_past, pool=self.pool, cfg_path=cfg_path, ) ti_status.to_run.pop(key) ti_status.running[key] = ti waiting_for_executor_result[key] = ti session.commit() continue if ti.state == State.UPSTREAM_FAILED: self.log.error("Task instance %s upstream failed", ti) ti_status.failed.add(key) ti_status.to_run.pop(key) if key in ti_status.running: ti_status.running.pop(key) continue # special case if ti.state == State.UP_FOR_RETRY: self.log.debug( "Task instance %s retry period not " "expired yet", ti ) if key in ti_status.running: ti_status.running.pop(key) ti_status.to_run[key] = ti continue # all remaining tasks self.log.debug("Adding %s to not_ready", ti) ti_status.not_ready.add(key) # execute the tasks in the queue self.heartbeat() executor.heartbeat() # If the set of tasks that aren't ready ever equals the set of # tasks to run and there are no running tasks then the backfill # is deadlocked if ( ti_status.not_ready and ti_status.not_ready == set(ti_status.to_run) and len(ti_status.running) == 0 ): self.log.warning( "scheduler: Deadlock discovered for ti_status.to_run=%s", ti_status.to_run.values(), ) 
            ti_status.deadlocked.update(ti_status.to_run.values())
            ti_status.to_run.clear()

        self.ti_state_manager.refresh_task_instances_state(
            all_ti, self.dag.dag_id, self.execution_date, session=session
        )

        # check executor state
        self._manage_executor_state(ti_status.running, waiting_for_executor_result)

        if self._zombie_cleaner:
            # this code exists in the original airflow scheduler:
            # clean zombies (we don't need multiple runs here actually)
            self._zombie_cleaner.find_and_clean_dag_zombies(
                dag=self.dag, execution_date=self.execution_date
            )

        # update the task counters
        self._update_counters(ti_status, waiting_for_executor_result)

        # update dag run state
        _dag_runs = ti_status.active_runs[:]
        for run in _dag_runs:
            run.update_state(session=session)
            self._update_databand_task_run_states(run)
            if run.state in State.finished():
                ti_status.finished_runs += 1
                ti_status.active_runs.remove(run)
                executed_run_dates.append(run.execution_date)

        self._log_progress(ti_status)

        if self.fail_fast and ti_status.failed:
            msg = ",".join([t[1] for t in ti_status.failed])
            logger.error(
                "scheduler: Terminating executor because a task failed "
                "and fail_fast mode is enabled %s",
                msg,
            )
            raise DatabandFailFastError(
                "Failing whole pipeline as it has failed/canceled tasks %s" % msg,
            )

    # return updated status
    return executed_run_dates
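# A restatement of the deadlock test inside the loop above: if every task
# still waiting was judged not ready in this pass and nothing is in flight,
# no later pass can make progress, so the remainder is declared deadlocked.
def is_deadlocked(not_ready, to_run, running):
    return bool(not_ready) and not_ready == set(to_run) and len(running) == 0


assert is_deadlocked({'t1'}, {'t1': object()}, {})
assert not is_deadlocked({'t1'}, {'t1': object()}, {'t2': object()})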