def _execute(self):
    """Run the task through a task runner and supervise it until it exits.

    Installs a SIGTERM handler, asks the task instance whether it may run,
    starts the task runner and then loops: poll the runner's return code,
    heartbeat, and sleep out whatever is left of the ``heartrate`` interval.
    ``on_kill`` is always invoked once the runner has been started, whether
    the loop ends normally or through an exception.

    :raises AirflowException: if SIGTERM is received, or if the time elapsed
        since ``self.latest_heartbeat`` exceeds the
        ``[scheduler] scheduler_zombie_task_threshold`` setting.
    """
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Kill the launched processes and abort the job on SIGTERM."""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        raise AirflowException("LocalTaskJob received SIGTERM signal")

    signal.signal(signal.SIGTERM, signal_handler)

    if not self.task_instance._check_and_change_state_before_execution(
        mark_success=self.mark_success,
        ignore_all_deps=self.ignore_all_deps,
        ignore_depends_on_past=self.ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        ignore_ti_state=self.ignore_ti_state,
        job_id=self.id,
        pool=self.pool,
    ):
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        heartbeat_time_limit = conf.getint('scheduler', 'scheduler_zombie_task_threshold')
        while True:
            # Monitor the task to see if it's done
            return_code = self.task_runner.return_code()
            if return_code is not None:
                self.log.info("Task exited with return code %s", return_code)
                return

            self.heartbeat()

            # If it's been too long since we've heartbeat, then it's possible that
            # the scheduler rescheduled this task, so kill launched processes.
            # This can only really happen if the worker can't read the DB for a long time
            time_since_last_heartbeat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                # Message wording aligned with the other _execute variants
                # ("limit", not "limited").
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException(
                    "Time since last heartbeat({:.2f}s) "
                    "exceeded limit ({}s).".format(time_since_last_heartbeat, heartbeat_time_limit)
                )

            # Only sleep for the part of the heartrate interval not yet used up.
            if time_since_last_heartbeat < self.heartrate:
                sleep_for = self.heartrate - time_since_last_heartbeat
                self.log.debug(
                    "Time since last heartbeat(%.2f s) < heartrate(%s s)"
                    ", sleeping for %s s",
                    time_since_last_heartbeat,
                    self.heartrate,
                    sleep_for,
                )
                time.sleep(sleep_for)
    finally:
        self.on_kill()
def test_should_support_core_task_runner(self, mock_subprocess):
    """get_task_runner should hand back a ``StandardTaskRunner`` for a mocked job."""
    # Dotted key: configures the return value of the nested
    # task_instance.get_template_context() call on the mock.
    job_config = {
        'task_instance.get_template_context.return_value': {"ti": mock.MagicMock()},
    }
    local_task_job = mock.MagicMock(**job_config)

    task_runner = get_task_runner(local_task_job)

    runner_class_name = task_runner.__class__.__name__
    assert runner_class_name == "StandardTaskRunner"
def test_should_support_custom_task_runner(self):
    """get_task_runner should build the runner by calling ``custom_task_runner`` with the job."""
    # Start from a clean call history so assert_called_once_with is meaningful.
    custom_task_runner.reset_mock()

    job_config = {
        'task_instance.get_template_context.return_value': {"ti": mock.MagicMock()},
    }
    local_task_job = mock.MagicMock(**job_config)

    task_runner = get_task_runner(local_task_job)

    custom_task_runner.assert_called_once_with(local_task_job)
    expected_runner = custom_task_runner.return_value
    assert expected_runner == task_runner
def _execute(self):
    """Start the task runner and watch it until it exits or the job is terminating.

    A SIGTERM handler is installed first. If the task instance reports that it
    cannot be run, the method returns without starting the runner. Otherwise
    the runner is started and polled in a loop, with a heartbeat between
    polls; ``on_kill`` runs on every exit path after that point.

    :raises AirflowException: on SIGTERM, or when the time since
        ``self.latest_heartbeat`` exceeds the
        ``[scheduler] scheduler_zombie_task_threshold`` setting.
    """
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Kill subprocesses, fail an unfinished task instance, then abort."""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        ti = self.task_instance
        ti.refresh_from_db()
        if ti.state not in State.finished:
            ti.set_state(State.FAILED)
        ti._run_finished_callback(error="task received sigterm")
        raise AirflowException("LocalTaskJob received SIGTERM signal")

    signal.signal(signal.SIGTERM, signal_handler)

    can_run = self.task_instance.check_and_change_state_before_execution(
        mark_success=self.mark_success,
        ignore_all_deps=self.ignore_all_deps,
        ignore_depends_on_past=self.ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        ignore_ti_state=self.ignore_ti_state,
        job_id=self.id,
        pool=self.pool,
    )
    if not can_run:
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        zombie_threshold = conf.getint('scheduler', 'scheduler_zombie_task_threshold')

        # Task callbacks are invoked from here or from self.heartbeat()
        # rather than from taskinstance._run_raw_task, to avoid race
        # conditions.
        #
        # Once heartbeat_callback has set self.terminating to True the loop
        # must not start another iteration; otherwise self.handle_task_exit
        # would be invoked and callbacks would be duplicated.
        while not self.terminating:
            # Block in the runner's wait for as long as we can afford so the
            # subprocess finishing is noticed as quickly as possible.
            since_heartbeat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            wait_budget = zombie_threshold - since_heartbeat * 0.75
            # Clamp: never longer than one heartrate, never negative.
            max_wait_time = max(0, min(wait_budget, self.heartrate))

            exit_code = self.task_runner.return_code(timeout=max_wait_time)
            if exit_code is not None:
                self.handle_task_exit(exit_code)
                return

            self.heartbeat()

            # Too long without a heartbeat means the scheduler may have
            # rescheduled this task, so the launched processes get killed.
            # This should only happen if the worker cannot read the DB for a
            # long time.
            elapsed = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            if elapsed > zombie_threshold:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException(
                    "Time since last heartbeat({:.2f}s) "
                    "exceeded limit ({}s).".format(elapsed, zombie_threshold)
                )
    finally:
        self.on_kill()
def _execute(self):
    """Launch the task runner and poll it until the task process exits.

    Registers a SIGTERM handler, checks with the task instance that the task
    may run, then starts the runner and alternates between waiting on its
    return code and heartbeating. ``on_kill`` is called on the way out of the
    monitoring section regardless of how it ends.

    :raises AirflowException: on SIGTERM, or when the time since
        ``self.latest_heartbeat`` exceeds the
        ``[scheduler] scheduler_zombie_task_threshold`` setting.
    """
    self.task_runner = get_task_runner(self)

    # pylint: disable=unused-argument
    def signal_handler(signum, frame):
        """Kill the launched processes and abort the job on SIGTERM."""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        raise AirflowException("LocalTaskJob received SIGTERM signal")

    # pylint: enable=unused-argument
    signal.signal(signal.SIGTERM, signal_handler)

    task_may_run = self.task_instance.check_and_change_state_before_execution(
        mark_success=self.mark_success,
        ignore_all_deps=self.ignore_all_deps,
        ignore_depends_on_past=self.ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        ignore_ti_state=self.ignore_ti_state,
        job_id=self.id,
        pool=self.pool,
    )
    if not task_may_run:
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        time_limit = conf.getint('scheduler', 'scheduler_zombie_task_threshold')

        while True:
            # Wait inside the runner for as long as possible so that the
            # subprocess finishing is noticed as quickly as we can.
            seconds_since_beat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            wait_budget = time_limit - seconds_since_beat * 0.75
            # Clamp: never longer than one heartrate, never negative.
            max_wait_time = max(0, min(wait_budget, self.heartrate))

            return_code = self.task_runner.return_code(timeout=max_wait_time)
            if return_code is not None:
                self.log.info("Task exited with return code %s", return_code)
                return

            self.heartbeat()

            # A heartbeat gap above the limit suggests the scheduler may have
            # rescheduled this task, so the launched processes are killed.
            # This should only happen when the worker cannot read the DB for
            # a long time.
            heartbeat_gap = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            if heartbeat_gap > time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException(
                    "Time since last heartbeat({:.2f}s) "
                    "exceeded limit ({}s).".format(heartbeat_gap, time_limit)
                )
    finally:
        self.on_kill()
def _execute(self):
    """Enable task listeners, start the task runner and supervise it.

    On SIGTERM the runner is terminated and the exit is handled with code
    ``128 + signum``. If the task instance says it cannot run, the method
    returns before starting the runner. Otherwise the runner is polled and
    the job heartbeats until the task exits or ``self.terminating`` is set;
    ``on_kill`` runs on every exit path after the runner is started.

    :raises AirflowException: when the time since ``self.latest_heartbeat``
        exceeds the ``[scheduler] scheduler_zombie_task_threshold`` setting.
    """
    self._enable_task_listeners()
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Terminate the task runner and process the exit on SIGTERM."""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.task_runner.terminate()
        self.handle_task_exit(128 + signum)

    signal.signal(signal.SIGTERM, signal_handler)

    allowed_to_run = self.task_instance.check_and_change_state_before_execution(
        mark_success=self.mark_success,
        ignore_all_deps=self.ignore_all_deps,
        ignore_depends_on_past=self.ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        ignore_ti_state=self.ignore_ti_state,
        job_id=self.id,
        pool=self.pool,
        external_executor_id=self.external_executor_id,
    )
    if not allowed_to_run:
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        heartbeat_time_limit = conf.getint('scheduler', 'scheduler_zombie_task_threshold')

        # LocalTaskJob must not run callbacks itself; they are handled by
        # TaskInstance._run_raw_task, because:
        #   1. LocalTaskJob does not parse the DAG, so it cannot run callbacks.
        #   2. LocalTaskJob's run_as_user is likely not the same as that of
        #      TaskInstance._run_raw_task. When run_as_user is specified the
        #      process owner of LocalTaskJob must be sudoable, and running
        #      callbacks as a sudoable user is not secure.
        # If _run_raw_task receives SIGKILL, the scheduler marks it as a
        # zombie and invokes callbacks. If LocalTaskJob receives SIGTERM, it
        # passes SIGTERM on to _run_raw_task. If the state of task_instance
        # is changed, LocalTaskJob sends SIGTERM to _run_raw_task.
        while not self.terminating:
            # Wait inside the runner for as long as possible so that the
            # subprocess finishing is noticed as quickly as we can.
            since_last_beat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            wait_budget = heartbeat_time_limit - since_last_beat * 0.75
            # Clamp: never longer than one heartrate, never negative.
            max_wait_time = max(0, min(wait_budget, self.heartrate))

            exit_code = self.task_runner.return_code(timeout=max_wait_time)
            if exit_code is not None:
                self.handle_task_exit(exit_code)
                return

            self.heartbeat()

            # A heartbeat gap above the limit suggests the scheduler may have
            # rescheduled this task, so the launched processes are killed.
            # This should only happen when the worker cannot read the DB for
            # a long time.
            time_since_last_heartbeat = (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException(
                    f"Time since last heartbeat({time_since_last_heartbeat:.2f}s) exceeded limit "
                    f"({heartbeat_time_limit}s)."
                )
    finally:
        self.on_kill()
def _execute(self):
    """Run the task through a task runner, heartbeating until it exits.

    Installs a SIGTERM handler, asks the task instance whether it may run,
    starts the task runner and then loops: poll the runner's return code and
    heartbeat. A heartbeat that raises ``OperationalError`` is logged and
    followed by a sleep of ``self.heartrate`` seconds instead of aborting.
    ``on_kill`` is always invoked once the runner has been started.

    :raises AirflowException: if SIGTERM is received, or if the time since
        the last heartbeat that did not raise exceeds the
        ``[scheduler] scheduler_zombie_task_threshold`` setting.
    """
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Kill the launched processes and abort the job on SIGTERM."""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        raise AirflowException("LocalTaskJob received SIGTERM signal")

    signal.signal(signal.SIGTERM, signal_handler)

    if not self.task_instance._check_and_change_state_before_execution(
        mark_success=self.mark_success,
        ignore_all_deps=self.ignore_all_deps,
        ignore_depends_on_past=self.ignore_depends_on_past,
        ignore_task_deps=self.ignore_task_deps,
        ignore_ti_state=self.ignore_ti_state,
        job_id=self.id,
        pool=self.pool,
    ):
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        last_heartbeat_time = time.time()
        heartbeat_time_limit = conf.getint('scheduler', 'scheduler_zombie_task_threshold')
        while True:
            # Monitor the task to see if it's done
            return_code = self.task_runner.return_code()
            if return_code is not None:
                self.log.info("Task exited with return code %s", return_code)
                return

            # Periodically heartbeat so that the scheduler doesn't think this
            # is a zombie
            try:
                self.heartbeat()
            except OperationalError:
                Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                self.log.exception(
                    "Exception while trying to heartbeat! Sleeping for %s seconds",
                    self.heartrate,
                )
                time.sleep(self.heartrate)
            else:
                # Only a heartbeat that did not raise refreshes the timestamp.
                last_heartbeat_time = time.time()

            # If it's been too long since we've heartbeat, then it's possible that
            # the scheduler rescheduled this task, so kill launched processes.
            time_since_last_heartbeat = time.time() - last_heartbeat_time
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                # Message wording aligned with the other _execute variants
                # ("limit", not "limited").
                self.log.error("Heartbeat time limit exceeded!")
                raise AirflowException(
                    "Time since last heartbeat({:.2f}s) "
                    "exceeded limit ({}s).".format(time_since_last_heartbeat, heartbeat_time_limit)
                )
    finally:
        self.on_kill()