コード例 #1
0
    def _execute(self):
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")

        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance._check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')
            while True:
                # Monitor the task to see if it's done
                return_code = self.task_runner.return_code()
                if return_code is not None:
                    self.log.info("Task exited with return code %s",
                                  return_code)
                    return

                self.heartbeat()

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                # This can only really happen if the worker can't read the DB for a long time
                time_since_last_heartbeat = (
                    timezone.utcnow() - self.latest_heartbeat).total_seconds()
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limited exceeded!")
                    raise AirflowException(
                        "Time since last heartbeat({:.2f}s) "
                        "exceeded limit ({}s).".format(
                            time_since_last_heartbeat, heartbeat_time_limit))

                if time_since_last_heartbeat < self.heartrate:
                    sleep_for = self.heartrate - time_since_last_heartbeat
                    self.log.debug(
                        "Time since last heartbeat(%.2f s) < heartrate(%s s)"
                        ", sleeping for %s s", time_since_last_heartbeat,
                        self.heartrate, sleep_for)
                    time.sleep(sleep_for)
        finally:
            self.on_kill()
コード例 #2
0
ファイル: test_task_runner.py プロジェクト: ysktir/airflow-1
    def test_should_support_core_task_runner(self, mock_subprocess):
        local_task_job = mock.MagicMock(
            **{'task_instance.get_template_context.return_value': {"ti": mock.MagicMock()}}
        )
        task_runner = get_task_runner(local_task_job)

        assert "StandardTaskRunner" == task_runner.__class__.__name__
コード例 #3
0
ファイル: test_task_runner.py プロジェクト: ysktir/airflow-1
    def test_should_support_custom_task_runner(self):
        local_task_job = mock.MagicMock(
            **{'task_instance.get_template_context.return_value': {"ti": mock.MagicMock()}}
        )
        custom_task_runner.reset_mock()

        task_runner = get_task_runner(local_task_job)

        custom_task_runner.assert_called_once_with(local_task_job)

        assert custom_task_runner.return_value == task_runner
コード例 #4
0
ファイル: local_task_job.py プロジェクト: razius/airflow
    def _execute(self):
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            self.task_instance.refresh_from_db()
            if self.task_instance.state not in State.finished:
                self.task_instance.set_state(State.FAILED)
            self.task_instance._run_finished_callback(
                error="task received sigterm")
            raise AirflowException("LocalTaskJob received SIGTERM signal")

        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance.check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool,
        ):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')

            # task callback invocation happens either here or in
            # self.heartbeat() instead of taskinstance._run_raw_task to
            # avoid race conditions
            #
            # When self.terminating is set to True by heartbeat_callback, this
            # loop should not be restarted. Otherwise self.handle_task_exit
            # will be invoked and we will end up with duplicated callbacks
            while not self.terminating:
                # Monitor the task to see if it's done. Wait in a syscall
                # (`os.wait`) for as long as possible so we notice the
                # subprocess finishing as quick as we can
                max_wait_time = max(
                    0,  # Make sure this value is never negative,
                    min(
                        (heartbeat_time_limit -
                         (timezone.utcnow() -
                          self.latest_heartbeat).total_seconds() * 0.75),
                        self.heartrate,
                    ),
                )

                return_code = self.task_runner.return_code(
                    timeout=max_wait_time)
                if return_code is not None:
                    self.handle_task_exit(return_code)
                    return

                self.heartbeat()

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                # This can only really happen if the worker can't read the DB for a long time
                time_since_last_heartbeat = (
                    timezone.utcnow() - self.latest_heartbeat).total_seconds()
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limit exceeded!")
                    raise AirflowException(
                        "Time since last heartbeat({:.2f}s) "
                        "exceeded limit ({}s).".format(
                            time_since_last_heartbeat, heartbeat_time_limit))
        finally:
            self.on_kill()
コード例 #5
0
    def _execute(self):
        self.task_runner = get_task_runner(self)

        # pylint: disable=unused-argument
        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")

        # pylint: enable=unused-argument
        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance.check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool,
        ):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')

            while True:
                # Monitor the task to see if it's done. Wait in a syscall
                # (`os.wait`) for as long as possible so we notice the
                # subprocess finishing as quick as we can
                max_wait_time = max(
                    0,  # Make sure this value is never negative,
                    min(
                        (heartbeat_time_limit -
                         (timezone.utcnow() -
                          self.latest_heartbeat).total_seconds() * 0.75),
                        self.heartrate,
                    ),
                )

                return_code = self.task_runner.return_code(
                    timeout=max_wait_time)
                if return_code is not None:
                    self.log.info("Task exited with return code %s",
                                  return_code)
                    return

                self.heartbeat()

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                # This can only really happen if the worker can't read the DB for a long time
                time_since_last_heartbeat = (
                    timezone.utcnow() - self.latest_heartbeat).total_seconds()
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limit exceeded!")
                    raise AirflowException(
                        "Time since last heartbeat({:.2f}s) "
                        "exceeded limit ({}s).".format(
                            time_since_last_heartbeat, heartbeat_time_limit))
        finally:
            self.on_kill()
コード例 #6
0
    def _execute(self):
        self._enable_task_listeners()
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.task_runner.terminate()
            self.handle_task_exit(128 + signum)

        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance.check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool,
                external_executor_id=self.external_executor_id,
        ):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')

            # LocalTaskJob should not run callbacks, which are handled by TaskInstance._run_raw_task
            # 1, LocalTaskJob does not parse DAG, thus cannot run callbacks
            # 2, The run_as_user of LocalTaskJob is likely not same as the TaskInstance._run_raw_task.
            # When run_as_user is specified, the process owner of the LocalTaskJob must be sudoable.
            # It is not secure to run callbacks with sudoable users.

            # If _run_raw_task receives SIGKILL, scheduler will mark it as zombie and invoke callbacks
            # If LocalTaskJob receives SIGTERM, LocalTaskJob passes SIGTERM to _run_raw_task
            # If the state of task_instance is changed, LocalTaskJob sends SIGTERM to _run_raw_task
            while not self.terminating:
                # Monitor the task to see if it's done. Wait in a syscall
                # (`os.wait`) for as long as possible so we notice the
                # subprocess finishing as quick as we can
                max_wait_time = max(
                    0,  # Make sure this value is never negative,
                    min(
                        (heartbeat_time_limit -
                         (timezone.utcnow() -
                          self.latest_heartbeat).total_seconds() * 0.75),
                        self.heartrate,
                    ),
                )

                return_code = self.task_runner.return_code(
                    timeout=max_wait_time)
                if return_code is not None:
                    self.handle_task_exit(return_code)
                    return

                self.heartbeat()

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                # This can only really happen if the worker can't read the DB for a long time
                time_since_last_heartbeat = (
                    timezone.utcnow() - self.latest_heartbeat).total_seconds()
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limit exceeded!")
                    raise AirflowException(
                        f"Time since last heartbeat({time_since_last_heartbeat:.2f}s) exceeded limit "
                        f"({heartbeat_time_limit}s).")
        finally:
            self.on_kill()
コード例 #7
0
    def _execute(self):
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")

        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance._check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            last_heartbeat_time = time.time()
            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')
            while True:
                # Monitor the task to see if it's done
                return_code = self.task_runner.return_code()
                if return_code is not None:
                    self.log.info("Task exited with return code %s",
                                  return_code)
                    return

                # Periodically heartbeat so that the scheduler doesn't think this
                # is a zombie
                try:
                    self.heartbeat()
                    last_heartbeat_time = time.time()
                except OperationalError:
                    Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                    self.log.exception(
                        "Exception while trying to heartbeat! Sleeping for %s seconds",
                        self.heartrate)
                    time.sleep(self.heartrate)

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                time_since_last_heartbeat = time.time() - last_heartbeat_time
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limited exceeded!")
                    raise AirflowException(
                        "Time since last heartbeat({:.2f}s) "
                        "exceeded limit ({}s).".format(
                            time_since_last_heartbeat, heartbeat_time_limit))
        finally:
            self.on_kill()