Esempio n. 1
0
 def _process_message_failed(self, itask, event_time, message):
     """Handle a "failed" task message on behalf of process_message.

     The job's finish time is recorded in the task summary, the job pool
     and the task-jobs DB update (with run_status 1). The task is then
     either failed for good or, when its retry timer provides another
     delay, reset to the retrying state.

     Args:
         itask: The task the message is about.
         event_time: Time string of the failure; the current time string
             is used when this is None.
         message: Message handed to the "failed" event handlers.
     """
     if event_time is None:
         event_time = get_current_time_string()
     itask.set_summary_time('finished', event_time)

     job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
     self.job_pool.set_job_time(job_d, 'finished', event_time)
     job_update = {"run_status": 1, "time_run_exit": event_time}
     self.suite_db_mgr.put_update_task_jobs(itask, job_update)

     # A retry is lined up only when a retry timer is registered and its
     # next() call does not return None. The timer is only advanced if it
     # is registered (short-circuit evaluation).
     retry_lined_up = (
         TASK_STATUS_RETRYING in itask.try_timers
         and itask.try_timers[TASK_STATUS_RETRYING].next() is not None
     )

     if retry_lined_up:
         if itask.state.reset(TASK_STATUS_RETRYING):
             retry_timer = itask.try_timers[TASK_STATUS_RETRYING]
             delay_msg = "retrying in %s" % (
                 retry_timer.delay_timeout_as_str())
             if itask.state.is_held:
                 delay_msg = "held (%s)" % delay_msg
             summary = "failed, %s" % (delay_msg)
             LOG.info(
                 "[%s] -job(%02d) %s", itask, itask.submit_num, summary)
             itask.set_summary_message(summary)
             self.setup_event_handlers(
                 itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
     else:
         # No retry lined up: definitive failure.
         self.pflag = True
         if itask.state.reset(TASK_STATUS_FAILED):
             self.setup_event_handlers(itask, "failed", message)
             self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED)
         LOG.critical(
             "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")

     self._reset_job_timers(itask)
Esempio n. 2
0
def _can_auto_restart():
    """Report whether this workflow can safely auto stop-restart.

    A fresh (non-cached) host selection is attempted. Any failure is
    logged at critical level and reported as False rather than raised,
    so an error in host selection cannot take down the workflow.

    Returns:
        bool: True if host selection succeeded, otherwise False.
    """
    reason = None
    try:
        # Check whether there is currently an available host to restart on.
        select_workflow_host(cached=False)
    except HostSelectException:
        reason = 'No alternative host to restart workflow on.'
    except Exception:
        # Unexpected error: capture the traceback while the exception is
        # still being handled.
        reason = 'Error in host selection:\n' + traceback.format_exc()

    if reason is None:
        return True

    LOG.critical('Workflow cannot automatically restart because:\n' + reason)
    return False
Esempio n. 3
0
    def _process_message_failed(self, itask, event_time, message):
        """Handle a "failed" task message on behalf of process_message.

        The job is always recorded as finished and failed (job pool and
        task-jobs DB update). The task itself is either reset to the
        failed state or set up for an execution retry.

        Args:
            itask: The task the message is about.
            event_time: Time string of the failure; the current time
                string is used when this is None.
            message: Message handed to the failed-event handlers.

        Returns:
            bool: True if no retry is lined up (hence the task goes to
            the failed state), False if an execution retry was arranged.
        """
        if event_time is None:
            event_time = get_current_time_string()
        itask.set_summary_time('finished', event_time)

        job_id = get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        self.job_pool.set_job_time(job_id, 'finished', event_time)
        self.job_pool.set_job_state(job_id, TASK_STATUS_FAILED)
        job_update = {"run_status": 1, "time_run_exit": event_time}
        self.suite_db_mgr.put_update_task_jobs(itask, job_update)
        self.pflag = True

        # A retry is lined up only when an execution retry timer is
        # registered and its next() call does not return None. The timer
        # is only advanced if it is registered (short-circuit evaluation).
        retry_lined_up = (
            TimerFlags.EXECUTION_RETRY in itask.try_timers
            and itask.try_timers[TimerFlags.EXECUTION_RETRY].next()
            is not None
        )

        if retry_lined_up:
            retry_timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
            self._retry_task(itask, retry_timer.timeout)
            delay_msg = f"retrying in {retry_timer.delay_timeout_as_str()}"
            if itask.state.is_held:
                delay_msg = f"held ({delay_msg})"
            summary = f"failed, {delay_msg}"
            LOG.info(
                "[%s] -job(%02d) %s", itask, itask.submit_num, summary)
            itask.set_summary_message(summary)
            self.setup_event_handlers(
                itask, self.EVENT_RETRY, f"{self.JOB_FAILED}, {delay_msg}")
        else:
            # No retry lined up: definitive failure.
            if itask.state.reset(TASK_STATUS_FAILED):
                self.setup_event_handlers(itask, self.EVENT_FAILED, message)
            LOG.critical(
                "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")

        self._reset_job_timers(itask)
        return not retry_lined_up
Esempio n. 4
0
def _set_auto_restart(scheduler,
                      restart_delay=None,
                      mode=AutoRestartMode.RESTART_NORMAL):
    """Schedule the workflow to automatically stop and restart.

    This function only records when and how to stop, by setting
    ``auto_restart_time`` and ``auto_restart_mode`` on the scheduler.
    The restart is handled by `workflow_auto_restart`.

    Args:
        scheduler (cylc.flow.scheduler.Scheduler):
            Scheduler instance of the running workflow.
        restart_delay (cylc.flow.parsec.DurationFloat):
            Upper bound, in seconds, of a random wait applied before the
            stop/restart so that multiple workflows do not all restart
            simultaneously. None or 0 means no wait.
        mode (str): Auto stop-restart mode.

    Return:
        bool: False if it is not possible to automatically stop/restart
        the workflow due to its configuration/runtime state, else True
        (including when the workflow is already stopping or a restart
        is already scheduled).

    Raises:
        RuntimeError: If a restart would be needed while the workflow
            runs in no detach mode.
    """
    # Nothing to arrange if the workflow is already shutting down.
    if scheduler.stop_mode:
        return True

    if mode == AutoRestartMode.FORCE_STOP:
        # Force mode: stop the workflow now and do not restart it.
        LOG.critical(
            'This workflow will be shutdown as the workflow '
            'host is unable to continue running it.\n'
            'When another workflow host becomes available '
            'the workflow can be restarted by:\n'
            f'    $ cylc play {scheduler.workflow}'
        )
        if scheduler.auto_restart_time:
            LOG.info('Scheduled automatic restart canceled')
        scheduler.auto_restart_time = time()
        scheduler.auto_restart_mode = mode
        return True

    # An auto-stop has already been scheduled; leave it alone.
    if scheduler.auto_restart_time is not None:
        return True

    # In no detach mode the workflow cannot be moved to another host.
    # Raising aborts the workflow, which should raise an "abort" event
    # and return a non-zero code to the caller still attached to the
    # workflow process.
    if scheduler.options.no_detach:
        raise RuntimeError('Workflow host condemned in no detach mode')

    # Give up if the workflow cannot be safely restarted.
    if not _can_auto_restart():
        return False

    LOG.info('Workflow will automatically restart on a new host.')
    if restart_delay is None or restart_delay == 0:
        # No delay requested: stop/restart as soon as possible.
        scheduler.auto_restart_time = time()
    else:
        if restart_delay > 0:
            # Random wait within the allowed period, to avoid many
            # workflows restarting simultaneously.
            wait_seconds = int(random() * restart_delay)  # nosec
        else:
            # Un-documented feature: a negative value schedules an exact
            # restart interval, for testing purposes.
            wait_seconds = abs(int(restart_delay))
        restart_at = time() + wait_seconds
        LOG.info('Workflow will restart in %ss (at %s)', wait_seconds,
                 time2str(restart_at))
        scheduler.auto_restart_time = restart_at

    scheduler.auto_restart_mode = AutoRestartMode.RESTART_NORMAL
    return True