Beispiel #1
0
    def _check_and_cancel(self, reason_descr):
        """Check if blocking reason ``reason_descr`` is unrecoverable and cancel the
        job in this case."""

        # The reason description may have two parts as follows:
        # "ReqNodeNotAvail, UnavailableNodes:nid00[408,411-415]"
        try:
            reason, reason_details = reason_descr.split(',', maxsplit=1)
        except ValueError:
            # no reason details
            reason, reason_details = reason_descr, None

        if reason in self._cancel_reasons:
            self.cancel()
            reason_msg = ('job cancelled because it was blocked due to '
                          'a perhaps non-recoverable reason: ' + reason)
            if reason_details is not None:
                reason_msg += ', ' + reason_details

            raise JobBlockedError(reason_msg, jobid=self._jobid)
Beispiel #2
0
    def _do_cancel_if_blocked(self, job, reason_descr):
        '''Check if blocking reason ``reason_descr`` is unrecoverable and
        cancel the job in this case.'''

        # The reason description may have two parts as follows:
        # "ReqNodeNotAvail, UnavailableNodes:nid00[408,411-415]"
        try:
            reason, reason_details = reason_descr.split(',', maxsplit=1)
        except ValueError:
            # no reason details
            reason, reason_details = reason_descr, None

        if reason in self._cancel_reasons:
            if reason == 'ReqNodeNotAvail' and reason_details:
                self.log('Job blocked due to ReqNodeNotAvail')
                node_match = re.match(r'UnavailableNodes:(?P<node_names>\S+)?',
                                      reason_details.strip())
                if node_match:
                    node_names = node_match['node_names']
                    if node_names:
                        # Retrieve the info of the unavailable nodes
                        # and check if they are indeed down
                        self.log(f'Checking if nodes {node_names!r} '
                                 f'are indeed unavailable')
                        nodes = self._get_nodes_by_name(node_names)
                        if not any(n.is_down() for n in nodes):
                            return
                    else:
                        # List of unavailable nodes is empty; assume job
                        # is pending
                        return

            self.cancel(job)
            reason_msg = ('job cancelled because it was blocked due to '
                          'a perhaps non-recoverable reason: ' + reason)
            if reason_details is not None:
                reason_msg += ', ' + reason_details

            job._exception = JobBlockedError(reason_msg, job.jobid)