Esempio n. 1
0
File: Fsck.py Progetto: thiell/shine
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.

        Note that if fsck has correctly fixed some errors, actions will be
        considered as successful.
        """

        if worker.did_timeout():
            return FSAction.ev_close(self, worker)

        # We want to skip FSAction.ev_close(), just call the upper layer.
        Action.ev_close(self, worker)

        self.comp.lustre_check()

        # fsck returns 0=NOERROR, 1=OK_BUT_CORRECTION, 2=OK_BUT_REBOOT.
        # see man fsck.
        if worker.retcode() in (0, 1, 2, 4):
            # action succeeded
            result = Result(duration=self.duration, retcode=worker.retcode())
            if worker.retcode() in (1, 2):
                result.message = "Errors corrected"
            if worker.retcode() == 4:  # -n
                result.message = "Errors found but NOT corrected"
            self.comp.action_event(self, 'done', result)
            self.set_status(ACT_OK)
        else:
            # action failed
            msg = "\n".join(self._output)
            result = ErrorResult(msg, self.duration, worker.retcode())
            self.comp.action_event(self, 'failed', result)
            self.set_status(ACT_ERROR)
Esempio n. 2
0
    def ev_read(self, worker):
        node = worker.current_node
        buf = worker.current_msg
        try:
            data = shine_msg_unpack(buf)

            # COMPAT: Prior to 1.4, 'comp'+'action' was used.
            # 1.4+ uses ActionInfo
            if 'comp' in data:
                action = Action()
                action.NAME = data.pop('action')
                comp = data.pop('comp')
                comp.fs = self.fs
                desc = "%s of %s" % (action.NAME, comp.longtext())
                data['info'] = ActionInfo(action, comp, desc)
                evtype = 'comp'
            else:
                evtype = data.pop('evtype')

            self.fs.distant_event(evtype, node=node, **data)
        except ProxyActionUnpickleError, exp:
            # Maintain a standalone list of unpickling errors.
            # Node could have unpickling error but still exit with 0
            msg = str(exp)
            if msg not in self._errpickle.get(node, ""):
                self._errpickle.add(node, msg)
Esempio n. 3
0
    def ev_close(self, worker):
        """
        Check process termination status and set action status.
        """
        Action.ev_close(self, worker)

        self.server.lustre_check()

        # Action timed out
        if worker.did_timeout():
            self.server.action_event(self, 'timeout')
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif worker.retcode() == 0:
            result = Result(duration=self.duration, retcode=worker.retcode())
            self.server.action_event(self, 'done', result)
            self.set_status(ACT_OK)

        # Action failed
        else:
            result = ErrorResult(worker.read(), self.duration,
                                 worker.retcode())
            self.server.action_event(self, 'failed', result)
            self.set_status(ACT_ERROR)
Esempio n. 4
0
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.
        """
        Action.ev_close(self, worker)

        # Action timed out
        if worker.did_timeout():
            nodes = NodeSet.fromlist(worker.iter_keys_timeout())
            self.fs._handle_shine_proxy_error(nodes, "Nodes timed out")
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
            self.set_status(ACT_OK)

        # Action failed
        else:
            for rc, nodes in worker.iter_retcodes():
                if rc == 0:
                    continue

                # Avoid warnings, flag this component in error state
                for comp in self._comps or []:
                    comp.sanitize_state(nodes=worker.nodes)

                for output, nodes in worker.iter_buffers(match_keys=nodes):
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Copy failed: %s" % output
                    self.fs._handle_shine_proxy_error(nodes, msg)
            self.set_status(ACT_ERROR)
Esempio n. 5
0
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.

        Note that if fsck has correctly fixed some errors, actions will be
        considered as successful.
        """

        if worker.did_timeout():
            return FSAction.ev_close(self, worker)

        # We want to skip FSAction.ev_close(), just call the upper layer.
        Action.ev_close(self, worker)

        self.comp.lustre_check()

        # fsck returns 0=NOERROR, 1=OK_BUT_CORRECTION, 2=OK_BUT_REBOOT.
        # see man fsck.
        if worker.retcode() in (0, 1, 2, 4):
            # action succeeded
            result = Result(duration=self.duration, retcode=worker.retcode())
            if worker.retcode() in (1, 2):
                result.message = "Errors corrected"
            if worker.retcode() == 4: # -n
                result.message = "Errors found but NOT corrected"
            self.comp.action_event(self, 'done', result)
            self.set_status(ACT_OK)
        else:
            # action failed
            msg = "\n".join(self._output)
            result = ErrorResult(msg, self.duration, worker.retcode())
            self.comp.action_event(self, 'failed', result)
            self.set_status(ACT_ERROR)
Esempio n. 6
0
    def ev_read(self, worker):
        node = worker.current_node
        buf = worker.current_msg
        try:
            data = shine_msg_unpack(buf)

            # COMPAT: Prior to 1.4, 'comp'+'action' was used.
            # 1.4+ uses ActionInfo
            if 'comp' in data:
                action = Action()
                action.NAME = data.pop('action')
                comp = data.pop('comp')
                comp.fs = self.fs
                desc = "%s of %s" % (action.NAME, comp.longtext())
                data['info'] = ActionInfo(action, comp, desc)
                evtype = 'comp'
            else:
                evtype = data.pop('evtype')

            self.fs.distant_event(evtype, node=node, **data)
        except ProxyActionUnpickleError as exp:
            # Maintain a standalone list of unpickling errors.
            # Node could have unpickling error but still exit with 0
            msg = str(exp)
            if msg not in self._errpickle.get(node, ""):
                self._errpickle.add(node, msg)
        except AttributeError as exp:
            msg = "Cannot read message (check Shine and ClusterShell " \
                  "version): %s" % str(exp)
            if msg not in self._errpickle.get(node, ""):
                self._errpickle.add(node, msg)
        except ProxyActionUnpackError:
            # Store output that is not a shine message
            self._outputs.add(node, buf)
Esempio n. 7
0
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.
        """
        Action.ev_close(self, worker)

        # Action timed out
        if worker.did_timeout():
            nodes = NodeSet.fromlist(worker.iter_keys_timeout())
            self.fs._handle_shine_proxy_error(nodes, "Nodes timed out")
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
            self.set_status(ACT_OK)

        # Action failed
        else:
            for rc, nodes in worker.iter_retcodes():
                if rc == 0:
                    continue

                # Avoid warnings, flag this component in error state
                for comp in self._comps or []:
                    comp.sanitize_state(nodes=worker.nodes)

                for output, nodes in worker.iter_buffers(match_keys=nodes):
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Copy failed: %s" % output
                    self.fs._handle_shine_proxy_error(nodes, msg)
            self.set_status(ACT_ERROR)
Esempio n. 8
0
 def ev_start(self, worker):
     Action.ev_start(self, worker)
     name = os.path.basename(self.config_file)
     if len(self.nodes) > 8:
         print "Updating configuration file `%s' on %d server(s)" % \
                                                     (name, len(self.nodes))
     else:
         print "Updating configuration file `%s' on %s" % (name, self.nodes)
Esempio n. 9
0
    def ev_close(self, worker):
        """End of proxy command."""
        Action.ev_close(self, worker)

        # Before all, we must check if shine command ran without bugs, node
        # crash, etc...
        # So we need to verify all node retcodes and change the component state
        # on the bad nodes.

        # Action timed out
        if worker.did_timeout():
            self.set_status(ACT_ERROR)
            return

        status = ACT_OK

        # Remove the 'proxy' running action for each component.
        if self._comps:
            for comp in self._comps:
                # This special event helps to keep track of undergoing actions
                # (see ev_start())
                comp.action_event(self, 'done')
                comp.sanitize_state(nodes=worker.nodes)

        # Gather nodes by return code
        for rc, nodes in worker.iter_retcodes():
            # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
            # some common remote errors:
            # rc 127 = command not found
            # rc 126 = found but not executable
            # rc 1 = python failure...
            if rc != 0:

                # If there is at least one error, the action is on error.
                status = ACT_ERROR

                # Gather these nodes by buffer
                key = nodes.__contains__
                for buffers, nodes in self._outputs.walk(match=key):
                    # Handle proxy command error
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Remote action %s failed: %s\n" % \
                                                        (self.action, buffers)
                    self.fs._handle_shine_proxy_error(nodes, msg)

        # Raise errors for each unpickling error,
        # which could happen mostly when Shine exits with 0.
        for buffers, nodes in self._errpickle.walk():
            nodes = NodeSet.fromlist(nodes)
            self.fs._handle_shine_proxy_error(nodes, str(buffers))

        # Raise an error for nodes without output
        if len(self._silentnodes) > 0:
            msg = "Remote action %s failed: No response" % self.action
            self.fs._handle_shine_proxy_error(self._silentnodes, msg)

        self.set_status(status)
Esempio n. 10
0
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.
        """
        Action.ev_close(self, worker)

        # Action timed out
        if worker.did_timeout():
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
            self.set_status(ACT_OK)

        # Action failed
        else:
            self.set_status(ACT_ERROR)
Esempio n. 11
0
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.
        """
        Action.ev_close(self, worker)

        # Action timed out
        if worker.did_timeout():
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
            self.set_status(ACT_OK)

        # Action failed
        else:
            self.set_status(ACT_ERROR)
Esempio n. 12
0
    def ev_close(self, worker):
        """
        Check process termination status and set action status.
        """
        Action.ev_close(self, worker)

        self.server.lustre_check()

        # Action timed out
        if worker.did_timeout():
            self.server.action_event(self, 'timeout')
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif worker.retcode() == 0:
            result = Result(duration=self.duration, retcode=worker.retcode())
            self.server.action_event(self, 'done', result)
            self.set_status(ACT_OK)

        # Action failed
        else:
            result = ErrorResult(worker.read(), self.duration, worker.retcode())
            self.server.action_event(self, 'failed', result)
            self.set_status(ACT_ERROR)
Esempio n. 13
0
 def __init__(self, nodes, fs, config_file):
     Action.__init__(self)
     self.nodes = nodes
     self.fs = fs
     self.config_file = config_file
Esempio n. 14
0
    def ev_close(self, worker):
        """End of proxy command."""
        Action.ev_close(self, worker)

        # Before all, we must check if shine command ran without bugs, node
        # crash, etc...
        # So we need to verify all node retcodes and change the component state
        # on the bad nodes.

        # Action timed out
        if worker.did_timeout():
            self.set_status(ACT_ERROR)
            return

        status = ACT_OK

        # Remove the 'proxy' running action for each component.
        if self._comps:
            for comp in self._comps:
                # XXX: This should be changed using a real event for proxy.
                comp._del_action('proxy')

                if comp.state is None:
                    comp.state = RUNTIME_ERROR

                # At this step, there should be no more INPROGRESS component.
                # If yes, this is a bug, change state to RUNTIME_ERROR.
                # INPROGRESS management could be change using running action
                # list.
                # Starting with v1.3, there is no more code setting INPROGRESS.
                # This is for compatibility with older clients.
                elif comp.state == INPROGRESS:
                    actions = ""
                    if len(comp._list_action()):
                        actions = "actions: " + ", ".join(comp._list_action())
                    print >> sys.stderr, "ERROR: bad state for %s: %d %s" % \
                                    (comp.label, comp.state, actions)
                    comp.state = RUNTIME_ERROR

        # Gather nodes by return code
        for rc, nodes in worker.iter_retcodes():
            # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
            # some common remote errors:
            # rc 127 = command not found
            # rc 126 = found but not executable
            # rc 1 = python failure...
            if rc != 0:

                # If there is at least one error, the action is on error.
                status = ACT_ERROR

                # Gather these nodes by buffer
                key = nodes.__contains__
                for buffers, nodes in self._outputs.walk(match=key):
                    # Handle proxy command error
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Remote action %s failed: %s\n" % \
                                                        (self.action, buffers)
                    self.fs._handle_shine_proxy_error(nodes, msg)

        # Raise errors for each unpickling error,
        # which could happen mostly when Shine exits with 0.
        for buffers, nodes in self._errpickle.walk():
            nodes = NodeSet.fromlist(nodes)
            self.fs._handle_shine_proxy_error(nodes, str(buffers))

        # Raise an error for nodes without output
        if len(self._silentnodes) > 0:
            msg = "Remote action %s failed: No response" % self.action
            self.fs._handle_shine_proxy_error(self._silentnodes, msg)

        self.set_status(status)