def ev_close(self, worker):
    """
    Report the outcome of the finished worker and set the action status.

    The base ``Action.ev_close`` handler runs first, then
    ``self.server.lustre_check()`` is called. The outcome is then mapped as
    follows:

    * worker timed out -> ``'timeout'`` event (no result), ``ACT_ERROR``
    * return code 0    -> ``'done'`` event with a ``Result``, ``ACT_OK``
    * anything else    -> ``'failed'`` event with an ``ErrorResult`` built
      from ``worker.read()``, ``ACT_ERROR``

    :param worker: the worker whose command has just terminated.
    """
    Action.ev_close(self, worker)
    self.server.lustre_check()

    # Timeout: the event carries no result object.
    if worker.did_timeout():
        self.server.action_event(self, 'timeout')
        self.set_status(ACT_ERROR)
        return

    retcode = worker.retcode()

    # Failure: attach the worker output and return code to the event.
    if retcode != 0:
        failure = ErrorResult(worker.read(), self.duration, retcode)
        self.server.action_event(self, 'failed', failure)
        self.set_status(ACT_ERROR)
        return

    # Success.
    success = Result(duration=self.duration, retcode=retcode)
    self.server.action_event(self, 'done', success)
    self.set_status(ACT_OK)
def ev_close(self, worker):
    """
    Translate the fsck exit status into a component event and action status.

    A timeout is delegated entirely to ``FSAction.ev_close``. Otherwise
    ``FSAction.ev_close`` is deliberately skipped: only the upper
    ``Action.ev_close`` layer runs, followed by ``self.comp.lustre_check()``.

    Return codes 0, 1, 2 and 4 are all reported as ``'done'`` / ``ACT_OK``,
    so a run where fsck fixed errors still counts as successful. Codes 1 and
    2 add the message "Errors corrected"; code 4 adds "Errors found but NOT
    corrected". Any other code is reported as ``'failed'`` / ``ACT_ERROR``
    with the collected output lines joined by newlines.

    :param worker: the worker whose command has just terminated.
    """
    if worker.did_timeout():
        return FSAction.ev_close(self, worker)

    # We want to skip FSAction.ev_close(), just call the upper layer.
    Action.ev_close(self, worker)
    self.comp.lustre_check()

    retcode = worker.retcode()

    # fsck returns 0=NOERROR, 1=OK_BUT_CORRECTION, 2=OK_BUT_REBOOT.
    # 4 is also accepted here (errors left uncorrected, -n). See man fsck.
    if retcode not in (0, 1, 2, 4):
        # Action failed: report everything captured in self._output.
        text = "\n".join(self._output)
        failure = ErrorResult(text, self.duration, retcode)
        self.comp.action_event(self, 'failed', failure)
        self.set_status(ACT_ERROR)
        return

    # Action succeeded, possibly with an informational message.
    notes = {
        1: "Errors corrected",
        2: "Errors corrected",
        4: "Errors found but NOT corrected",  # -n
    }
    outcome = Result(duration=self.duration, retcode=retcode)
    note = notes.get(retcode)
    if note is not None:
        outcome.message = note
    self.comp.action_event(self, 'done', outcome)
    self.set_status(ACT_OK)
def ev_close(self, worker):
    """
    Check how the worker terminated and report failures per node group.

    After the base ``Action.ev_close`` handler:

    * on timeout, the timed-out keys are reported through
      ``self.fs._handle_shine_proxy_error`` with "Nodes timed out" and the
      status becomes ``ACT_ERROR``;
    * if the highest return code is 0, the status becomes ``ACT_OK``;
    * otherwise, for every non-zero return code group, each component in
      ``self._comps`` is sanitized and every output buffer of that group is
      reported as "Copy failed: <output>"; the status becomes ``ACT_ERROR``.

    :param worker: the worker whose command has just terminated.
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        timed_out = NodeSet.fromlist(worker.iter_keys_timeout())
        self.fs._handle_shine_proxy_error(timed_out, "Nodes timed out")
        self.set_status(ACT_ERROR)
        return

    # Action succeeded
    highest_rc = max(rc for rc, _ in worker.iter_retcodes())
    if highest_rc == 0:
        self.set_status(ACT_OK)
        return

    # Action failed
    for rc, rc_nodes in worker.iter_retcodes():
        if rc == 0:
            continue

        # Avoid warnings, flag this component in error state
        for comp in self._comps or []:
            comp.sanitize_state(nodes=worker.nodes)

        # Report each distinct output of this failing group.
        for output, keys in worker.iter_buffers(match_keys=rc_nodes):
            failed = NodeSet.fromlist(keys)
            reason = "Copy failed: %s" % output
            self.fs._handle_shine_proxy_error(failed, reason)

    self.set_status(ACT_ERROR)
def ev_close(self, worker):
    """
    End of proxy command: collect remote errors and set the final status.

    We must check that the shine command ran without bugs, node crash,
    etc., so all node return codes are verified and the component state is
    adjusted for the bad nodes.

    A timeout sets ``ACT_ERROR`` and returns immediately. Otherwise the
    status is ``ACT_ERROR`` as soon as one return code is non-zero, else
    ``ACT_OK``. Unpickling errors and silent nodes are reported through
    ``self.fs._handle_shine_proxy_error`` but do not change the status
    computed here.

    :param worker: the worker whose command has just terminated.
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        self.set_status(ACT_ERROR)
        return

    # Remove the 'proxy' running action for each component.
    for comp in self._comps or ():
        # This special event helps to keep track of undergoing actions
        # (see ev_start())
        comp.action_event(self, 'done')
        comp.sanitize_state(nodes=worker.nodes)

    final_status = ACT_OK

    # Gather nodes by return code.
    # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
    # some common remote errors:
    #   rc 127 = command not found
    #   rc 126 = found but not executable
    #   rc 1 = python failure...
    for rc, rc_nodes in worker.iter_retcodes():
        if rc == 0:
            continue

        # If there is at least one error, the action is on error.
        final_status = ACT_ERROR

        # Gather these nodes by buffer; the matcher stays bound to this
        # return-code group.
        in_group = rc_nodes.__contains__
        for buffers, keys in self._outputs.walk(match=in_group):
            # Handle proxy command error
            failed = NodeSet.fromlist(keys)
            text = "Remote action %s failed: %s\n" % (self.action, buffers)
            self.fs._handle_shine_proxy_error(failed, text)

    # Raise errors for each unpickling error,
    # which could happen mostly when Shine exits with 0.
    for buffers, keys in self._errpickle.walk():
        broken = NodeSet.fromlist(keys)
        self.fs._handle_shine_proxy_error(broken, str(buffers))

    # Raise an error for nodes without output
    if len(self._silentnodes) != 0:
        text = "Remote action %s failed: No response" % self.action
        self.fs._handle_shine_proxy_error(self._silentnodes, text)

    self.set_status(final_status)
def ev_close(self, worker):
    """
    Set the action status from the worker's termination state.

    After the base ``Action.ev_close`` handler, the status is ``ACT_OK``
    only when the worker did not time out and its highest return code is 0;
    in every other case it is ``ACT_ERROR``.

    :param worker: the worker whose command has just terminated.
    """
    Action.ev_close(self, worker)

    # Short-circuit: return codes are only inspected when no timeout occurred.
    succeeded = (not worker.did_timeout()
                 and max(rc for rc, _ in worker.iter_retcodes()) == 0)

    self.set_status(ACT_OK if succeeded else ACT_ERROR)
def ev_close(self, worker):
    """
    End of proxy command: fix component states, collect remote errors and
    set the final status.

    We must check that the shine command ran without bugs, node crash,
    etc., so all node return codes are verified and the component state is
    changed on the bad nodes.

    A timeout sets ``ACT_ERROR`` and returns immediately. Otherwise the
    status is ``ACT_ERROR`` as soon as one return code is non-zero, else
    ``ACT_OK``. Unpickling errors and silent nodes are reported through
    ``self.fs._handle_shine_proxy_error`` but do not change the status
    computed here.

    The bad-state diagnostic is written with ``sys.stderr.write`` rather
    than the ``print >>`` statement, so it emits the same text on Python 2
    and does not raise ``TypeError`` on Python 3.

    :param worker: the worker whose command has just terminated.
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        self.set_status(ACT_ERROR)
        return

    status = ACT_OK

    # Remove the 'proxy' running action for each component.
    for comp in self._comps or ():
        # XXX: This should be changed using a real event for proxy.
        comp._del_action('proxy')

        if comp.state is None:
            comp.state = RUNTIME_ERROR

        # At this step, there should be no more INPROGRESS component.
        # If yes, this is a bug, change state to RUNTIME_ERROR.
        # INPROGRESS management could be changed using running action
        # list.
        # Starting with v1.3, there is no more code setting INPROGRESS.
        # This is for compatibility with older clients.
        elif comp.state == INPROGRESS:
            actions = ""
            running = comp._list_action()
            if len(running):
                actions = "actions: " + ", ".join(running)
            # Write the message and the trailing newline explicitly, as
            # the print statement did.
            sys.stderr.write("ERROR: bad state for %s: %d %s\n" %
                             (comp.label, comp.state, actions))
            comp.state = RUNTIME_ERROR

    # Gather nodes by return code.
    # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
    # some common remote errors:
    #   rc 127 = command not found
    #   rc 126 = found but not executable
    #   rc 1 = python failure...
    for rc, rc_nodes in worker.iter_retcodes():
        if rc != 0:
            # If there is at least one error, the action is on error.
            status = ACT_ERROR

            # Gather these nodes by buffer; the matcher stays bound to this
            # return-code group.
            key = rc_nodes.__contains__
            for buffers, keys in self._outputs.walk(match=key):
                # Handle proxy command error
                failed = NodeSet.fromlist(keys)
                msg = "Remote action %s failed: %s\n" % \
                      (self.action, buffers)
                self.fs._handle_shine_proxy_error(failed, msg)

    # Raise errors for each unpickling error,
    # which could happen mostly when Shine exits with 0.
    for buffers, keys in self._errpickle.walk():
        broken = NodeSet.fromlist(keys)
        self.fs._handle_shine_proxy_error(broken, str(buffers))

    # Raise an error for nodes without output
    if len(self._silentnodes) > 0:
        msg = "Remote action %s failed: No response" % self.action
        self.fs._handle_shine_proxy_error(self._silentnodes, msg)

    self.set_status(status)