def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Callback for a Motr process event: log it and enqueue it for
    asynchronous handling by the consumer of ``self.queue``.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    proc = ConfHaProcess(chp_event=chp_event,
                         chp_type=chp_type,
                         chp_pid=chp_pid,
                         fid=fid)
    self.queue.put(ProcessEvent(proc))
def service_health_to_m0dstatus_update(self, proc_fid: Fid,
                                       svc_health: ServiceHealth):
    """Translate *svc_health* into an m0d process event for *proc_fid*
    and persist it via ``update_process_status``.
    """
    m0_event = self.svcHealthToM0Status(svc_health)
    ev = ConfHaProcess(chp_event=m0_event,
                       chp_type=int(m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                       chp_pid=0,
                       fid=proc_fid)
    self.update_process_status(ev)
def update_process_failure(self, ha_states: List[HAState]) -> None:
    """Record a STOPPED process event for every FAILED HA state.

    Non-failed states are ignored.
    """
    failed_states = (s for s in ha_states
                     if s.status == ServiceHealth.FAILED)
    for state in failed_states:
        # chp_type=3: magic value kept as-is — presumably
        # M0_CONF_HA_PROCESS_M0D; TODO confirm against the enum.
        pevent = ConfHaProcess(
            chp_event=m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
            chp_type=3,
            chp_pid=0,
            fid=state.fid)
        self._update_process_status(pevent)
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    """Re-validate process HA states against Consul before acting on them.

    For each PROCESS state: confirm its current status via Consul,
    drop duplicate notifications (status already recorded locally),
    persist the confirmed status in Consul KV, and return the list of
    states that should actually be broadcast.  Non-process states are
    passed through unchanged.  NOTE(review): the *planner* parameter is
    not used in this variant.
    """
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.OK:
                # Already recorded as STARTED locally: nothing new to do.
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                    continue
            if current_status in (ServiceHealth.FAILED,
                                  ServiceHealth.STOPPED):
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                    # Consul may report failure of a process multiple
                    # times, so we don't want to send duplicate failure
                    # notifications, it may cause delay in cleanup
                    # activities.
                    continue
            # XXX:
            # Sometime, there can be situation where Consul event is sent
            # and can be delayed, where state reported by Consul for a
            # given process can be in its past already, e.g. consul
            # reported process failure but when hax received the event,
            # process might have already restarted. In this case the event
            # still needs to be handled. Also, it is possible that Consul
            # reported failure but process status is not yet updated in
            # Consul services catalog, in such a case the reported status
            # can be true and cannot be just dropped. These scenarios must
            # be re-visited.
            if current_status not in (ServiceHealth.UNKNOWN,
                                      ServiceHealth.OFFLINE):
                # We also need to account and report the failure of remote
                # Motr processes to this node's hax and motr processes.
                # When Consul reports a remote process failure, hax
                # confirms its current status from Consul KV and updates
                # the list of failed services and also adds it to the
                # broadcast list.
                if current_status != ServiceHealth.OK:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                else:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                self.consul.update_process_status(
                    ConfHaProcess(
                        chp_event=event,
                        chp_type=int(
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                        chp_pid=0,
                        fid=state.fid))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Callback for a Motr process event: enqueue it for asynchronous
    handling, and on process stop additionally enqueue a failure
    broadcast for the process.
    """
    logging.info('fid=%s, chp_event=%s', fid, chp_event)
    self.queue.put(
        ProcessEvent(
            ConfHaProcess(chp_event=chp_event,
                          chp_type=chp_type,
                          chp_pid=chp_pid,
                          fid=fid)))
    # 3 presumably == M0_CONF_HA_PROCESS_STOPPED (matches the explicit
    # enum comparison in the sibling variant of this callback).
    if chp_event == 3:
        # Fix: pass a ServiceHealth member instead of the raw string
        # 'offline' — HAState.status is a ServiceHealth everywhere else
        # in this file, and broadcast consumers compare against the enum.
        self.queue.put(
            BroadcastHAStates(
                states=[HAState(fid=fid, status=ServiceHealth.FAILED)],
                reply_to=None))
def service_health_to_m0dstatus_update(self, proc_fid: Fid,
                                       svc_health: ServiceHealth):
    """Translate *svc_health* into an m0d process event for *proc_fid*
    and persist it via ``update_process_status``.

    OK maps to STARTED; FAILED and UNKNOWN map to STOPPED.
    NOTE(review): other ServiceHealth members (e.g. OFFLINE) would raise
    KeyError here — presumably callers never pass them; verify.
    """
    svc_health_to_m0status: dict = {
        ServiceHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
        ServiceHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ServiceHealth.UNKNOWN: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
    }
    # Fix: wrap chp_type in int() for consistency with every other
    # ConfHaProcess construction in this file, which all pass
    # int(m0HaProcessType...).
    ev = ConfHaProcess(chp_event=svc_health_to_m0status[svc_health],
                       chp_type=int(m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                       chp_pid=0,
                       fid=proc_fid)
    self.update_process_status(ev)
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Callback for a Motr process event: enqueue it, and on process
    stop also enqueue a FAILED-state broadcast for the process.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    proc = ConfHaProcess(chp_event=chp_event,
                         chp_type=chp_type,
                         chp_pid=chp_pid,
                         fid=fid)
    self.queue.put(ProcessEvent(proc))
    # 3 presumably == M0_CONF_HA_PROCESS_STOPPED — TODO confirm.
    if chp_event != 3:
        return
    failure = HAState(fid=fid, status=ServiceHealth.FAILED)
    self.queue.put(BroadcastHAStates(states=[failure], reply_to=None))
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Callback for a Motr process event: hand it to the work planner,
    and when the process has stopped, tell the HA layer to drop the
    link to the process endpoint.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    proc = ConfHaProcess(chp_event=chp_event,
                         chp_type=chp_type,
                         chp_pid=chp_pid,
                         fid=fid)
    self.planner.add_command(ProcessEvent(proc))
    if chp_event != m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED:
        return
    proc_ep = self.consul_util.fid_to_endpoint(fid)
    # Endpoint may be unknown (e.g. process not in Consul catalog).
    if proc_ep:
        self._ffi.hax_link_stopped(self._ha_ctx, make_c_str(proc_ep))
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Callback for a Motr process event: enqueue it, and when an m0d
    process reports STARTED, also enqueue an OK-state broadcast.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    proc = ConfHaProcess(chp_event=chp_event,
                         chp_type=chp_type,
                         chp_pid=chp_pid,
                         fid=fid)
    self.queue.put(ProcessEvent(proc))
    m0d_started = (
        chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D
        and chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED)
    if m0d_started:
        ok_state = HAState(fid=fid, status=ServiceHealth.OK)
        self.queue.put(BroadcastHAStates(states=[ok_state], reply_to=None))
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    """Re-validate process HA states against Consul and persist them.

    For each PROCESS state, confirms the current status from Consul,
    skips mkfs processes and UNKNOWN statuses, updates the process
    status in Consul KV (persistent copy by RC/local node, local copy
    for remote processes) and collects the states that must actually be
    broadcast.  Non-process states pass through unchanged.

    :raises HAConsistencyException: if any Consul interaction fails.
    NOTE(review): *planner* is not used in this variant.
    """
    new_ha_states: List[HAState] = []
    # Map of confirmed process health -> m0 process event to record.
    proc_Health_to_status = {
        ObjHealth.OFFLINE: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ObjHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ObjHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
    }
    try:
        for state in ha_states:
            if state.fid.container == ObjT.PROCESS.value:
                current_status = self.consul.get_process_current_status(
                    state.status, state.fid)
                if current_status == ObjHealth.UNKNOWN:
                    # Cannot confirm either way; drop the event.
                    continue
                proc_status_remote = self.consul.get_process_status(
                    state.fid)
                proc_status: Any = None
                # MKFS states are updated by the node corresponding to a
                # given process. So we ignore notifications for mkfs
                # processes.
                if proc_status_remote.proc_type in (
                        'Unknown',
                        m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name):
                    continue
                proc_type = m0HaProcessType.str_to_Enum(
                    proc_status_remote.proc_type)
                # Following cases are handled here,
                # 1. Delayed consul service failure notification:
                #    - We re-confirm the current process state before
                #      notifying the process as offline/failed.
                # 2. Consul reported process failure, current process
                #    state is offline (this means the corresponding node
                #    is online, i.e. hax and consul are online):
                #    - So process's status in consul kv might not be
                #      updated as the process died abruptly. In this case
                #      we handle it as local process failure, update the
                #      process status in consul kv and notify motr.
                # 3. Consul reported process failure, current process
                #    state is failed (this means the node corresponding to
                #    the process also failed, i.e. hax and consul are no
                #    more):
                #    - Process's status in consul kv might not be updated
                #      as the node went down abruptly. In this case, when
                #      consul reports failure for corresponding node
                #      processes, Hare verifies the node status and
                #      accordingly Hare RC node processes the failures.
                #      This may take some time if Consul server loose
                #      the quorum and take time sync up the state.
                # 4. Consul reported process failure, probably due to mkfs
                #    process completion (m0tr mkfs and m0ds share the same
                #    fid). which got delayed and process has starting now:
                #    - Hare checks the current status of the process but
                #      it is possible that the process state is not synced
                #      up yet within the quorum. In this case, we continue
                #      processing the failure event but once the process
                #      starts successfully Hare will update and notify the
                #      process state eventually.
                # 5. For some reason Consul may report a process as
                #    offline and subsequently report it as online, this
                #    may happen due to intermittent monitor failure:
                #    - Hare must handle the change in process states
                #      accordingly in-order to maintain the eventual
                #      consistency of the cluster state.
                proc_status = proc_Health_to_status.get(current_status)
                LOG.debug('current_status: %s proc_status_remote: %s',
                          current_status, proc_status_remote.proc_status)
                if proc_status is not None:
                    LOG.debug('proc_status: %s', proc_status.name)
                    if proc_status_remote.proc_status != proc_status.name:
                        if (self.consul.am_i_rc() or
                                self.consul.is_proc_local(state.fid)):
                            # Probably process node failed, in such a
                            # case, only RC must be allowed to update
                            # the process's persistent state.
                            # Or, if the node's alive then allow the node
                            # to update the local process's state.
                            self.consul.update_process_status(
                                ConfHaProcess(chp_event=proc_status,
                                              chp_type=proc_type,
                                              chp_pid=0,
                                              fid=state.fid))
                        # RC or not RC, i.e. even without persistent state
                        # update, it is important that the notification to
                        # local motr processes must still be sent.
                        new_ha_states.append(
                            HAState(fid=state.fid, status=current_status))
                    if not self.consul.is_proc_local(state.fid):
                        proc_status_local = (
                            self.consul.get_process_status_local(
                                state.fid))
                        # Consul monitors a process every 1 second and
                        # this notification is sent to every node. Thus
                        # to avoid notifying about a process multiple
                        # times about the same status every node
                        # maintains a local copy of the remote process
                        # status, which is checked everytime a consul
                        # notification is received and accordingly
                        # the status is notified locally to all the local
                        # motr processes.
                        if (proc_status_local.proc_status !=
                                proc_status.name):
                            self.consul.update_process_status_local(
                                ConfHaProcess(chp_event=proc_status,
                                              chp_type=proc_type,
                                              chp_pid=0,
                                              fid=state.fid))
                            new_ha_states.append(
                                HAState(fid=state.fid,
                                        status=current_status))
                else:
                    continue
            else:
                new_ha_states.append(state)
    except Exception as e:
        raise HAConsistencyException('failed to process ha states') from e
    return new_ha_states
def process_event():
    """Build a minimal ProcessEvent with all fields zeroed (fixture)."""
    payload = ConfHaProcess(chp_event=0,
                            chp_type=0,
                            chp_pid=0,
                            fid=Fid(0, 0))
    return ProcessEvent(payload)
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    """Re-validate process HA states against Consul before broadcasting.

    For each PROCESS state: confirm the current status from Consul, drop
    duplicate notifications (status already recorded locally), re-enqueue
    a broadcast when the status cannot yet be confirmed (UNKNOWN), persist
    the confirmed status in Consul KV, and return the list of states that
    should actually be broadcast.  Non-process states pass through
    unchanged.
    """
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.OK:
                # Already recorded as STARTED locally: nothing new to do.
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                    continue
            if current_status in (ServiceHealth.FAILED,
                                  ServiceHealth.STOPPED):
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                    # Consul may report failure of a process multiple
                    # times, so we don't want to send duplicate failure
                    # notifications, it may cause delay in cleanup
                    # activities.
                    continue
            if current_status == ServiceHealth.UNKNOWN:
                # We got service status as UNKNOWN, that means hax was
                # notified about process failure but hax couldn't
                # confirm if the process is in failed state or have
                # failed and restarted. So, we will not loose the
                # event and try again to confirm the real time
                # process status by enqueing a broadcast event
                # specific to this process.
                # It is expected that the process status gets
                # eventually confirmed as either failed or passing (OK).
                # This situation typically arises due to delay
                # in receiving failure notification during which the
                # corresponding process might be restarting or have
                # already restarted. Thus it is important to confirm
                # the real time status of the process before
                # broadcasting failure.
                # (Fix: dropped the redundant self-assignment
                # `current_status = ServiceHealth.UNKNOWN` that the
                # original performed inside this branch.)
                planner.add_command(
                    BroadcastHAStates(states=[
                        HAState(fid=state.fid,
                                status=ServiceHealth.FAILED)
                    ],
                                      reply_to=None))
            if current_status not in (ServiceHealth.UNKNOWN,
                                      ServiceHealth.OFFLINE):
                # We also need to account and report the failure of remote
                # Motr processes to this node's hax and motr processes.
                # When Consul reports a remote process failure, hax
                # confirms its current status from Consul KV and updates
                # the list of failed services and also adds it to the
                # broadcast list.
                if current_status != ServiceHealth.OK:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                else:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                self.consul.update_process_status(
                    ConfHaProcess(
                        chp_event=event,
                        chp_type=int(
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                        chp_pid=0,
                        fid=state.fid))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states