Ejemplo n.º 1
0
 def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
     """Log an incoming process event and enqueue it on ``self.queue``.

     The callback arguments are packed into a ``ConfHaProcess`` record,
     which is wrapped in a ``ProcessEvent`` before being queued.
     """
     LOG.info('fid=%s, chp_event=%s', fid, chp_event)
     ha_process = ConfHaProcess(chp_event=chp_event,
                                chp_type=chp_type,
                                chp_pid=chp_pid,
                                fid=fid)
     self.queue.put(ProcessEvent(ha_process))
Ejemplo n.º 2
0
 def service_health_to_m0dstatus_update(self, proc_fid: Fid,
                                        svc_health: ServiceHealth):
     """Translate a service health value and store it as a process status.

     The health is converted with ``self.svcHealthToM0Status`` and passed,
     together with the M0D process type and a pid of 0, to
     ``self.update_process_status``.
     """
     m0_event = self.svcHealthToM0Status(svc_health)
     m0d_type = int(m0HaProcessType.M0_CONF_HA_PROCESS_M0D)
     self.update_process_status(
         ConfHaProcess(chp_event=m0_event,
                       chp_type=m0d_type,
                       chp_pid=0,
                       fid=proc_fid))
Ejemplo n.º 3
0
 def update_process_failure(self, ha_states: List[HAState]) -> None:
     """Record a STOPPED process event for every FAILED state.

     States whose status is not ``ServiceHealth.FAILED`` are ignored.
     """
     failed_states = (entry for entry in ha_states
                      if entry.status == ServiceHealth.FAILED)
     for failed in failed_states:
         stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
         # NOTE(review): 3 is a raw process-type code; presumably it matches
         # one of the m0HaProcessType members -- confirm and use the enum.
         self._update_process_status(
             ConfHaProcess(chp_event=stopped,
                           chp_type=3,
                           chp_pid=0,
                           fid=failed.fid))
Ejemplo n.º 4
0
 def update_process_failure(self, planner: WorkPlanner,
                            ha_states: List[HAState]) -> List[HAState]:
     """Re-confirm reported process states and record them via Consul.

     Non-process states are passed through untouched. For a process state
     the current status is fetched from Consul; a state that merely repeats
     what is already stored locally (OK while STARTED, FAILED/STOPPED while
     STOPPED) is dropped. Any other status except UNKNOWN/OFFLINE is written
     back through ``self.consul.update_process_status``. The returned list
     carries the re-confirmed status for each retained process.

     ``planner`` is not used by this implementation.
     """
     confirmed: List[HAState] = []
     for ha_state in ha_states:
         # Only process statuses need re-confirmation.
         if state_is_foreign := (
                 ha_state.fid.container != ObjT.PROCESS.value):
             confirmed.append(ha_state)
             continue

         health = self.consul.get_process_current_status(
             ha_state.status, ha_state.fid)

         # Pick the locally stored status that would make this report a
         # duplicate. Consul may report the same failure several times and
         # duplicate notifications may delay cleanup activities.
         if health == ServiceHealth.OK:
             duplicate_marker = 'M0_CONF_HA_PROCESS_STARTED'
         elif health in (ServiceHealth.FAILED, ServiceHealth.STOPPED):
             duplicate_marker = 'M0_CONF_HA_PROCESS_STOPPED'
         else:
             duplicate_marker = None
         if duplicate_marker is not None:
             stored = self.consul.get_process_local_status(ha_state.fid)
             if stored == duplicate_marker:
                 continue

         # XXX:
         # A Consul event can be delayed, so the state it reports may
         # already be in the past (e.g. failure reported, but the process
         # has restarted by the time hax receives the event). The event
         # still needs to be handled. It is also possible that Consul
         # reported a failure before the services catalog was updated; the
         # reported status can then be true and must not be dropped. These
         # scenarios must be re-visited.
         reportable = health not in (ServiceHealth.UNKNOWN,
                                     ServiceHealth.OFFLINE)
         if reportable:
             # Failures of remote Motr processes are also recorded here so
             # that this node's hax and motr processes learn about them.
             event = (m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                      if health != ServiceHealth.OK
                      else m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED)
             m0d_type = int(m0HaProcessType.M0_CONF_HA_PROCESS_M0D)
             self.consul.update_process_status(
                 ConfHaProcess(chp_event=event,
                               chp_type=m0d_type,
                               chp_pid=0,
                               fid=ha_state.fid))
         confirmed.append(HAState(fid=ha_state.fid, status=health))
     return confirmed
Ejemplo n.º 5
0
 def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
     """Log a process event, enqueue it, and broadcast 'offline' on code 3.

     The event is always queued as a ``ProcessEvent``. When ``chp_event``
     equals 3, a ``BroadcastHAStates`` request marking ``fid`` as
     'offline' is queued right after it.
     """
     logging.info('fid=%s, chp_event=%s', fid, chp_event)
     ha_process = ConfHaProcess(chp_event=chp_event,
                                chp_type=chp_type,
                                chp_pid=chp_pid,
                                fid=fid)
     self.queue.put(ProcessEvent(ha_process))
     # NOTE(review): 3 is a raw event code; presumably the "stopped" event
     # -- confirm against the process event enum.
     if chp_event == 3:
         offline = HAState(fid=fid, status='offline')
         self.queue.put(BroadcastHAStates(states=[offline], reply_to=None))
Ejemplo n.º 6
0
 def service_health_to_m0dstatus_update(self, proc_fid: Fid,
                                        svc_health: ServiceHealth):
     """Map a service health value to a process event and store it.

     OK maps to STARTED; FAILED and UNKNOWN map to STOPPED. Any other
     health value is not in the table and raises ``KeyError``. The
     resulting event is passed to ``self.update_process_status`` with the
     M0D process type and a pid of 0.
     """
     started = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
     stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
     event_by_health: dict = {
         ServiceHealth.OK: started,
         ServiceHealth.FAILED: stopped,
         ServiceHealth.UNKNOWN: stopped,
     }
     m0_event = event_by_health[svc_health]
     self.update_process_status(
         ConfHaProcess(chp_event=m0_event,
                       chp_type=m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
                       chp_pid=0,
                       fid=proc_fid))
Ejemplo n.º 7
0
 def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
     """Log a process event, enqueue it, and broadcast FAILED on code 3.

     The event is always queued as a ``ProcessEvent``. When ``chp_event``
     equals 3, a ``BroadcastHAStates`` request marking ``fid`` as
     ``ServiceHealth.FAILED`` is queued right after it.
     """
     LOG.info('fid=%s, chp_event=%s', fid, chp_event)
     ha_process = ConfHaProcess(chp_event=chp_event,
                                chp_type=chp_type,
                                chp_pid=chp_pid,
                                fid=fid)
     self.queue.put(ProcessEvent(ha_process))
     # NOTE(review): 3 is a raw event code; presumably the "stopped" event
     # -- confirm against the process event enum.
     if chp_event == 3:
         failed = HAState(fid=fid, status=ServiceHealth.FAILED)
         self.queue.put(BroadcastHAStates(states=[failed], reply_to=None))
Ejemplo n.º 8
0
    def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
        """Log a process event, hand it to the planner, and on STOPPED
        report the stopped link through the FFI layer.

        The endpoint for ``fid`` is looked up via ``self.consul_util``;
        nothing is reported when the lookup yields a falsy value.
        """
        LOG.info('fid=%s, chp_event=%s', fid, chp_event)
        ha_process = ConfHaProcess(chp_event=chp_event,
                                   chp_type=chp_type,
                                   chp_pid=chp_pid,
                                   fid=fid)
        self.planner.add_command(ProcessEvent(ha_process))

        if chp_event != m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED:
            return
        endpoint = self.consul_util.fid_to_endpoint(fid)
        if not endpoint:
            return
        self._ffi.hax_link_stopped(self._ha_ctx, make_c_str(endpoint))
Ejemplo n.º 9
0
    def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
        """Log a process event, enqueue it, and broadcast OK for a started
        M0D process.

        The event is always queued as a ``ProcessEvent``. Only when the
        process type is M0D and the event is STARTED is a
        ``BroadcastHAStates`` request with ``ServiceHealth.OK`` queued too.
        """
        LOG.info('fid=%s, chp_event=%s', fid, chp_event)
        ha_process = ConfHaProcess(chp_event=chp_event,
                                   chp_type=chp_type,
                                   chp_pid=chp_pid,
                                   fid=fid)
        self.queue.put(ProcessEvent(ha_process))

        if (chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D
                and chp_event
                == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED):
            online = HAState(fid=fid, status=ServiceHealth.OK)
            self.queue.put(
                BroadcastHAStates(states=[online], reply_to=None))
Ejemplo n.º 10
0
 def update_process_failure(self, planner: WorkPlanner,
                            ha_states: List[HAState]) -> List[HAState]:
     """Re-confirm reported process states and decide which to forward.

     Non-process states are copied to the result unchanged. For every
     process state the current health is re-read through
     ``self.consul.get_process_current_status``; states whose health is
     UNKNOWN and states belonging to processes of type 'Unknown' or
     M0MKFS are skipped. For the remaining ones the persistent status is
     updated when this node is RC or the process is local, and a
     per-node local copy is updated for remote processes. A state is
     added to the result whenever either stored status differed from the
     newly confirmed one.

     ``planner`` is not used by this implementation.

     :param planner: unused here.
     :param ha_states: states to examine.
     :return: the states selected for further handling, with the
         re-confirmed health as their status.
     :raises HAConsistencyException: if anything raises while the states
         are being processed; the original exception is chained.
     """
     new_ha_states: List[HAState] = []
     # Health values that translate into a Motr process event. Any health
     # missing from this table (looked up with .get() below) yields None
     # and the state is then neither updated nor forwarded.
     proc_Health_to_status = {
         ObjHealth.OFFLINE: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
         ObjHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
         ObjHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
     }
     try:
         for state in ha_states:
             if state.fid.container == ObjT.PROCESS.value:
                 current_status = self.consul.get_process_current_status(
                     state.status, state.fid)
                 # Nothing can be decided for an unconfirmed status.
                 if current_status == ObjHealth.UNKNOWN:
                     continue
                 proc_status_remote = self.consul.get_process_status(
                                          state.fid)
                 proc_status: Any = None
                 # MKFS states are updated by the node corresponding to a
                 # given process. So we ignore notifications for mkfs
                 # processes.
                 if proc_status_remote.proc_type in (
                         'Unknown',
                         m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name):
                     continue
                 proc_type = m0HaProcessType.str_to_Enum(
                      proc_status_remote.proc_type)
                 # Following cases are handled here,
                 # 1. Delayed consul service failure notification:
                 # -  We re-confirm the current process state before
                 #     notifying the process as offline/failed.
                 # 2. Consul reported process failure, current process
                 #    state is offline (this means the corresponding node
                 #    is online, i.e. hax and consul are online):
                 # -  So process's status in consul kv might not be updated
                 #    as the process died abruptly. In this case we handle
                 #    it as local process failure, update the process
                 #    status in consul kv and notify motr.
                 # 3. Consul reported process failure, current process
                 #    state is failed (this means the node corresponding to
                 #    the process also failed, i.e. hax and consul are no
                 #    more):
                 # -  Process's status in consul kv might not be updated as
                 #    the node went down abruptly. In this case, when
                 #    consul reports failure for corresponding node
                 #    processes, Hare verifies the node status and
                 #    accordingly Hare RC node processes the failures.
                 #    This may take some time if the Consul servers lose
                 #    the quorum and take time to sync up the state.
                 # 4. Consul reported process failure, probably due to mkfs
                 #    process completion (motr mkfs and m0ds share the same
                 #    fid), which got delayed and the process is starting
                 #    now:
                 # -  Hare checks the current status of the process but it
                 #    is possible that the process state is not synced up
                 #    yet within the quorum. In this case, we continue
                 #    processing the failure event but once the process
                 #    starts successfully Hare will update and notify the
                 #    process state eventually.
                 # 5. For some reason Consul may report a process as
                 #    offline and subsequently report it as online, this
                 #    may happen due to intermittent monitor failure:
                 # -  Hare must handle the change in process states
                 #    accordingly in order to maintain the eventual
                 #    consistency of the cluster state.
                 proc_status = proc_Health_to_status.get(current_status)
                 LOG.debug('current_status: %s proc_status_remote: %s',
                           current_status, proc_status_remote.proc_status)
                 if proc_status is not None:
                     LOG.debug('proc_status: %s', proc_status.name)
                     # The persistent status differs from the confirmed one.
                     if proc_status_remote.proc_status != proc_status.name:
                         if (self.consul.am_i_rc() or
                                 self.consul.is_proc_local(state.fid)):
                             # Probably process node failed, in such a
                             # case, only RC must be allowed to update
                             # the process's persistent state.
                             # Or, if the node's alive then allow the node
                             # to update the local process's state.
                             self.consul.update_process_status(
                                 ConfHaProcess(chp_event=proc_status,
                                               chp_type=proc_type,
                                               chp_pid=0,
                                               fid=state.fid))
                         # RC or not RC, i.e. even without persistent state
                         # update, it is important that the notification to
                         # local motr processes must still be sent.
                         new_ha_states.append(
                             HAState(fid=state.fid, status=current_status))
                     if not self.consul.is_proc_local(state.fid):
                         proc_status_local = (
                             self.consul.get_process_status_local(
                                 state.fid))
                         # Consul monitors a process every 1 second and
                         # this notification is sent to every node. Thus,
                         # to avoid notifying about the same status of a
                         # process multiple times, every node maintains a
                         # local copy of the remote process status, which
                         # is checked every time a consul notification is
                         # received and accordingly the status is notified
                         # locally to all the local motr processes.
                         if (proc_status_local.proc_status !=
                                 proc_status.name):
                             self.consul.update_process_status_local(
                                 ConfHaProcess(chp_event=proc_status,
                                               chp_type=proc_type,
                                               chp_pid=0,
                                               fid=state.fid))
                             # NOTE(review): for a remote process whose
                             # persistent AND local statuses both differ,
                             # this is the second append for the same fid
                             # in one pass -- confirm the duplicate entry
                             # is intended.
                             new_ha_states.append(
                                 HAState(fid=state.fid,
                                         status=current_status))
                     else:
                         # NOTE(review): last statement of the loop body,
                         # so this `continue` has no effect.
                         continue
             else:
                 # Not a process: forward as received.
                 new_ha_states.append(state)
     except Exception as e:
         raise HAConsistencyException('failed to process ha states') from e
     return new_ha_states
Ejemplo n.º 11
0
def process_event():
    """Build a ``ProcessEvent`` whose payload has every field zeroed."""
    zeroed = ConfHaProcess(chp_event=0,
                           chp_type=0,
                           chp_pid=0,
                           fid=Fid(0, 0))
    return ProcessEvent(zeroed)
Ejemplo n.º 12
0
 def update_process_failure(self, planner: WorkPlanner,
                            ha_states: List[HAState]) -> List[HAState]:
     """Re-confirm reported process states and record them via Consul.

     Non-process states pass through unchanged. For each process state the
     current status is obtained from
     ``self.consul.get_process_current_status``; then:

     * OK while the local status is already ``M0_CONF_HA_PROCESS_STARTED``,
       or FAILED/STOPPED while it is already ``M0_CONF_HA_PROCESS_STOPPED``:
       the state is dropped as a duplicate.
     * UNKNOWN: a ``BroadcastHAStates`` command carrying a FAILED state for
       this fid is added to ``planner`` so the status is checked again.
     * anything other than UNKNOWN/OFFLINE: the matching STARTED/STOPPED
       event is written through ``self.consul.update_process_status``.

     Every process state that is not dropped is returned with its status
     replaced by the re-confirmed one.

     :param planner: receives the re-check command for UNKNOWN processes.
     :param ha_states: states to examine.
     :return: the filtered list of states.
     """
     new_ha_states: List[HAState] = []
     for state in ha_states:
         # We are only concerned with process statuses.
         if state.fid.container != ObjT.PROCESS.value:
             new_ha_states.append(state)
             continue

         current_status = self.consul.get_process_current_status(
             state.status, state.fid)
         if current_status == ServiceHealth.OK:
             if (self.consul.get_process_local_status(
                     state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                 continue
         if current_status in (ServiceHealth.FAILED,
                               ServiceHealth.STOPPED):
             if (self.consul.get_process_local_status(
                     state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                 # Consul may report failure of a process multiple
                 # times, so we don't want to send duplicate failure
                 # notifications, it may cause delay in cleanup
                 # activities.
                 continue
         if current_status == ServiceHealth.UNKNOWN:
             # We got the service status as UNKNOWN: hax was notified
             # about a process failure but could not confirm whether the
             # process is in failed state or has failed and restarted.
             # So as not to lose the event, enqueue a broadcast event
             # specific to this process to confirm its real-time status
             # again. The status is expected to be eventually confirmed
             # as either failed or passing (OK).
             # This typically arises from a delayed failure notification
             # during which the process might be restarting or have
             # already restarted, so the real-time status must be
             # confirmed before broadcasting failure.
             # (current_status is already UNKNOWN here; no reassignment
             # is needed.)
             planner.add_command(
                 BroadcastHAStates(
                     states=[
                         HAState(fid=state.fid,
                                 status=ServiceHealth.FAILED)
                     ],
                     reply_to=None))
         if current_status not in (ServiceHealth.UNKNOWN,
                                   ServiceHealth.OFFLINE):
             # We also need to account and report the failure of remote
             # Motr processes to this node's hax and motr processes.
             # When Consul reports a remote process failure, hax
             # confirms its current status from Consul KV and updates
             # the list of failed services and also adds it to the
             # broadcast list.
             if current_status != ServiceHealth.OK:
                 event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
             else:
                 event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
             self.consul.update_process_status(
                 ConfHaProcess(
                     chp_event=event,
                     chp_type=int(
                         m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                     chp_pid=0,
                     fid=state.fid))
         new_ha_states.append(
             HAState(fid=state.fid, status=current_status))
     return new_ha_states