Exemple #1
0
 def __init__(self, q: Queue, motr: Motr):
     """
     Set up the consumer without starting it.

     `q` and `motr` are not kept on the instance: they are handed to the
     parent constructor as the positional arguments for `_do_work`, which
     is passed as `target`.
     """
     worker_args = (q, motr)
     super().__init__(target=self._do_work,
                      args=worker_args,
                      name='qconsumer')
     # Initial value of the stop flag.
     self.is_stopped = False
     # Both helpers are created here rather than injected by the caller.
     self.consul = ConsulUtil()
     self.eq_publisher = EQPublisher()
Exemple #2
0
 def __init__(self, q: Queue, motr: Motr, herald: DeliveryHerald,
              consul: ConsulUtil):
     """
     Set up the consumer without starting it.

     `q` and `motr` are not kept on the instance: they are handed to the
     parent constructor as the positional arguments for `_do_work`, which
     is passed as `target`. `herald` and `consul` are stored as given.
     """
     worker_args = (q, motr)
     super().__init__(target=self._do_work,
                      args=worker_args,
                      name='qconsumer')
     # Initial value of the stop flag.
     self.is_stopped = False
     self.consul = consul
     # Unlike `consul`, the publisher is created here, not injected.
     self.eq_publisher = EQPublisher()
     self.herald = herald
Exemple #3
0
 def __init__(self, planner: WorkPlanner, motr: Motr,
              herald: DeliveryHerald, consul: ConsulUtil, idx: int):
     """
     Set up the consumer without starting it.

     `planner` and `motr` are not kept on the instance: they are handed to
     the parent constructor as the positional arguments for `_do_work`,
     which is passed as `target`. `idx` is stored and also becomes the
     suffix of the thread name.
     """
     worker_args = (planner, motr)
     thread_name = f'qconsumer-{idx}'
     super().__init__(target=self._do_work,
                      args=worker_args,
                      name=thread_name)
     # Initial value of the stop flag.
     self.is_stopped = False
     self.consul = consul
     # Unlike `consul`, the publisher is created here, not injected.
     self.eq_publisher = EQPublisher()
     self.herald = herald
     self.idx = idx
Exemple #4
0
class ConsumerThread(StoppableThread):
    """
    The only Motr-aware thread in whole HaX. This thread pulls messages from
    the multithreaded Queue and considers the messages as commands. Every such
    command describes what should be sent to Motr land.

    The thread exits gracefully once stop() has been called and the queue is
    found empty; items that are still waiting in the queue are processed
    first.
    """
    def __init__(self, q: Queue, motr: Motr, herald: DeliveryHerald,
                 consul: ConsulUtil):
        """
        Set up the consumer without starting it.

        `q` and `motr` are not kept on the instance: they are handed to the
        parent constructor as the positional arguments for `_do_work`, which
        is passed as `target`. `herald` and `consul` are stored as given.
        """
        super().__init__(target=self._do_work,
                         name='qconsumer',
                         args=(q, motr))
        self.is_stopped = False
        self.consul = consul
        self.eq_publisher = EQPublisher()
        self.herald = herald

    def stop(self) -> None:
        """
        Request the worker loop to exit.

        Only sets the flag; `_do_work` looks at it when it finds the queue
        empty.
        """
        self.is_stopped = True

    @repeat_if_fails(wait_seconds=1)
    def _update_process_status(self, q: Queue, event: ConfHaProcess) -> None:
        """
        Store a process event in Consul and, for m0d processes, enqueue a
        broadcast of the corresponding service health.

        The broadcast is not sent directly: a BroadcastHAStates command
        (with no reply queue) is put back on `q`.
        """
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        self.consul.update_process_status(event)
        svc_status = m0HaProcessEvent.event_to_svchealth(event.chp_event)
        if event.chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D:
            # Broadcast the received motr process status to other motr
            # processes in the cluster.
            q.put(
                BroadcastHAStates(
                    states=[HAState(fid=event.fid, status=svc_status)],
                    reply_to=None))

    @repeat_if_fails(wait_seconds=1)
    def update_process_failure(self, q: Queue,
                               ha_states: List[HAState]) -> List[HAState]:
        """
        Re-check process states against Consul before they are broadcast.

        States whose fid is not a process are passed through unchanged.
        Every process state is replaced by a new HAState carrying the status
        reported by `consul.get_process_current_status`:

        * FAILED  - the status is additionally written via
                    `consul.service_health_to_m0dstatus_update`;
        * UNKNOWN - the state is reported as OK for now and a FAILED
                    broadcast for the same fid is re-enqueued on `q`, so the
                    check is repeated later.

        Returns the new list; `ha_states` itself is not modified.
        """
        new_ha_states: List[HAState] = []
        for state in ha_states:
            # We are only concerned with process statuses.
            if state.fid.container == ObjT.PROCESS.value:
                current_status = self.consul.get_process_current_status(
                    state.status, state.fid)
                if current_status == ServiceHealth.FAILED:
                    self.consul.service_health_to_m0dstatus_update(
                        state.fid, current_status)
                elif current_status == ServiceHealth.UNKNOWN:
                    # We got service status as UNKNOWN, that means hax was
                    # notified about process failure but hax couldn't
                    # confirm if the process is in failed state or have
                    # failed and restarted. So, we will not lose the
                    # event and try again to confirm the real time
                    # process status by enqueuing a broadcast event
                    # specific to this process.
                    # It is expected that the process status gets
                    # eventually confirmed as either failed or passing (OK).
                    # This situation typically arises due to delay
                    # in receiving failure notification during which the
                    # corresponding process might be restarting or have
                    # already restarted. Thus it is important to confirm
                    # the real time status of the process before
                    # broadcasting failure.
                    current_status = ServiceHealth.OK
                    q.put(
                        BroadcastHAStates(states=[
                            HAState(fid=state.fid, status=ServiceHealth.FAILED)
                        ],
                                          reply_to=None))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
            else:
                new_ha_states.append(state)
        return new_ha_states

    def _do_work(self, q: Queue, motr: Motr):
        """
        Thread body: take commands from `q` one by one and execute them.

        Each command is dispatched by its type. Any exception raised while
        handling a command is logged and swallowed so that the loop keeps
        running. The loop ends when stop() has been requested and the queue
        is empty.
        """
        ffi = motr._ffi
        LOG.info('Handler thread has started')
        ffi.adopt_motr_thread()

        def next_item():
            # Block inside Queue.get() rather than sleeping between
            # non-blocking polls: a command is picked up as soon as it is
            # put, while the timeout still bounds how long a stop() request
            # can go unnoticed on an idle queue.
            while True:
                try:
                    item = q.get(timeout=0.2)
                except Empty:
                    item = None
                # A None item is treated the same way as an empty queue.
                if item is not None:
                    return item
                if self.is_stopped:
                    raise StopIteration()

        try:
            while True:
                try:
                    LOG.debug('Waiting for the next message')

                    item = next_item()

                    LOG.debug('Got %s message from queue', item)
                    if isinstance(item, FirstEntrypointRequest):
                        LOG.debug('first entrypoint request, broadcast FAILED')
                        ids: List[MessageId] = motr.broadcast_ha_states([
                            HAState(fid=item.process_fid,
                                    status=ServiceHealth.FAILED)
                        ])
                        LOG.debug('waiting for broadcast of %s for ep: %s',
                                  ids, item.remote_rpc_endpoint)
                        # The reply is sent only after the herald reports
                        # all the broadcast messages.
                        self.herald.wait_for_all(HaLinkMessagePromise(ids))
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(q, item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        ha_states = self.update_process_failure(q, item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            ha_states)
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)

                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    # Stop request: let it reach the outer handler.
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
        except StopIteration:
            ffi.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
Exemple #5
0
class ConsumerThread(StoppableThread):
    """
    The only Motr-aware thread in whole HaX. This thread pulls messages from
    the multithreaded Queue and considers the messages as commands. Every such
    command describes what should be sent to Motr land.

    The thread exits gracefully once stop() has been called and the queue is
    found empty; items that are still waiting in the queue are processed
    first.
    """
    def __init__(self, q: Queue, motr: Motr, herald: DeliveryHerald):
        """
        Set up the consumer without starting it.

        `q` and `motr` are not kept on the instance: they are handed to the
        parent constructor as the positional arguments for `_do_work`, which
        is passed as `target`. `herald` is stored as given; the Consul
        helper and the EQ publisher are created here.
        """
        super().__init__(target=self._do_work,
                         name='qconsumer',
                         args=(q, motr))
        self.is_stopped = False
        self.consul = ConsulUtil()
        self.eq_publisher = EQPublisher()
        self.herald = herald

    def stop(self) -> None:
        """
        Request the worker loop to exit.

        Only sets the flag; `_do_work` looks at it when it finds the queue
        empty.
        """
        self.is_stopped = True

    @repeat_if_fails(wait_seconds=1)
    def _update_process_status(self, event: ConfHaProcess) -> None:
        """Store the given process event in Consul."""
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        self.consul.update_process_status(event)

    def update_process_failure(self, ha_states: List[HAState]) -> None:
        """
        Record every FAILED state from `ha_states` in Consul as a STOPPED
        process event. States with any other status are ignored.
        """
        for state in ha_states:
            if state.status == ServiceHealth.FAILED:
                m0status = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                # NOTE(review): 3 is a raw process-type code, presumably one
                # of the m0HaProcessType values - confirm which one.
                pevent = ConfHaProcess(chp_event=m0status,
                                       chp_type=3,
                                       chp_pid=0,
                                       fid=state.fid)
                self._update_process_status(pevent)

    def _do_work(self, q: Queue, motr: Motr):
        """
        Thread body: take commands from `q` one by one and execute them.

        Each command is dispatched by its type. Any exception raised while
        handling a command is logged and swallowed so that the loop keeps
        running. The loop ends when stop() has been requested and the queue
        is empty.
        """
        ffi = motr._ffi
        LOG.info('Handler thread has started')
        ffi.adopt_motr_thread()

        def next_item():
            # Block inside Queue.get() rather than sleeping between
            # non-blocking polls: a command is picked up as soon as it is
            # put, while the timeout still bounds how long a stop() request
            # can go unnoticed on an idle queue.
            while True:
                try:
                    item = q.get(timeout=0.2)
                except Empty:
                    item = None
                # A None item is treated the same way as an empty queue.
                if item is not None:
                    return item
                if self.is_stopped:
                    raise StopIteration()

        try:
            while True:
                try:
                    LOG.debug('Waiting for the next message')

                    item = next_item()

                    LOG.debug('Got %s message from queue', item)
                    if isinstance(item, FirstEntrypointRequest):
                        LOG.debug('first entrypoint request, broadcast FAILED')
                        ids: List[MessageId] = motr.broadcast_ha_states([
                            HAState(fid=item.process_fid,
                                    status=ServiceHealth.FAILED)
                        ])
                        LOG.debug('waiting for broadcast of %s for ep: %s',
                                  ids, item.remote_rpc_endpoint)
                        # The reply is sent only after the herald reports
                        # all the broadcast messages.
                        self.herald.wait_for_all(HaLinkMessagePromise(ids))
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        # Motr is notified first; Consul is updated with the
                        # FAILED states afterwards.
                        result: List[MessageId] = motr.broadcast_ha_states(
                            item.states)
                        self.update_process_failure(item.states)
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)

                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    # Stop request: let it reach the outer handler.
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
        except StopIteration:
            ffi.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
Exemple #6
0
class ConsumerThread(StoppableThread):
    """
    The only Motr-aware thread in whole HaX. This thread pulls messages from
    the multithreaded Queue and considers the messages as commands. Every such
    command describes what should be sent to Motr land.

    The thread exits gracefully when it receives message of type Die (i.e.
    it is a 'poison pill').
    """

    def __init__(self, planner: WorkPlanner, motr: Motr,
                 herald: DeliveryHerald, consul: ConsulUtil, idx: int):
        """
        Set up the consumer without starting it.

        `planner` and `motr` are not kept on the instance: they are handed
        to the parent constructor as the positional arguments for
        `_do_work`, which is passed as `target`. `idx` is stored and also
        becomes the suffix of the thread name.
        """
        worker_args = (planner, motr)
        thread_name = f'qconsumer-{idx}'
        super().__init__(target=self._do_work,
                         args=worker_args,
                         name=thread_name)
        # Initial value of the stop flag.
        self.is_stopped = False
        self.consul = consul
        # Unlike `consul`, the publisher is created here, not injected.
        self.eq_publisher = EQPublisher()
        self.herald = herald
        self.idx = idx

    def stop(self) -> None:
        """Mark the thread as stopped by setting `is_stopped` to True."""
        self.is_stopped = True

    @repeat_if_fails(wait_seconds=1)
    def _update_process_status(self, p: WorkPlanner, motr: Motr,
                               event: ConfHaProcess) -> None:
        """
        Handle a process event in three steps:

        1. for STARTED/STOPPED events, broadcast the matching health status
           through `motr` (to hax only for mkfs processes and for hax
           itself);
        2. store the event in Consul;
        3. for STARTED events, try to publish a node 'online' event through
           the producer; failures of this step are logged, never raised.

        `p` is accepted but not used here.
        """
        LOG.info('Updating process status: %s', event.fid)
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        started = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
        stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
        mkfs = m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS

        # (process type, event) -> health. Only the three process types
        # listed here are known to the table; any other type makes the
        # lookup below raise KeyError.
        health_by_kind = {
            (proc_type, proc_event): health
            for proc_type in (mkfs,
                              m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
                              m0HaProcessType.M0_CONF_HA_PROCESS_OTHER)
            for proc_event, health in ((started, ObjHealth.OK),
                                       (stopped, ObjHealth.FAILED))
        }

        if event.chp_event in (started, stopped):
            svc_status = health_by_kind[(event.chp_type, event.chp_event)]
            # Motr-mkfs processes do not require updates on their peer
            # mkfs processes. Motr-mkfs is an independent and typically a
            # one-time operation. So avoid broadcasting a motr-mkfs state
            # to the peer motr-mkfs processes but hax still needs to be
            # notified in-order to disconnect the hax-motr halink when
            # motr-mkfs process stops. Events about hax itself are treated
            # the same way.
            hax_only = bool(event.chp_type == mkfs
                            or event.fid == self.consul.get_hax_fid())

            LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                      hax_only)
            motr.broadcast_ha_states(
                [HAState(fid=event.fid, status=svc_status)],
                broadcast_hax_only=hax_only)
        self.consul.update_process_status(event)

        # On M0_CONF_HA_PROCESS_STARTED the producer is asked to check the
        # node and send a node online event to MessageBus.
        # NOTE(review): a fresh ConsulUtil is created here instead of reusing
        # self.consul - confirm this is intended.
        if event.chp_event == started:
            try:
                producer = get_producer(ConsulUtil())
                if not producer:
                    LOG.warning('Could not sent an event as producer'
                                ' is not available')
                else:
                    producer.check_and_send(parent_resource_type=ObjT.NODE,
                                            fid=event.fid,
                                            resource_status='online')
            except Exception as e:
                LOG.warning("Send event failed due to '%s'", e)

    @repeat_if_fails(wait_seconds=1)
    def update_process_failure(self, planner: WorkPlanner,
                               ha_states: List[HAState]) -> List[HAState]:
        """
        Filter the HA states that are about to be broadcast, re-checking
        process states against Consul.

        States whose fid is not a process are passed through unchanged.
        A process state is dropped when:

        * its current status (as reported by Consul) is UNKNOWN;
        * its process type in Consul is 'Unknown' or M0MKFS;
        * its current status has no entry in the health-to-event table.

        Otherwise the state is re-emitted with the current status when the
        status stored in Consul differs from it, and/or (for non-local
        processes) when the locally kept copy of the status differs. Along
        the way the stored statuses are brought up to date.

        `planner` is accepted but not used here.

        Returns the new list of states.
        Raises HAConsistencyException if anything fails while processing.
        """
        new_ha_states: List[HAState] = []
        # Health values that can be translated into a process event; any
        # other health yields None from .get() below.
        proc_Health_to_status = {
            ObjHealth.OFFLINE: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
            ObjHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
            ObjHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
        }
        try:
            for state in ha_states:
                if state.fid.container == ObjT.PROCESS.value:
                    current_status = self.consul.get_process_current_status(
                        state.status, state.fid)
                    if current_status == ObjHealth.UNKNOWN:
                        continue
                    proc_status_remote = self.consul.get_process_status(
                                             state.fid)
                    proc_status: Any = None
                    # MKFS states are updated by the node corresponding to a
                    # given process. So we ignore notifications for mkfs
                    # processes.
                    if proc_status_remote.proc_type in (
                            'Unknown',
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name):
                        continue
                    proc_type = m0HaProcessType.str_to_Enum(
                         proc_status_remote.proc_type)
                    # Following cases are handled here,
                    # 1. Delayed consul service failure notification:
                    # -  We re-confirm the current process state before
                    #     notifying the process as offline/failed.
                    # 2. Consul reported process failure, current process
                    #    state is offline (this means the corresponding node
                    #    is online, i.e. hax and consul are online):
                    # -  So process's status in consul kv might not be updated
                    #    as the process died abruptly. In this case we handle
                    #    it as local process failure, update the process
                    #    status in consul kv and notify motr.
                    # 3. Consul reported process failure, current process
                    #    state is failed (this means the node corresponding to
                    #    the process also failed, i.e. hax and consul are no
                    #    more):
                    # -  Process's status in consul kv might not be updated as
                    #    the node went down abruptly. In this case, when
                    #    consul reports failure for corresponding node
                    #    processes, Hare verifies the node status and
                    #    accordingly Hare RC node processes the failures.
                    #    This may take some time if Consul server loses
                    #    the quorum and takes time to sync up the state.
                    # 4. Consul reported process failure, probably due to mkfs
                    #    process completion (m0tr mkfs and m0ds share the same
                    #    fid), which got delayed and the process is starting
                    #    now:
                    # -  Hare checks the current status of the process but it
                    #    is possible that the process state is not synced up
                    #    yet within the quorum. In this case, we continue
                    #    processing the failure event but once the process
                    #    starts successfully Hare will update and notify the
                    #    process state eventually.
                    # 5. For some reason Consul may report a process as
                    #    offline and subsequently report it as online, this
                    #    may happen due to intermittent monitor failure:
                    # -  Hare must handle the change in process states
                    #    accordingly in-order to maintain the eventual
                    #    consistency of the cluster state.
                    proc_status = proc_Health_to_status.get(current_status)
                    LOG.debug('current_status: %s proc_status_remote: %s',
                              current_status, proc_status_remote.proc_status)
                    if proc_status is not None:
                        LOG.debug('proc_status: %s', proc_status.name)
                        if proc_status_remote.proc_status != proc_status.name:
                            if (self.consul.am_i_rc() or
                                    self.consul.is_proc_local(state.fid)):
                                # Probably process node failed, in such a
                                # case, only RC must be allowed to update
                                # the process's persistent state.
                                # Or, if the node's alive then allow the node
                                # to update the local process's state.
                                self.consul.update_process_status(
                                    ConfHaProcess(chp_event=proc_status,
                                                  chp_type=proc_type,
                                                  chp_pid=0,
                                                  fid=state.fid))
                            # RC or not RC, i.e. even without persistent state
                            # update, it is important that the notification to
                            # local motr processes must still be sent.
                            new_ha_states.append(
                                HAState(fid=state.fid, status=current_status))
                        if not self.consul.is_proc_local(state.fid):
                            proc_status_local = (
                                self.consul.get_process_status_local(
                                    state.fid))
                            # Consul monitors a process every 1 second and
                            # this notification is sent to every node. Thus
                            # to avoid notifying about a process multiple
                            # times about the same status every node
                            # maintains a local copy of the remote process
                            # status, which is checked everytime a consul
                            # notification is received and accordingly
                            # the status is notified locally to all the local
                            # motr processes.
                            if (proc_status_local.proc_status !=
                                    proc_status.name):
                                self.consul.update_process_status_local(
                                    ConfHaProcess(chp_event=proc_status,
                                                  chp_type=proc_type,
                                                  chp_pid=0,
                                                  fid=state.fid))
                                # NOTE(review): if the remote status differed
                                # as well, the same state has already been
                                # appended above, so it ends up in the result
                                # twice - confirm this is intended.
                                new_ha_states.append(
                                    HAState(fid=state.fid,
                                            status=current_status))
                        else:
                            continue
                else:
                    new_ha_states.append(state)
        except Exception as e:
            raise HAConsistencyException('failed to process ha states') from e
        return new_ha_states

    def _do_work(self, planner: WorkPlanner, motr: Motr):
        """
        Main loop of the thread: take the next command from the planner,
        dispatch it according to its type and report it as finished.

        Any Exception raised while a command is being handled is logged and
        swallowed, so the loop proceeds to the next command. The loop is left
        only when a command of type Die is received.

        :param planner: source of the commands; notify_finished() is invoked
            on it for every command that was taken from it.
        :param motr: the object used to communicate with Motr.
        """
        LOG.info('Handler thread has started')

        try:
            while True:
                # Reset on every iteration: if get_next_command() raises,
                # the 'finally' clause below must neither stumble upon an
                # unbound name (first iteration) nor report the previous,
                # already finished, command once again.
                # NOTE(review): assumes the planner never returns None as
                # a command - confirm against WorkPlanner.
                item = None
                try:
                    LOG.debug('Waiting for the next message')

                    item = planner.get_next_command()

                    LOG.debug('Got %s message from planner', item)
                    if isinstance(item, FirstEntrypointRequest):
                        # Repack the message into a plain EntrypointRequest
                        # since this is what the reply function is given
                        # in the EntrypointRequest branch as well.
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(planner, motr, item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, HaNvecSetEvent):
                        fn = motr.ha_nvec_set_process
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        ha_states = self.update_process_failure(
                            planner, item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            ha_states)
                        # The sender may or may not wait for the result.
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)
                    elif isinstance(item, Die):
                        # Poison pill: leave the loop through the outer
                        # 'except StopIteration' handler.
                        raise StopIteration()
                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
                finally:
                    # Runs for every obtained command, the Die one included.
                    if item is not None:
                        planner.notify_finished(item)
        except StopIteration:
            LOG.info('Consumer Stopped')
            # motr.stop() is issued by the thread with index 0 only.
            if self.idx == 0:
                motr.stop()
        finally:
            LOG.info('Handler thread has exited')
Exemple #7
0
class ConsumerThread(StoppableThread):
    """
    Motr-aware worker thread of HaX. This thread pulls commands from the
    WorkPlanner one by one and considers them as instructions. Every such
    a command describes what should be sent to Motr land.

    The thread exits gracefully when it receives message of type Die (i.e.
    it is a 'poison pill').
    """
    def __init__(self, planner: WorkPlanner, motr: Motr,
                 herald: DeliveryHerald, consul: ConsulUtil, idx: int):
        """
        :param planner: source of the commands to be processed.
        :param motr: the object used to communicate with Motr.
        :param herald: stored as self.herald.
        :param consul: the object used to read and update process statuses.
        :param idx: index of this thread; it becomes a part of the thread
            name and the thread with index 0 is the one that stops motr
            when the Die command is received.
        """
        super().__init__(target=self._do_work,
                         name=f'qconsumer-{idx}',
                         args=(planner, motr))
        self.is_stopped = False
        self.consul = consul
        self.eq_publisher = EQPublisher()
        self.herald = herald
        self.idx = idx

    def stop(self) -> None:
        """Raise the is_stopped flag of this thread."""
        self.is_stopped = True

    @repeat_if_fails(wait_seconds=1)
    def _update_process_status(self, p: WorkPlanner, motr: Motr,
                               event: ConfHaProcess) -> None:
        """
        Store the process event by means of Consul and, for STARTED and
        STOPPED events, broadcast the matching service health to Motr.

        :param p: not used by this method.
        :param motr: the object used to broadcast the HA state.
        :param event: the process event to be handled.
        """
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        motr_to_svc_status = {
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
             m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED):
            ServiceHealth.OK,
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
             m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
            ServiceHealth.OFFLINE,
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
             m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED):
            ServiceHealth.OK,
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
             m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
            ServiceHealth.FAILED,
            (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
             m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED):
            ServiceHealth.OK,
            (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
             m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
            ServiceHealth.FAILED
        }
        self.consul.update_process_status(event)
        if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                               m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
            # NOTE(review): a process type that is absent from the table
            # above ends up with KeyError here - confirm that no such type
            # can arrive with a STARTED/STOPPED event.
            svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
            motr.broadcast_ha_states(
                [HAState(fid=event.fid, status=svc_status)])

    @repeat_if_fails(wait_seconds=1)
    def update_process_failure(self, planner: WorkPlanner,
                               ha_states: List[HAState]) -> List[HAState]:
        """
        Re-evaluate the given HA states against the statuses known to
        Consul and return the list of the states that are worth
        broadcasting.

        The states of non-process objects are passed through as is. A
        process state is dropped if the locally stored process status
        already corresponds to the current one; otherwise the state is
        returned with the current status taken from Consul.

        :param planner: not used by this method.
        :param ha_states: the HA states to be checked.
        :return: new list with the resulting HA states.
        """
        new_ha_states: List[HAState] = []
        for state in ha_states:
            # We are only concerned with process statuses.
            if state.fid.container == ObjT.PROCESS.value:
                current_status = self.consul.get_process_current_status(
                    state.status, state.fid)
                if current_status == ServiceHealth.OK:
                    if (self.consul.get_process_local_status(
                            state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                        # The start of the process is recorded already.
                        continue
                if current_status in (ServiceHealth.FAILED,
                                      ServiceHealth.STOPPED):
                    if (self.consul.get_process_local_status(
                            state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                        # Consul may report failure of a process multiple
                        # times, so we don't want to send duplicate failure
                        # notifications, it may cause delay in cleanup
                        # activities.
                        continue
                # XXX:
                # Sometimes a Consul event can be delayed, so that the
                # state reported by Consul for a given process is in its
                # past already, e.g. Consul reported a process failure but
                # by the time hax received the event the process might
                # have already restarted. In this case the event still
                # needs to be handled. Also, it is possible that Consul
                # reported a failure but the process status is not yet
                # updated in Consul services catalog; in such a case the
                # reported status can be true and cannot be just dropped.
                # These scenarios must be re-visited.
                if current_status not in (ServiceHealth.UNKNOWN,
                                          ServiceHealth.OFFLINE):
                    # We also need to account and report the failure of remote
                    # Motr processes to this node's hax and motr processes.
                    # When Consul reports a remote process failure, hax
                    # confirms its current status from Consul KV and updates
                    # the list of failed services and also adds it to the
                    # broadcast list.
                    if current_status != ServiceHealth.OK:
                        event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                    else:
                        event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                    self.consul.update_process_status(
                        ConfHaProcess(
                            chp_event=event,
                            chp_type=int(
                                m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                            chp_pid=0,
                            fid=state.fid))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
            else:
                new_ha_states.append(state)
        return new_ha_states

    def _do_work(self, planner: WorkPlanner, motr: Motr):
        """
        Main loop of the thread: take the next command from the planner,
        dispatch it according to its type and report it as finished.

        Any Exception raised while a command is being handled is logged and
        swallowed, so the loop proceeds to the next command. The loop is left
        only when a command of type Die is received.

        :param planner: source of the commands; notify_finished() is invoked
            on it for every command that was taken from it.
        :param motr: the object used to communicate with Motr.
        """
        LOG.info('Handler thread has started')
        motr.adopt_motr_thread()

        try:
            while True:
                # Reset on every iteration: if get_next_command() raises,
                # the 'finally' clause below must neither stumble upon an
                # unbound name (first iteration) nor report the previous,
                # already finished, command once again.
                # NOTE(review): assumes the planner never returns None as
                # a command - confirm against WorkPlanner.
                item = None
                try:
                    LOG.debug('Waiting for the next message')

                    item = planner.get_next_command()

                    LOG.debug('Got %s message from planner', item)
                    if isinstance(item, FirstEntrypointRequest):
                        # Repack the message into a plain EntrypointRequest
                        # since this is what the reply function is given
                        # in the EntrypointRequest branch as well.
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(planner, motr, item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        ha_states = self.update_process_failure(
                            planner, item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            ha_states)
                        # The sender may or may not wait for the result.
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)
                    elif isinstance(item, Die):
                        # Poison pill: leave the loop through the outer
                        # 'except StopIteration' handler.
                        raise StopIteration()
                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
                finally:
                    # Runs for every obtained command, the Die one included.
                    if item is not None:
                        planner.notify_finished(item)
        except StopIteration:
            LOG.info('Consumer Stopped')
            # motr.stop() is issued by the thread with index 0 only.
            if self.idx == 0:
                motr.stop()
            motr.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
Exemple #8
0
class ConsumerThread(StoppableThread):
    """
    The only Motr-aware thread in whole HaX. This thread pulls messages from
    the multithreaded Queue and considers the messages as commands. Every such
    a command describes what should be sent to Motr land.

    The thread exits gracefully once stop() has been invoked and the queue
    is found empty.
    """
    def __init__(self, q: Queue, motr: Motr):
        """
        :param q: the queue that the messages are taken from.
        :param motr: the object used to communicate with Motr.
        """
        super().__init__(target=self._do_work,
                         name='qconsumer',
                         args=(q, motr))
        self.is_stopped = False
        self.consul = ConsulUtil()
        self.eq_publisher = EQPublisher()

    def stop(self) -> None:
        """Ask the thread to exit as soon as the queue gets empty."""
        self.is_stopped = True

    def _pull_msg(self, q: Queue):
        """
        Wait for the next message in the queue and return it.

        The stop flag is examined every time the queue stays empty for 0.2
        seconds; the messages that are already in the queue are thus
        delivered even if stop() has been invoked.

        :raises StopIteration: the queue is empty and the thread has been
            asked to stop.
        """
        while True:
            try:
                # Blocking wait with a timeout: the message is delivered as
                # soon as it arrives, no sleep-and-poll delay is involved.
                return q.get(timeout=0.2)
            except Empty:
                if self.is_stopped:
                    raise StopIteration() from None

    def _do_work(self, q: Queue, motr: Motr):
        """
        Main loop of the thread: take the next message from the queue and
        dispatch it according to its type.

        Any Exception raised while a message is being handled is logged and
        swallowed, so the loop proceeds to the next message.

        :param q: the queue that the messages are taken from.
        :param motr: the object used to communicate with Motr.
        """
        ffi = motr._ffi
        logging.info('Handler thread has started')
        ffi.adopt_motr_thread()

        try:
            while True:
                try:
                    logging.debug('Waiting for the next message')

                    item = self._pull_msg(q)

                    logging.debug('Got %s message from queue', item)
                    if isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        fn = self.consul.update_process_status
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        logging.info('HA states: %s', item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            item.states)
                        # The sender may or may not wait for the result.
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        logging.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        logging.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        logging.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        logging.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        logging.info('SNS repair status is received: %s',
                                     status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        logging.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        logging.info('SNS rebalance status is received: %s',
                                     status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        logging.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        logging.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        logging.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        logging.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        logging.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        logging.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        logging.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        logging.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)
                    else:
                        logging.warning('Unsupported event type received: %s',
                                        item)
                except StopIteration:
                    raise
                except Exception:
                    # no op, swallow the exception
                    logging.exception('**ERROR**')
        except StopIteration:
            # Graceful exit: undo adopt_motr_thread() made at the start.
            ffi.shun_motr_thread()
        finally:
            logging.info('Handler thread has exited')