Beispiel #1
0
 def notify_hax_stop(self):
     """Send the hax stop notification and wait on the resulting messages.

     The fid and the endpoint of hax are read through ``consul_util`` and
     passed to ``hax_stop`` of the FFI layer. The ids it returns are
     wrapped into a ``HaLinkMessagePromise`` and handed over to
     ``herald.wait_for_all``.
     """
     LOG.debug('Notifying hax stop')
     consul = self.consul_util
     fid = consul.get_hax_fid()
     endpoint = consul.get_hax_endpoint()
     message_ids = self._ffi.hax_stop(self._ha_ctx, fid.to_c(),
                                      make_c_str(endpoint))
     promise = HaLinkMessagePromise(message_ids)
     self.herald.wait_for_all(promise)
Beispiel #2
0
    def _ha_broadcast(self, notes: List[HaNoteStruct],
                      broadcast_hax_only: bool) -> List[MessageId]:
        """Broadcast HA notes in chunks and return the ids of all messages.

        The notes are sent in slices of at most MAX_MOTR_NVEC_UPDATE_SZ
        elements, one FFI call per slice.

        Args:
            notes: notes to broadcast. Nothing is sent for an empty list.
            broadcast_hax_only: when true, every slice is sent with
                ``ha_broadcast_hax_only`` together with the hax endpoint
                read from ``consul_util``; otherwise ``ha_broadcast`` is
                used.

        Returns:
            The message ids returned by the FFI layer for every slice, in
            sending order (an empty list when there were no notes).
        """
        message_ids: List[MessageId] = []
        nr_notes_to_be_sent = len(notes)
        notes_sent = 0
        LOG.debug('Broadcasting %d notes', nr_notes_to_be_sent)
        for offset in range(0, nr_notes_to_be_sent, MAX_MOTR_NVEC_UPDATE_SZ):
            chunk = notes[offset:offset + MAX_MOTR_NVEC_UPDATE_SZ]
            chunk_len = len(chunk)
            notes_sent += chunk_len
            if broadcast_hax_only:
                hax_endpoint = self.consul_util.get_hax_endpoint()
                chunk_ids = self._ffi.ha_broadcast_hax_only(
                    self._ha_ctx, make_array(HaNoteStruct, chunk),
                    chunk_len, make_c_str(hax_endpoint))
            else:
                chunk_ids = self._ffi.ha_broadcast(
                    self._ha_ctx, make_array(HaNoteStruct, chunk),
                    chunk_len)
            LOG.debug('Broadcast HA state complete, message_ids = %s',
                      chunk_ids)
            # Collect the ids of every slice. Plain assignment here would
            # keep only the ids of the last slice, so the ids of the
            # earlier slices would never reach the caller.
            # NOTE(review): `or []` assumes the FFI call may return a falsy
            # value when it has nothing to report - confirm.
            message_ids.extend(chunk_ids or [])
        # Sanity check: the slices must cover the whole input.
        assert notes_sent == nr_notes_to_be_sent

        return message_ids
Beispiel #3
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Answer an entrypoint request using data read from Consul.

        The leader session, the node of that session, the confd list and
        the RM fid are fetched through ``consul_util``. If any of these
        lookups raises, a reply with the EAGAIN code and no confds is sent
        and the method returns. Otherwise the reply lists every confd, the
        quorum value and the address of the first confd whose node equals
        the node of the leader session.

        Raises:
            RuntimeError: no confd with a non-empty address was found on
                the node of the leader session. No reply is sent in this
                case.
        """
        ctx = message.reply_context
        request_id = message.req_id
        peer_endpoint = message.remote_rpc_endpoint
        peer_fid = message.process_fid

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(peer_endpoint,
                                                 str(peer_fid)))
        try:
            consul = self.consul_util
            leader_session = consul.get_leader_session_no_wait()
            rm_node = consul.get_session_node(leader_session)
            confds = consul.get_confd_list()
            rm_fid = consul.get_rm_fid()
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code.')
            # Empty reply: no confds, zero quorum, zero RM fid, no RM
            # endpoint.
            self._ffi.entrypoint_reply(
                ctx, request_id.to_c(), EAGAIN, 0,
                make_array(FidStruct, []),
                make_array(c.c_char_p, []),
                0, Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        quorum = int(len(confds) / 2 + 1)

        # Address of the first confd located on the leader session node.
        rm_endpoint = next(
            (svc.address for svc in confds if svc.node == rm_node), None)
        if not rm_endpoint:
            raise RuntimeError('No RM node found in Consul')

        c_fids = [confd.fid.to_c() for confd in confds]
        c_endpoints = [make_c_str(confd.address) for confd in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        self._ffi.entrypoint_reply(
            ctx, request_id.to_c(), 0, len(confds),
            make_array(FidStruct, c_fids),
            make_array(c.c_char_p, c_endpoints),
            quorum, rm_fid.to_c(), make_c_str(rm_endpoint))
        LOG.debug('Entrypoint request has been replied to')
Beispiel #4
0
 def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
           rm_service: Fid):
     """Start m0_halon_interface through the FFI layer.

     Stores ``process`` in ``self._process_fid`` and calls ``start`` of
     the FFI layer with the given endpoint and the C representations of
     the process, HA service and RM service fids.

     Raises:
         RuntimeError: the FFI ``start`` call returned a truthy
             (non-zero) result.
     """
     LOG.debug('Starting m0_halon_interface')
     self._process_fid = process
     result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                              process.to_c(), ha_service.to_c(),
                              rm_service.to_c())
     if result:
         LOG.error(
             'Cannot start Motr API. m0_halon_interface::start'
             ' returned non-zero code (%s)', result)
         # Adjacent literals are concatenated as-is, so the second one
         # carries the space that separates the two sentences.
         raise RuntimeError('Cannot start m0_halon_interface.'
                            ' Please check Motr logs for more details.')
Beispiel #5
0
    def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
        """Queue a process event; on STOPPED also report the stopped link.

        A ``ProcessEvent`` wrapping a ``ConfHaProcess`` built from the
        arguments is always added to the planner. When the event equals
        ``M0_CONF_HA_PROCESS_STOPPED`` and ``consul_util`` resolves the fid
        to a non-empty endpoint, ``hax_link_stopped`` of the FFI layer is
        called with that endpoint.
        """
        LOG.info('fid=%s, chp_event=%s', fid, chp_event)
        ha_process = ConfHaProcess(chp_event=chp_event,
                                   chp_type=chp_type,
                                   chp_pid=chp_pid,
                                   fid=fid)
        self.planner.add_command(ProcessEvent(ha_process))

        if chp_event != m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED:
            return

        endpoint = self.consul_util.fid_to_endpoint(fid)
        if endpoint:
            self._ffi.hax_link_stopped(self._ha_ctx, make_c_str(endpoint))
Beispiel #6
0
    def __init__(self,
                 ffi: HaxFFI,
                 queue,
                 rm_fid: Fid,
                 herald: DeliveryHerald,
                 node_uuid: str = ''):
        """Initialise the Motr API and store the given collaborators.

        Args:
            ffi: FFI wrapper to use; a new ``HaxFFI`` is created when a
                falsy value is given.
            queue: stored as ``self.queue``.
            rm_fid: stored as ``self.rm_fid``.
            herald: stored as ``self.herald``.
            node_uuid: passed to ``init_motr_api`` as a C string.

        Raises:
            RuntimeError: ``init_motr_api`` returned a falsy context.
        """
        self._ffi = ffi if ffi else HaxFFI()
        # [KN] Note that node_uuid is currently ignored by the corresponding
        # hax.c function
        ha_ctx = self._ffi.init_motr_api(self, make_c_str(node_uuid))
        self._ha_ctx = ha_ctx
        self.queue, self.rm_fid, self.herald = queue, rm_fid, herald

        if ha_ctx:
            return
        LOG.error('Cannot initialize Motr API. m0_halon_interface_init'
                  ' returned 0')
        raise RuntimeError('Cannot initialize Motr API')
Beispiel #7
0
    def __init__(self,
                 ffi: HaxFFI,
                 planner: WorkPlanner,
                 herald: DeliveryHerald,
                 consul_util: ConsulUtil,
                 node_uuid: str = ''):
        """Initialise the Motr API and store the given collaborators.

        Besides the arguments, two flags are initialised to ``False``:
        ``spiel_ready`` and ``is_stopping``.

        Args:
            ffi: FFI wrapper to use; a new ``HaxFFI`` is created when a
                falsy value is given.
            planner: stored as ``self.planner``.
            herald: stored as ``self.herald``.
            consul_util: stored as ``self.consul_util``.
            node_uuid: passed to ``init_motr_api`` as a C string.

        Raises:
            RuntimeError: ``init_motr_api`` returned a falsy context.
        """
        self._ffi = ffi if ffi else HaxFFI()
        # [KN] Note that node_uuid is currently ignored by the corresponding
        # hax.c function
        ha_ctx = self._ffi.init_motr_api(self, make_c_str(node_uuid))
        self._ha_ctx = ha_ctx
        self.planner, self.herald = planner, herald
        self.consul_util = consul_util
        self.spiel_ready = self.is_stopping = False

        if ha_ctx:
            return
        LOG.error('Cannot initialize Motr API. m0_halon_interface_init'
                  ' returned 0')
        raise RuntimeError('Cannot initialize Motr API')
Beispiel #8
0
    def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
              profile: Profile):
        """Start m0_halon_interface through the FFI layer.

        Stores ``process`` and ``profile`` on the instance, obtains the RM
        fid from ``consul_util`` and calls ``start`` of the FFI layer with
        the endpoint and the C representations of the process, HA service
        and RM fids.

        Raises:
            RuntimeError: the FFI ``start`` call returned a truthy
                (non-zero) result.
        """
        LOG.debug('Starting m0_halon_interface')
        self._process_fid = process
        self._profile = profile

        # NOTE(review): presumably repeat_if_fails() retries the lookup
        # when it raises - confirm against the decorator's definition.
        @repeat_if_fails()
        def _get_rm_fid() -> Fid:
            return self.consul_util.get_rm_fid()

        rm_fid = _get_rm_fid()
        result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                                 process.to_c(), ha_service.to_c(),
                                 rm_fid.to_c())
        if result:
            LOG.error(
                'Cannot start Motr API. m0_halon_interface::start'
                ' returned non-zero code (%s)', result)
            # Adjacent literals are concatenated as-is, so the second one
            # carries the space that separates the two sentences.
            raise RuntimeError('Cannot start m0_halon_interface.'
                               ' Please check Motr logs for more details.')
Beispiel #9
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to an entrypoint request with confd and RM details.

        While ``self.is_stopping`` is set, the reply carries no confds, a
        zero quorum and a zero RM fid. Otherwise the leader session, its
        node and the confd list are read through ``consul_util`` and the
        reply reports all confds, the quorum and the address of the first
        confd located on the node of the leader session.

        If preparing the data raises, or no RM endpoint is found among the
        confds, an empty reply is sent after a one second pause. Its
        return code is EAGAIN, or 0 when no RM endpoint was found and
        ``consul_util.m0ds_stopping()`` is true.
        """
        ctx = message.reply_context
        request_id = message.req_id
        peer_endpoint = message.remote_rpc_endpoint
        peer_fid = message.process_fid
        error_code = EAGAIN

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(peer_endpoint,
                                                 str(peer_fid)))
        rm_node = None
        try:
            consul = self.consul_util
            # Hax may still receive an entrypoint request from motr land
            # while stopping. To unblock motr land the reply then carries
            # no confds and no RM endpoint, as the processes might have
            # stopped already.
            quorum = 0
            rm_fid = Fid(0, 0)
            if self.is_stopping:
                confds = []
            else:
                leader_session = consul.get_leader_session()
                rm_node = consul.get_session_node(leader_session)
                confds = consul.get_confd_list()

            # Background: hax receives entrypoint requests many times
            # during its lifetime. Motr rconfc (started by hax to invoke
            # spiel commands) connects to the principal RM; when the
            # principal RM fails, rconfc invalidates its confc and requests
            # the entrypoint again, hoping that another confd and principal
            # RM got elected. During shutdown hax broadcasts M0_NC_FAILED
            # for every STOPPED or FAILED motr process, confds included,
            # and rconfc re-requests the entrypoint on each of them, which
            # ends up in a loop. Breaking the loop requires reporting alive
            # confds and RM endpoints only, once bootstrapping is done.
            #
            # EOS-25726: that filtering is intentionally disabled for now.
            # It seems that confds were reported as started and failed
            # later (possibly Motr issue EOS-25695). When processes start
            # out of order, a wrong quorum value is then reported, which
            # leads to further issues in Motr process startup. Need to
            # verify if this affects hax shutdown. The disabled logic was:
            #     if self.spiel_ready:
            #         keep only the confds for which
            #         util.is_confd_failed(confd.fid) is false

            if confds:
                rm_fid = consul.get_rm_fid()
                quorum = int(len(confds) / 2 + 1)
            # Address of the first confd located on the leader session
            # node, if any.
            rm_endpoint = next(
                (svc.address for svc in confds if svc.node == rm_node),
                None)
            if confds and not (self.is_stopping or rm_endpoint):
                if consul.m0ds_stopping():
                    error_code = 0
                raise RuntimeError('No RM node found in Consul')
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code, with a 1'
                          ' second delay.')
            # On EAGAIN motr immediately sends the next entrypoint request,
            # and several requests per second were observed, which floods
            # Hare. As an intermediate solution Hare dropped the requests
            # that it failed to prepare a reply for, but motr does not
            # re-send the request after a timeout as expected. The delay
            # below is the agreed temporary fix on the Hare side; the motr
            # ticket https://jts.seagate.com/browse/EOS-27068 tracks it.
            sleep(1)
            self._ffi.entrypoint_reply(
                ctx, request_id.to_c(), error_code, 0,
                make_array(FidStruct, []),
                make_array(c.c_char_p, []),
                0, Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        c_fids = [confd.fid.to_c() for confd in confds]
        c_endpoints = [make_c_str(confd.address) for confd in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        # NOTE(review): rm_endpoint is None when stopping or when there are
        # no confds; this assumes make_c_str accepts None - confirm.
        self._ffi.entrypoint_reply(
            ctx, request_id.to_c(), 0, len(confds),
            make_array(FidStruct, c_fids),
            make_array(c.c_char_p, c_endpoints),
            quorum, rm_fid.to_c(), make_c_str(rm_endpoint))
        LOG.debug('Entrypoint request has been replied to')
Beispiel #10
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to an entrypoint request with confd and RM details.

        While ``self.is_stopping`` is set, the reply carries no confds, a
        zero quorum and a zero RM fid. Otherwise the leader session, its
        node and the confd list are read through ``consul_util``; once
        ``self.spiel_ready`` is set, confds reported as failed by
        ``consul_util.is_confd_failed`` are left out. The reply reports
        the remaining confds, the quorum and the address of the first
        confd located on the node of the leader session.

        If preparing the data raises, or no RM endpoint is found among the
        confds, an empty reply is sent. Its return code is EAGAIN, or 0
        when no RM endpoint was found and ``consul_util.m0ds_stopping()``
        is true.
        """
        ctx = message.reply_context
        request_id = message.req_id
        peer_endpoint = message.remote_rpc_endpoint
        peer_fid = message.process_fid
        error_code = EAGAIN

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(peer_endpoint,
                                                 str(peer_fid)))
        rm_node = None
        try:
            consul = self.consul_util
            # Hax may still receive an entrypoint request from motr land
            # while stopping. To unblock motr land the reply then carries
            # no confds and no RM endpoint, as the processes might have
            # stopped already.
            quorum = 0
            rm_fid = Fid(0, 0)
            if self.is_stopping:
                confds = []
            else:
                leader_session = consul.get_leader_session_no_wait()
                rm_node = consul.get_session_node(leader_session)
                confds = consul.get_confd_list()

            # Background: hax receives entrypoint requests many times
            # during its lifetime. Motr rconfc (started by hax to invoke
            # spiel commands) connects to the principal RM; when the
            # principal RM fails, rconfc invalidates its confc and requests
            # the entrypoint again, hoping that another confd and principal
            # RM got elected. During shutdown hax broadcasts M0_NC_FAILED
            # for every STOPPED or FAILED motr process, confds included,
            # and rconfc re-requests the entrypoint on each of them, which
            # ends up in a loop. To break the loop only alive confds and RM
            # endpoints are reported. The filtering starts only after
            # bootstrapping is done (all motr services are up), which is
            # what the spiel_ready flag is checked for.
            if self.spiel_ready:
                confds = [
                    confd for confd in confds
                    if not consul.is_confd_failed(confd.fid)
                ]

            if confds:
                rm_fid = consul.get_rm_fid()
                quorum = int(len(confds) / 2 + 1)
            # Address of the first confd located on the leader session
            # node, if any.
            rm_endpoint = next(
                (svc.address for svc in confds if svc.node == rm_node),
                None)
            if confds and not (self.is_stopping or rm_endpoint):
                if consul.m0ds_stopping():
                    error_code = 0
                raise RuntimeError('No RM node found in Consul')
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code.')
            # Empty reply: no confds, zero quorum, zero RM fid, no RM
            # endpoint.
            self._ffi.entrypoint_reply(
                ctx, request_id.to_c(), error_code, 0,
                make_array(FidStruct, []),
                make_array(c.c_char_p, []),
                0, Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        c_fids = [confd.fid.to_c() for confd in confds]
        c_endpoints = [make_c_str(confd.address) for confd in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        # NOTE(review): rm_endpoint is None when stopping or when no confd
        # is left; this assumes make_c_str accepts None - confirm.
        self._ffi.entrypoint_reply(
            ctx, request_id.to_c(), 0, len(confds),
            make_array(FidStruct, c_fids),
            make_array(c.c_char_p, c_endpoints),
            quorum, rm_fid.to_c(), make_c_str(rm_endpoint))
        LOG.debug('Entrypoint request has been replied to')