def start_rebalance(self, pool_fid: Fid):
     """Request the start of rebalance for the given pool via the FFI layer.

     Args:
         pool_fid: fid of the pool to rebalance; converted with ``to_c()``
             before it is handed to the FFI call.

     Raises:
         RepairRebalanceException: if the FFI call returns a non-zero code.
     """
     LOG.debug('Initiating rebalance for pool %s', pool_fid)
     result: int = self._ffi.start_rebalance(self._ha_ctx, pool_fid.to_c())
     if result:
         # Any non-zero return code is treated as a failure to send the
         # request. Note the trailing space after the comma: adjacent
         # literals are joined verbatim.
         raise RepairRebalanceException(
             'Failed to send SPIEL request "sns_rebalance_start", '
             'please check Motr logs for more details.')
     LOG.debug('Rebalancing started for pool %s', pool_fid)
 def get_rebalance_status(self, pool_fid: Fid) -> List[ReprebStatus]:
     """Return the rebalance status that the FFI layer reports for a pool.

     Args:
         pool_fid: fid of the pool being queried.

     Returns:
         Whatever ``self._ffi.rebalance_status`` returned, as long as it is
         not ``None``.

     Raises:
         RepairRebalanceException: if the FFI call returned ``None``.
     """
     LOG.debug('Fetching rebalance status for pool %s', pool_fid)
     reported: List[ReprebStatus] = self._ffi.rebalance_status(
         self._ha_ctx,
         pool_fid.to_c(),
     )
     if reported is not None:
         LOG.debug('rebalance status for pool %s: %s', pool_fid, reported)
         return reported
     # ``None`` is the only value treated as "no status"; an empty list is
     # still returned to the caller above.
     raise RepairRebalanceException('rebalance status unavailable')
Example #3
0
 def resume_repair(self, pool_fid: Fid):
     """Request that repair be resumed for the given pool via the FFI layer.

     Args:
         pool_fid: fid of the pool whose repair should be resumed;
             converted with ``to_c()`` before it is handed to the FFI call.

     Raises:
         RepairRebalanceException: if the FFI call returns a non-zero code.
     """
     LOG.debug('Resuming repair for pool %s', pool_fid)
     result: int = self._ffi.resume_repair(self._ha_ctx, pool_fid.to_c())
     if result:
         # Any non-zero return code is treated as a failure to send the
         # request. The first literal ends with a space because adjacent
         # literals are joined verbatim.
         raise RepairRebalanceException(
             'Failed to send SPIEL request "sns_repair_resume", '
             'please check Motr logs for more details.')
     LOG.debug('Repairing resumed for pool %s', pool_fid)
Example #4
0
 def stop_repair(self, pool_fid: Fid):
     """Request that repair be stopped for the given pool via the FFI layer.

     Args:
         pool_fid: fid of the pool whose repair should be stopped;
             converted with ``to_c()`` before it is handed to the FFI call.

     Raises:
         RepairRebalanceException: if the FFI call returns a non-zero code.
     """
     LOG.debug('Stopping repair for pool %s', pool_fid)
     result: int = self._ffi.stop_repair(self._ha_ctx, pool_fid.to_c())
     if result:
         # Any non-zero return code is treated as a failure to send the
         # request.
         raise RepairRebalanceException(
             'Failed to send SPIEL request "sns_repair_stop", please'
             ' check Motr logs for more details.')
     LOG.debug('Repairing stopped for pool %s', pool_fid)
Example #5
0
    def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
              profile: Profile):
        """Start m0_halon_interface through the FFI layer.

        The process fid and the profile are stored on the instance before
        the RM fid is looked up in Consul and the FFI ``start`` call is made.

        Args:
            rpc_endpoint: endpoint string, passed to the FFI as a C string.
            process: fid of the process; kept in ``self._process_fid``.
            ha_service: fid of the HA service, passed to the FFI.
            profile: kept in ``self._profile``.

        Raises:
            RuntimeError: if the FFI ``start`` call returns a non-zero code.
        """
        LOG.debug('Starting m0_halon_interface')
        self._process_fid = process
        self._profile = profile

        # NOTE(review): repeat_if_fails() presumably retries the Consul
        # lookup on failure -- confirm against its definition.
        @repeat_if_fails()
        def _get_rm_fid() -> Fid:
            return self.consul_util.get_rm_fid()

        rm_fid = _get_rm_fid()
        result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                                 process.to_c(), ha_service.to_c(),
                                 rm_fid.to_c())
        if result:
            LOG.error(
                'Cannot start Motr API. m0_halon_interface::start'
                ' returned non-zero code (%s)', result)
            # The first literal ends with a space so that the two sentences
            # are not glued together in the resulting message.
            raise RuntimeError('Cannot start m0_halon_interface. '
                               'Please check Motr logs for more details.')
Example #6
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to an entrypoint request using data fetched from Consul.

        On success the reply carries the confd fids and endpoints, the
        quorum value, and the RM fid and endpoint. If anything raises while
        that data is being prepared, an empty reply is sent instead after a
        one second pause. Its return code is EAGAIN, or 0 when no RM
        endpoint was found and ``m0ds_stopping()`` reports true.

        Args:
            message: the request; its ``reply_context``, ``req_id``,
                ``remote_rpc_endpoint`` and ``process_fid`` are read.
        """
        reply_context = message.reply_context
        req_id = message.req_id
        error_code = EAGAIN

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(message.remote_rpc_endpoint,
                                                 str(message.process_fid)))
        confds = leader_node = None
        try:
            consul = self.consul_util
            # Defaults for a reply that reports no confds. While hax is
            # stopping it may still receive an entrypoint request from
            # Motr; to unblock Motr we answer with no confds and no RM
            # endpoint, because those processes might have stopped already.
            quorum = 0
            rm_fid = Fid(0, 0)
            if not self.is_stopping:
                session = consul.get_leader_session()
                leader_node = consul.get_session_node(session)
                confds = consul.get_confd_list()
            else:
                confds = []

            # Background: hax can receive entrypoint requests many times
            # during its lifetime. Hax starts Motr rconfc to run spiel
            # commands, and rconfc connects to the principal RM. When the
            # principal RM fails, rconfc invalidates its confc and asks for
            # the entrypoint again, hoping that another confd and principal
            # RM have been elected. During shutdown hax broadcasts
            # M0_NC_FAILED for every STOPPED or FAILED Motr process
            # (confds included), rconfc reacts by re-requesting the
            # entrypoint, and this repeats in a loop. Breaking the loop
            # requires the reply to list only alive confds and RM
            # endpoints, while still handling bootstrap (wait until all
            # Motr services are up, then exclude failed confds).
            #
            # EOS-25726: confds appeared to be reported as started and then
            # failed later, possibly because of Motr issue EOS-25695. When
            # processes start out of order this produces a wrong quorum
            # value and further Motr startup problems, so the filtering
            # below is disabled for now. Whether this affects hax shutdown
            # still needs to be verified.
            # active_confds = []
            # if self.spiel_ready:
            #     for confd in confds:
            #         if not util.is_confd_failed(confd.fid):
            #             active_confds.append(confd)
            #     confds = active_confds

            if confds:
                rm_fid = consul.get_rm_fid()
                quorum = len(confds) // 2 + 1
            # The RM endpoint is the address of the first confd that lives
            # on the node holding the leader session, if there is one.
            rm_endpoint = next(
                (svc.address for svc in confds if svc.node == leader_node),
                None)
            if confds and not (self.is_stopping or rm_endpoint):
                if consul.m0ds_stopping():
                    error_code = 0
                raise RuntimeError('No RM node found in Consul')
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code, with a 1'
                          ' second delay.')
            # Motr re-sends the entrypoint request immediately after an
            # EAGAIN reply, and several requests per second were seen
            # flooding Hare. Dropping the failed requests was tried as an
            # intermediate solution, but Motr then did not re-send after a
            # timeout as expected. Delaying the reply is the temporary fix
            # agreed for Hare; the Motr side is tracked in
            # https://jts.seagate.com/browse/EOS-27068.
            sleep(1)
            self._ffi.entrypoint_reply(
                reply_context, req_id.to_c(), error_code, 0,
                make_array(FidStruct, []),
                make_array(c.c_char_p, []), 0,
                Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        confd_fids = [svc.fid.to_c() for svc in confds]
        confd_eps = [make_c_str(svc.address) for svc in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        self._ffi.entrypoint_reply(
            reply_context, req_id.to_c(), 0, len(confds),
            make_array(FidStruct, confd_fids),
            make_array(c.c_char_p, confd_eps),
            quorum, rm_fid.to_c(), make_c_str(rm_endpoint))
        LOG.debug('Entrypoint request has been replied to')
Example #7
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to an entrypoint request using data fetched from Consul.

        On success the reply carries the fids and endpoints of the confds
        (only those not reported as failed once ``self.spiel_ready`` is
        true), the quorum value, and the RM fid and endpoint. If anything
        raises while that data is being prepared, an empty reply is sent
        instead. Its return code is EAGAIN, or 0 when no RM endpoint was
        found and ``m0ds_stopping()`` reports true.

        Args:
            message: the request; its ``reply_context``, ``req_id``,
                ``remote_rpc_endpoint`` and ``process_fid`` are read.
        """
        reply_context = message.reply_context
        req_id = message.req_id
        error_code = EAGAIN

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(message.remote_rpc_endpoint,
                                                 str(message.process_fid)))
        confds = leader_node = None
        try:
            consul = self.consul_util
            # Defaults for a reply that reports no confds. While hax is
            # stopping it may still receive an entrypoint request from
            # Motr; to unblock Motr we answer with no confds and no RM
            # endpoint, because those processes might have stopped already.
            quorum = 0
            rm_fid = Fid(0, 0)
            if not self.is_stopping:
                session = consul.get_leader_session_no_wait()
                leader_node = consul.get_session_node(session)
                confds = consul.get_confd_list()
            else:
                confds = []

            # Background: hax can receive entrypoint requests many times
            # during its lifetime. Hax starts Motr rconfc to run spiel
            # commands, and rconfc connects to the principal RM. When the
            # principal RM fails, rconfc invalidates its confc and asks for
            # the entrypoint again, hoping that another confd and principal
            # RM have been elected. During shutdown hax broadcasts
            # M0_NC_FAILED for every STOPPED or FAILED Motr process
            # (confds included), rconfc reacts by re-requesting the
            # entrypoint, and this repeats in a loop. To break the loop the
            # reply must list only alive confds and RM endpoints. Bootstrap
            # is handled by filtering only once spiel is ready, i.e. after
            # all Motr services are up.
            if self.spiel_ready:
                confds = [
                    confd for confd in confds
                    if not consul.is_confd_failed(confd.fid)
                ]

            if confds:
                rm_fid = consul.get_rm_fid()
                quorum = len(confds) // 2 + 1
            # The RM endpoint is the address of the first remaining confd
            # that lives on the node holding the leader session, if any.
            rm_endpoint = next(
                (svc.address for svc in confds if svc.node == leader_node),
                None)
            if confds and not (self.is_stopping or rm_endpoint):
                if consul.m0ds_stopping():
                    error_code = 0
                raise RuntimeError('No RM node found in Consul')
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code.')
            # Empty reply: no confds, zero quorum, null RM fid and endpoint.
            self._ffi.entrypoint_reply(
                reply_context, req_id.to_c(), error_code, 0,
                make_array(FidStruct, []),
                make_array(c.c_char_p, []), 0,
                Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        confd_fids = [svc.fid.to_c() for svc in confds]
        confd_eps = [make_c_str(svc.address) for svc in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        self._ffi.entrypoint_reply(
            reply_context, req_id.to_c(), 0, len(confds),
            make_array(FidStruct, confd_fids),
            make_array(c.c_char_p, confd_eps),
            quorum, rm_fid.to_c(), make_c_str(rm_endpoint))
        LOG.debug('Entrypoint request has been replied to')