def notify_hax_stop(self):
    """Send the 'hax stop' notification through the FFI layer.

    The hax fid and endpoint are looked up via ``self.consul_util``; the
    message ids returned by ``hax_stop`` are wrapped into a
    ``HaLinkMessagePromise`` and handed to ``self.herald.wait_for_all``.
    """
    LOG.debug('Notifying hax stop')
    consul = self.consul_util
    fid = consul.get_hax_fid()
    endpoint = consul.get_hax_endpoint()
    message_ids = self._ffi.hax_stop(self._ha_ctx,
                                     fid.to_c(),
                                     make_c_str(endpoint))
    promise = HaLinkMessagePromise(message_ids)
    self.herald.wait_for_all(promise)
def _ha_broadcast(self, notes: List[HaNoteStruct],
                  broadcast_hax_only: bool) -> List[MessageId]:
    """Broadcast HA notes in chunks and return the resulting message ids.

    The notes are sent in slices of at most ``MAX_MOTR_NVEC_UPDATE_SZ``
    items. The message ids reported for every slice are accumulated, so
    the caller receives the ids of all the messages that were sent, not
    only those of the last slice.

    Args:
        notes: HA notes to broadcast.
        broadcast_hax_only: when true, the notes are sent with
            ``ha_broadcast_hax_only`` together with the hax endpoint
            taken from Consul; otherwise ``ha_broadcast`` is used.

    Returns:
        Message ids collected from all the slices (empty if ``notes``
        is empty).
    """
    message_ids: List[MessageId] = []
    nr_notes_to_be_sent = len(notes)
    notes_sent = 0
    LOG.debug('Broadcasting %d notes', nr_notes_to_be_sent)

    # Walk the list by offset instead of re-slicing the remainder on
    # every iteration.
    for offset in range(0, nr_notes_to_be_sent, MAX_MOTR_NVEC_UPDATE_SZ):
        notes_to_send = notes[offset:offset + MAX_MOTR_NVEC_UPDATE_SZ]
        notes_to_send_len = len(notes_to_send)
        notes_sent += notes_to_send_len
        if broadcast_hax_only:
            hax_endpoint = self.consul_util.get_hax_endpoint()
            chunk_ids = self._ffi.ha_broadcast_hax_only(
                self._ha_ctx, make_array(HaNoteStruct, notes_to_send),
                notes_to_send_len, make_c_str(hax_endpoint))
        else:
            chunk_ids = self._ffi.ha_broadcast(
                self._ha_ctx, make_array(HaNoteStruct, notes_to_send),
                notes_to_send_len)
        LOG.debug('Broadcast HA state complete, message_ids = %s',
                  chunk_ids)
        # Accumulate rather than overwrite: every slice produces its own
        # message ids. Tolerate a falsy result from the FFI layer.
        message_ids.extend(chunk_ids or [])

    # Internal invariant: every note must have been handed to the FFI.
    assert notes_sent == nr_notes_to_be_sent
    return message_ids
def send_entrypoint_request_reply(self, message: EntrypointRequest):
    """Reply to an entrypoint request with confd and RM information.

    The leader session, principal RM node, confd list and RM fid are read
    from Consul. On success the reply carries all the confd fids and
    endpoints, the quorum value (``len(confds) // 2 + 1``), the RM fid
    and the endpoint of the confd located on the principal RM node.

    If any of the Consul lookups fails, or no confd is found on the
    principal RM node, the request is still answered: an empty reply
    with the ``EAGAIN`` code is sent, so the request is never left
    without a reply.

    Args:
        message: the entrypoint request; its ``reply_context`` and
            ``req_id`` are passed back to the FFI layer.
    """
    reply_context = message.reply_context
    req_id = message.req_id
    remote_rpc_endpoint = message.remote_rpc_endpoint
    process_fid = message.process_fid

    LOG.debug('Processing entrypoint request from remote endpoint'
              " '{}', process fid {}".format(remote_rpc_endpoint,
                                             str(process_fid)))
    try:
        util = self.consul_util
        sess = util.get_leader_session_no_wait()
        principal_rm = util.get_session_node(sess)
        confds = util.get_confd_list()
        rm_fid = util.get_rm_fid()

        # The RM endpoint is the address of the first confd that runs on
        # the principal RM node.
        rm_eps = next(
            (svc.address for svc in confds if svc.node == principal_rm),
            None)
        if not rm_eps:
            # Raised inside the try block on purpose: the handler below
            # sends the EAGAIN reply instead of dropping the request.
            raise RuntimeError('No RM node found in Consul')
    except Exception:
        LOG.exception('Failed to get the data from Consul.'
                      ' Replying with EAGAIN error code.')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), EAGAIN, 0,
                                   make_array(FidStruct, []),
                                   make_array(c.c_char_p, []), 0,
                                   Fid(0, 0).to_c(), None)
        LOG.debug('Reply sent')
        return

    rc_quorum = len(confds) // 2 + 1
    confd_fids = [x.fid.to_c() for x in confds]
    confd_eps = [make_c_str(x.address) for x in confds]

    LOG.debug('Passing the entrypoint reply to hax.c layer')
    self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0,
                               len(confds),
                               make_array(FidStruct, confd_fids),
                               make_array(c.c_char_p, confd_eps),
                               rc_quorum, rm_fid.to_c(),
                               make_c_str(rm_eps))
    LOG.debug('Entrypoint request has been replied to')
def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
          rm_service: Fid):
    """Start m0_halon_interface through the FFI layer.

    The process fid is remembered in ``self._process_fid``.

    Raises:
        RuntimeError: if the FFI ``start`` call returns a non-zero code.
    """
    LOG.debug('Starting m0_halon_interface')
    self._process_fid = process

    rc = self._ffi.start(self._ha_ctx,
                         make_c_str(rpc_endpoint),
                         process.to_c(),
                         ha_service.to_c(),
                         rm_service.to_c())
    if not rc:
        return

    LOG.error(
        'Cannot start Motr API. m0_halon_interface::start'
        ' returned non-zero code (%s)', rc)
    raise RuntimeError('Cannot start m0_halon_interface.'
                       'Please check Motr logs for more details.')
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Queue a process event and, for a STOPPED event, report the link stop.

    A ``ProcessEvent`` wrapping a ``ConfHaProcess`` is always added to
    ``self.planner``. When the event is ``M0_CONF_HA_PROCESS_STOPPED`` and
    Consul resolves the fid to an endpoint, ``hax_link_stopped`` is called
    for that endpoint.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)

    ha_process = ConfHaProcess(chp_event=chp_event,
                               chp_type=chp_type,
                               chp_pid=chp_pid,
                               fid=fid)
    self.planner.add_command(ProcessEvent(ha_process))

    is_stopped = chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
    if not is_stopped:
        return

    endpoint = self.consul_util.fid_to_endpoint(fid)
    if endpoint:
        self._ffi.hax_link_stopped(self._ha_ctx, make_c_str(endpoint))
def __init__(self, ffi: HaxFFI, queue, rm_fid: Fid, herald: DeliveryHerald,
             node_uuid: str = ''):
    """Initialize the Motr API context and store the collaborators.

    A new ``HaxFFI`` is created when ``ffi`` is falsy.

    Raises:
        RuntimeError: if ``init_motr_api`` returns a falsy context.
    """
    if not ffi:
        ffi = HaxFFI()
    self._ffi = ffi

    # [KN] node_uuid is currently ignored by the corresponding hax.c
    # function.
    self._ha_ctx = self._ffi.init_motr_api(self, make_c_str(node_uuid))
    self.queue, self.rm_fid, self.herald = queue, rm_fid, herald

    if self._ha_ctx:
        return

    LOG.error('Cannot initialize Motr API. m0_halon_interface_init'
              ' returned 0')
    raise RuntimeError('Cannot initialize Motr API')
def __init__(self, ffi: HaxFFI, planner: WorkPlanner, herald: DeliveryHerald,
             consul_util: ConsulUtil, node_uuid: str = ''):
    """Initialize the Motr API context and store the collaborators.

    A new ``HaxFFI`` is created when ``ffi`` is falsy. The ``spiel_ready``
    and ``is_stopping`` flags start as ``False``.

    Raises:
        RuntimeError: if ``init_motr_api`` returns a falsy context.
    """
    if not ffi:
        ffi = HaxFFI()
    self._ffi = ffi

    # [KN] node_uuid is currently ignored by the corresponding hax.c
    # function.
    self._ha_ctx = self._ffi.init_motr_api(self, make_c_str(node_uuid))

    self.planner, self.herald = planner, herald
    self.consul_util = consul_util
    self.spiel_ready = self.is_stopping = False

    if self._ha_ctx:
        return

    LOG.error('Cannot initialize Motr API. m0_halon_interface_init'
              ' returned 0')
    raise RuntimeError('Cannot initialize Motr API')
def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
          profile: Profile):
    """Start m0_halon_interface, taking the RM fid from Consul.

    The process fid and profile are remembered on the instance. The RM
    fid lookup is wrapped with ``repeat_if_fails()``.

    Raises:
        RuntimeError: if the FFI ``start`` call returns a non-zero code.
    """
    LOG.debug('Starting m0_halon_interface')
    self._process_fid, self._profile = process, profile

    @repeat_if_fails()
    def _get_rm_fid() -> Fid:
        return self.consul_util.get_rm_fid()

    # Resolve the RM fid before touching the FFI layer.
    rm_service_fid = _get_rm_fid()
    rc = self._ffi.start(self._ha_ctx,
                         make_c_str(rpc_endpoint),
                         process.to_c(),
                         ha_service.to_c(),
                         rm_service_fid.to_c())
    if not rc:
        return

    LOG.error(
        'Cannot start Motr API. m0_halon_interface::start'
        ' returned non-zero code (%s)', rc)
    raise RuntimeError('Cannot start m0_halon_interface.'
                       'Please check Motr logs for more details.')
def send_entrypoint_request_reply(self, message: EntrypointRequest):
    """Reply to an entrypoint request with confd and RM information.

    On success the reply carries the confd fids and endpoints, the quorum
    value, the RM fid and the endpoint of the confd located on the
    principal RM node. While hax is stopping, an empty confd list with a
    zero RM fid is reported instead.

    If anything fails while the data is being prepared, an empty reply is
    sent after a one second delay. Its code is ``EAGAIN``, or ``0`` when
    no RM endpoint was found and ``m0ds_stopping()`` reports true.

    Args:
        message: the entrypoint request; its ``reply_context`` and
            ``req_id`` are passed back to the FFI layer.
    """
    reply_context = message.reply_context
    req_id = message.req_id
    LOG.debug('Processing entrypoint request from remote endpoint'
              " '{}', process fid {}".format(message.remote_rpc_endpoint,
                                             str(message.process_fid)))

    error_code = EAGAIN
    rm_node = None
    try:
        consul = self.consul_util
        # Defaults used when there are no confds to report.
        quorum = 0
        rm_service_fid = Fid(0, 0)

        if self.is_stopping:
            # Hax may still receive an entrypoint request from motr land
            # while stopping. To unblock motr land, reply with no confds
            # and no RM endpoint: the processes might have already
            # stopped.
            confd_list = []
        else:
            session = consul.get_leader_session()
            rm_node = consul.get_session_node(session)
            confd_list = consul.get_confd_list()

        # Hax may receive entrypoint requests many times during its
        # lifetime: Motr rconfc (started by hax to invoke spiel commands)
        # connects to the principal RM and, when the principal RM fails,
        # invalidates its confc and requests the entrypoint again, hoping
        # that another confd and principal RM get elected. During
        # shutdown hax broadcasts M0_NC_FAILED for every STOPPED or
        # FAILED motr process, confds included, so rconfc keeps
        # re-requesting the entrypoint in a loop. Breaking that loop
        # requires reporting only alive confds and RM endpoints, and only
        # once bootstrapping is done, that is when all the motr services
        # are up.
        #
        # EOS-25726: that filtering is intentionally disabled for now. It
        # seems confds were reported as started and failed later
        # (possibly Motr issue EOS-25695); when processes start out of
        # order a wrong quorum value is reported, which breaks Motr
        # process startup. Need to verify if this affects hax shutdown.
        # The disabled logic, applied only when self.spiel_ready is set,
        # kept the confds for which util.is_confd_failed(confd.fid) is
        # false.

        if confd_list:
            rm_service_fid = consul.get_rm_fid()
            quorum = len(confd_list) // 2 + 1

        # Address of the first confd located on the principal RM node.
        rm_endpoint = next(
            (svc.address for svc in confd_list if svc.node == rm_node),
            None)

        if confd_list and not (self.is_stopping or rm_endpoint):
            if consul.m0ds_stopping():
                error_code = 0
            raise RuntimeError('No RM node found in Consul')
    except Exception:
        LOG.exception('Failed to get the data from Consul.'
                      ' Replying with EAGAIN error code, with a 1'
                      ' second delay.')
        # After an EAGAIN reply motr immediately sends another entrypoint
        # request, and several requests per second were observed, which
        # floods Hare. Dropping the failed requests was tried as an
        # intermediate solution, but then motr did not re-send the
        # request after a timeout as expected. The delay below is the
        # agreed temporary fix in Hare; the motr side is tracked in
        # https://jts.seagate.com/browse/EOS-27068.
        sleep(1)
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(),
                                   error_code, 0,
                                   make_array(FidStruct, []),
                                   make_array(c.c_char_p, []), 0,
                                   Fid(0, 0).to_c(), None)
        LOG.debug('Reply sent')
        return

    fids = [svc.fid.to_c() for svc in confd_list]
    endpoints = [make_c_str(svc.address) for svc in confd_list]

    LOG.debug('Passing the entrypoint reply to hax.c layer')
    self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0,
                               len(confd_list),
                               make_array(FidStruct, fids),
                               make_array(c.c_char_p, endpoints),
                               quorum, rm_service_fid.to_c(),
                               make_c_str(rm_endpoint))
    LOG.debug('Entrypoint request has been replied to')
def send_entrypoint_request_reply(self, message: EntrypointRequest):
    """Reply to an entrypoint request with alive confds and RM information.

    On success the reply carries the confd fids and endpoints, the quorum
    value, the RM fid and the endpoint of the confd located on the
    principal RM node. Once ``self.spiel_ready`` is set, confds reported
    as failed by Consul are left out. While hax is stopping, an empty
    confd list with a zero RM fid is reported instead.

    If anything fails while the data is being prepared, an empty reply is
    sent. Its code is ``EAGAIN``, or ``0`` when no RM endpoint was found
    and ``m0ds_stopping()`` reports true.

    Args:
        message: the entrypoint request; its ``reply_context`` and
            ``req_id`` are passed back to the FFI layer.
    """
    reply_context = message.reply_context
    req_id = message.req_id
    LOG.debug('Processing entrypoint request from remote endpoint'
              " '{}', process fid {}".format(message.remote_rpc_endpoint,
                                             str(message.process_fid)))

    error_code = EAGAIN
    rm_node = None
    try:
        consul = self.consul_util
        # Defaults used when there are no confds to report.
        quorum = 0
        rm_service_fid = Fid(0, 0)

        if self.is_stopping:
            # Hax may still receive an entrypoint request from motr land
            # while stopping. To unblock motr land, reply with no confds
            # and no RM endpoint: the processes might have already
            # stopped.
            confd_list = []
        else:
            session = consul.get_leader_session_no_wait()
            rm_node = consul.get_session_node(session)
            confd_list = consul.get_confd_list()

        # Hax may receive entrypoint requests many times during its
        # lifetime: Motr rconfc (started by hax to invoke spiel commands)
        # connects to the principal RM and, when the principal RM fails,
        # invalidates its confc and requests the entrypoint again, hoping
        # that another confd and principal RM get elected. During
        # shutdown hax broadcasts M0_NC_FAILED for every STOPPED or
        # FAILED motr process, confds included, so rconfc keeps
        # re-requesting the entrypoint in a loop. To break that loop only
        # alive confds and RM endpoints are reported. The filter is
        # applied only after bootstrapping is done (all the motr services
        # are up), hence the spiel_ready check.
        if self.spiel_ready:
            confd_list = [confd for confd in confd_list
                          if not consul.is_confd_failed(confd.fid)]

        if confd_list:
            rm_service_fid = consul.get_rm_fid()
            quorum = len(confd_list) // 2 + 1

        # Address of the first confd located on the principal RM node.
        rm_endpoint = next(
            (svc.address for svc in confd_list if svc.node == rm_node),
            None)

        if confd_list and not (self.is_stopping or rm_endpoint):
            if consul.m0ds_stopping():
                error_code = 0
            raise RuntimeError('No RM node found in Consul')
    except Exception:
        LOG.exception('Failed to get the data from Consul.'
                      ' Replying with EAGAIN error code.')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(),
                                   error_code, 0,
                                   make_array(FidStruct, []),
                                   make_array(c.c_char_p, []), 0,
                                   Fid(0, 0).to_c(), None)
        LOG.debug('Reply sent')
        return

    fids = [svc.fid.to_c() for svc in confd_list]
    endpoints = [make_c_str(svc.address) for svc in confd_list]

    LOG.debug('Passing the entrypoint reply to hax.c layer')
    self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0,
                               len(confd_list),
                               make_array(FidStruct, fids),
                               make_array(c.c_char_p, endpoints),
                               quorum, rm_service_fid.to_c(),
                               make_c_str(rm_endpoint))
    LOG.debug('Entrypoint request has been replied to')