def _update_process_status(self, p: WorkPlanner, motr: Motr,
                           event: ConfHaProcess) -> None:
    """Persist the process state transition in Consul and broadcast the
    corresponding HA state to the cluster.

    If a consul-related exception appears, it will be processed by
    repeat_if_fails; this thread stays blocked until that intermittent
    error gets resolved.
    """
    started = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
    stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
    # Service health that corresponds to every supported
    # (process type, process event) combination.
    status_map = {
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS, started):
            ServiceHealth.OK,
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS, stopped):
            ServiceHealth.OFFLINE,
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D, started):
            ServiceHealth.OK,
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D, stopped):
            ServiceHealth.FAILED,
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER, started):
            ServiceHealth.OK,
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER, stopped):
            ServiceHealth.FAILED,
    }
    self.consul.update_process_status(event)
    if event.chp_event not in (started, stopped):
        return
    health = status_map[(event.chp_type, event.chp_event)]
    motr.broadcast_ha_states([HAState(fid=event.fid, status=health)])
def run_in_consumer(mocker, msg: BaseMessage, planner: WorkPlanner,
                    consumer: ConsumerThread, motr: Motr) -> None:
    """Push a single message through the consumer thread logic.

    The planner is patched to hand out *msg* followed by a Die() poison
    pill, so _do_work() processes exactly one command and terminates.
    """
    mocker.patch.object(planner,
                        'get_next_command',
                        side_effect=[msg, Die()])
    test_profile = Profile(fid=create_profile_fid(22),
                           name='the_pool',
                           pool_names=['name1'])
    motr.start('endpoint', create_process_fid(120), create_process_fid(15),
               test_profile)
    consumer._do_work(planner, motr)
def _execute(self, motr: Motr):
    """Thread body of the byte-count updater.

    While this node is the RC leader and Spiel is ready, periodically
    (every self.interval_sec) collects per-ioservice byte counts and
    stores the aggregated per-pver values in Consul KV.
    """
    try:
        LOG.info('byte-count updater thread has started')
        while not self.stopped:
            if not self.consul.am_i_rc():
                # Only the RC leader maintains the byte counts.
                wait_for_event(self.event, self.interval_sec)
                continue
            if not motr.is_spiel_ready():
                wait_for_event(self.event, self.interval_sec)
                continue
            processes: List[Tuple[Fid, ObjHealth]] = \
                self.consul.get_proc_fids_with_status(['ios'])
            if not processes:
                # Fix: wait before retrying. A bare `continue` here
                # busy-looped against Consul while no ioservice was up.
                wait_for_event(self.event, self.interval_sec)
                continue
            try:
                for ios, status in processes:
                    if status == ObjHealth.OK:
                        byte_count: ByteCountStats = \
                            motr.get_proc_bytecount(ios)
                        LOG.debug('Received bytecount: %s', byte_count)
                        if not byte_count:
                            continue
                        self.consul.update_pver_bc(byte_count)
                pver_items = self._get_pver_with_pver_status(motr)
                # Fix: fall through to the wait below when there is
                # nothing to aggregate, instead of busy-looping.
                if pver_items:
                    pver_bc = self._calculate_bc_per_pver(pver_items)
                    self.consul.update_bc_for_dg_category(
                        pver_bc, pver_items)
            except HAConsistencyException:
                LOG.exception('Failed to update Consul KV '
                              'due to an intermittent error. The '
                              'error is swallowed since new attempts '
                              'will be made timely')
            except BytecountException as e:
                LOG.exception(
                    'Failed due to %s. Aborting this iteration.'
                    ' Waiting for next attempt.', e.message)
            wait_for_event(self.event, self.interval_sec)
    except InterruptedException:
        # No op. _sleep() has interrupted before the timeout exceeded:
        # the application is shutting down.
        # There are no resources that we need to dispose specially.
        pass
    except Exception:
        LOG.exception('Aborting due to an error')
    finally:
        # Fix: normal-exit message must not use LOG.exception — outside
        # an exception handler it logs a spurious 'NoneType: None'
        # traceback. LOG.debug matches the sibling updater threads.
        LOG.debug('byte-count updater thread exited')
def _execute(self, motr: Motr):
    """Thread body of the filesystem stats updater (rconfc-owning
    variant).

    Adopts the current thread into Motr, then periodically (on the RC
    leader, with all ioservices running) starts rconfc, collects FS
    stats via Spiel and stores them with a timestamp in Consul KV.
    """
    try:
        ffi = motr._ffi
        LOG.info('filesystem stats updater thread has started')
        ffi.adopt_motr_thread()
        self._ensure_motr_all_started()
        while not self.stopped:
            if not self._am_i_rc():
                # Only the RC leader publishes the stats.
                self._sleep(self.interval_sec)
                continue
            started = self._ioservices_running()
            if not all(started):
                self._sleep(self.interval_sec)
                continue
            result: int = motr.start_rconfc()
            if result == 0:
                try:
                    stats = motr.get_filesystem_stats()
                finally:
                    # Fix: stop rconfc even if the stats call raises;
                    # previously it leaked on that path.
                    motr.stop_rconfc()
                if stats:
                    LOG.debug('FS stats are as follows: %s', stats)
                    now_time = datetime.datetime.now()
                    data = FsStatsWithTime(stats=stats,
                                           timestamp=now_time.timestamp(),
                                           date=now_time.isoformat())
                    try:
                        self.consul.update_fs_stats(data)
                    except HAConsistencyException:
                        LOG.debug('Failed to update Consul KV '
                                  'due to an intermittent error. The '
                                  'error is swallowed since new attempts '
                                  'will be made timely')
            # Fix: sleep unconditionally. The original `continue` on
            # empty stats skipped this sleep and busy-looped on
            # start_rconfc().
            self._sleep(self.interval_sec)
    except InterruptedException:
        # No op. _sleep() has interrupted before the timeout exceeded:
        # the application is shutting down.
        # There are no resources that we need to dispose specially.
        pass
    except Exception:
        LOG.exception('Aborting due to an error')
    finally:
        LOG.debug('Releasing motr-related resources for this thread')
        ffi.shun_motr_thread()
        LOG.debug('filesystem stats updater thread exited')
def _execute(self, motr: Motr):
    """Thread body of the rconfc starter.

    Waits until all Motr ioservices are running, then starts rconfc and
    flips motr.spiel_ready so that Spiel-based threads may proceed.
    """
    try:
        LOG.debug('rconfc starter thread has started')
        self.consul.ensure_motr_all_started(self.event)
        while not (self.stopped or motr.spiel_ready):
            ios_up = self.consul.ensure_ioservices_running()
            if all(ios_up):
                # rconfc started successfully => Spiel can be used now.
                if motr.start_rconfc() == 0:
                    motr.spiel_ready = True
            else:
                wait_for_event(self.event, 5)
    except InterruptedException:
        # No op. sleep() has interrupted before the timeout exceeded:
        # the application is shutting down.
        # There are no resources that we need to dispose specially.
        pass
    except Exception:
        LOG.exception('Aborting due to an error')
    finally:
        LOG.debug('rconfc starter thread exited')
def main():
    """Entry point of the hax process (planner-based variant).

    Wires together the work planner, the Motr bindings and the Consul
    helpers, spawns the consumer/updater threads and then blocks in the
    HTTP server until the process is terminated by a signal.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by Python thread created by
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()
    util: ConsulUtil = ConsulUtil()
    # Drop a leftover Consul session of a previous hax incarnation, if any.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')
    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that consumer thread must be started before we invoke motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply it.
    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i)
        for i in range(4)
    ]
    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        server = ServerRunner(planner, herald, consul_util=util)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, rconfc_starter, event_poller
        ])
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        # Release the Motr-side resources no matter how we exit.
        motr.fini()
def _execute(self, motr: Motr):
    """Thread body of the filesystem stats updater (polling variant).

    Periodically (on the RC leader, with Spiel ready and all ioservices
    running) collects FS stats via Spiel and stores them with a
    timestamp in Consul KV.
    """
    try:
        LOG.info('filesystem stats updater thread has started')
        while not self.stopped:
            if not self.consul.am_i_rc():
                # Only the RC leader publishes the stats.
                wait_for_event(self.event, self.interval_sec)
                continue
            if (not motr.is_spiel_ready() or (
                    not all(self.consul.ensure_ioservices_running()))):
                wait_for_event(self.event, self.interval_sec)
                continue
            stats = motr.get_filesystem_stats()
            if stats:
                LOG.debug('FS stats are as follows: %s', stats)
                now_time = datetime.datetime.now()
                data = FsStatsWithTime(stats=stats,
                                       timestamp=now_time.timestamp(),
                                       date=now_time.isoformat())
                try:
                    self.consul.update_fs_stats(data)
                except HAConsistencyException:
                    LOG.debug('Failed to update Consul KV '
                              'due to an intermittent error. The '
                              'error is swallowed since new attempts '
                              'will be made timely')
            # Fix: wait unconditionally. The original `continue` on
            # empty stats skipped this wait and busy-looped on
            # get_filesystem_stats().
            wait_for_event(self.event, self.interval_sec)
    except InterruptedException:
        # No op. _sleep() has interrupted before the timeout exceeded:
        # the application is shutting down.
        # There are no resources that we need to dispose specially.
        pass
    except Exception:
        LOG.exception('Aborting due to an error')
    finally:
        LOG.debug('filesystem stats updater thread exited')
def main():
    """Entry point of the hax process (queue-based variant).

    Creates the message queue, the Motr bindings and the Consul
    helpers, spawns the consumer and stats-updater threads and then
    blocks in the HTTP server until terminated by a signal.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    _setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by Python thread created by
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    q: Queue = Queue(maxsize=8)
    util: ConsulUtil = ConsulUtil()
    cfg = _get_motr_fids(util)
    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}, RM fid = {cfg.rm_fid}')
    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(queue=q,
                rm_fid=cfg.rm_fid,
                ffi=ffi,
                herald=herald,
                consul_util=util)

    # Note that consumer thread must be started before we invoke motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply it.
    consumer = _run_qconsumer_thread(q, motr, herald)
    try:
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   rm_service=cfg.rm_fid)
        LOG.info('Motr API has been started')
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        run_server(q,
                   herald,
                   consul_util=util,
                   threads_to_wait=[consumer, stats_updater])
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        # Release the Motr-side resources no matter how we exit.
        motr.close()
def _get_pver_with_pver_status(
        self, motr: Motr) -> Optional[Dict[str, PverInfo]]:
    '''
    Collect the state of every pool version mentioned under the
    'ioservices/' Consul KV prefix.

    Ex of pver state:
    PverInfo(fid=0x7600000000000001:0x3e, state=0,
    data_units=1, parity_units=0, pool_width=10, unit_size=0)

    Pver data is stored in consul kv in format
    key = ioservices/0x7200000000000001:0x20/pvers/
          0x7600000000000001:0x6/users/1
    value = {"bc": 4096, "object_cnt": 1}

    The pver fid is the 4th path component of the key; each distinct
    pver is queried through Spiel exactly once.
    '''
    kv_entries = self.consul.kv.kv_get('ioservices/', recurse=True)
    statuses: Dict[str, PverInfo] = {}
    for entry in (kv_entries or []):
        pver_fid = entry['Key'].split('/')[3]
        if pver_fid in statuses:
            continue
        statuses[pver_fid] = motr.get_pver_status(Fid.parse(pver_fid))
    LOG.debug('Received pool version and status: %s', statuses)
    return statuses
def _do_work(self, q: Queue, motr: Motr):
    """Main loop of the consumer thread (queue-based variant).

    Polls the queue for messages and dispatches each one by type:
    entrypoint requests, process events, nvec replies, HA broadcasts,
    Stob IOQ errors and SNS repair/rebalance operations. The loop exits
    via StopIteration when self.is_stopped is observed.
    """
    ffi = motr._ffi
    LOG.info('Handler thread has started')
    # The thread must be known to Motr before any of its APIs are used.
    ffi.adopt_motr_thread()

    def pull_msg():
        # Non-blocking read: None means "queue is empty right now".
        try:
            return q.get(block=False)
        except Empty:
            return None

    try:
        while True:
            try:
                LOG.debug('Waiting for the next message')
                item = pull_msg()
                while item is None:
                    # Poll with a small pause; check the stop flag
                    # between attempts so shutdown is not delayed.
                    time.sleep(0.2)
                    if self.is_stopped:
                        raise StopIteration()
                    item = pull_msg()
                LOG.debug('Got %s message from queue', item)
                if isinstance(item, FirstEntrypointRequest):
                    LOG.debug('first entrypoint request, broadcast FAILED')
                    ids: List[MessageId] = motr.broadcast_ha_states([
                        HAState(fid=item.process_fid,
                                status=ServiceHealth.FAILED)
                    ])
                    LOG.debug('waiting for broadcast of %s for ep: %s',
                              ids, item.remote_rpc_endpoint)
                    # Reply only after the FAILED broadcast is delivered.
                    self.herald.wait_for_all(HaLinkMessagePromise(ids))
                    motr.send_entrypoint_request_reply(
                        EntrypointRequest(
                            reply_context=item.reply_context,
                            req_id=item.req_id,
                            remote_rpc_endpoint=item.remote_rpc_endpoint,
                            process_fid=item.process_fid,
                            git_rev=item.git_rev,
                            pid=item.pid,
                            is_first_request=item.is_first_request))
                elif isinstance(item, EntrypointRequest):
                    # While replying any Exception is catched. In such a
                    # case, the motr process will receive EAGAIN and
                    # hence will need to make new attempt by itself
                    motr.send_entrypoint_request_reply(item)
                elif isinstance(item, ProcessEvent):
                    self._update_process_status(q, item.evt)
                elif isinstance(item, HaNvecGetEvent):
                    fn = motr.ha_nvec_get_reply
                    # If a consul-related exception appears, it will
                    # be processed by repeat_if_fails.
                    #
                    # This thread will become blocked until that
                    # intermittent error gets resolved.
                    decorated = (repeat_if_fails(wait_seconds=5))(fn)
                    decorated(item)
                elif isinstance(item, BroadcastHAStates):
                    LOG.info('HA states: %s', item.states)
                    ha_states = self.update_process_failure(q, item.states)
                    result: List[MessageId] = motr.broadcast_ha_states(
                        ha_states)
                    if item.reply_to:
                        item.reply_to.put(result)
                elif isinstance(item, StobIoqError):
                    LOG.info('Stob IOQ: %s', item.fid)
                    payload = dump_json(item)
                    LOG.debug('Stob IOQ JSON: %s', payload)
                    offset = self.eq_publisher.publish('stob-ioq', payload)
                    LOG.debug('Written to epoch: %s', offset)
                elif isinstance(item, SnsRepairStatus):
                    LOG.info('Requesting SNS repair status')
                    status = motr.get_repair_status(item.fid)
                    LOG.info('SNS repair status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStatus):
                    LOG.info('Requesting SNS rebalance status')
                    status = motr.get_rebalance_status(item.fid)
                    LOG.info('SNS rebalance status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStart):
                    LOG.info('Requesting SNS rebalance start')
                    motr.start_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceStop):
                    LOG.info('Requesting SNS rebalance stop')
                    motr.stop_rebalance(item.fid)
                elif isinstance(item, SnsRebalancePause):
                    LOG.info('Requesting SNS rebalance pause')
                    motr.pause_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceResume):
                    LOG.info('Requesting SNS rebalance resume')
                    motr.resume_rebalance(item.fid)
                elif isinstance(item, SnsRepairStart):
                    LOG.info('Requesting SNS repair start')
                    motr.start_repair(item.fid)
                elif isinstance(item, SnsRepairStop):
                    LOG.info('Requesting SNS repair stop')
                    motr.stop_repair(item.fid)
                elif isinstance(item, SnsRepairPause):
                    LOG.info('Requesting SNS repair pause')
                    motr.pause_repair(item.fid)
                elif isinstance(item, SnsRepairResume):
                    LOG.info('Requesting SNS repair resume')
                    motr.resume_repair(item.fid)
                else:
                    LOG.warning('Unsupported event type received: %s', item)
            except StopIteration:
                raise
            except Exception:
                # no op, swallow the exception: one bad message must not
                # kill the consumer thread.
                LOG.exception('**ERROR**')
    except StopIteration:
        # Orderly shutdown: detach this thread from Motr.
        ffi.shun_motr_thread()
    finally:
        LOG.info('Handler thread has exited')
def motr(mocker, ffi, planner, herald, consul_util) -> Motr:
    """Pytest fixture: a Motr instance wired to the test doubles."""
    return Motr(ffi, planner, herald, consul_util)
def _update_process_status(self, p: WorkPlanner, motr: Motr,
                           event: ConfHaProcess) -> None:
    """Record a process state transition in Consul and broadcast the
    matching HA state.

    Mkfs processes (and hax itself) are broadcast to hax only; regular
    processes are broadcast cluster-wide. On M0_CONF_HA_PROCESS_STARTED
    a node-online event may additionally be sent to the MessageBus.
    """
    LOG.info('Updating process status: %s', event.fid)
    # If a consul-related exception appears, it will
    # be processed by repeat_if_fails.
    #
    # This thread will become blocked until that
    # intermittent error gets resolved.
    motr_to_svc_status = {
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (
            ObjHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (
            ObjHealth.FAILED),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (
            ObjHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (
            ObjHealth.FAILED),
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (
            ObjHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (
            ObjHealth.FAILED)}
    if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                           m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
        broadcast_hax_only = False
        if ((event.chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS)
                or (event.fid == self.consul.get_hax_fid())):
            # Motr-mkfs processes do not require updates on their peer
            # mkfs processes. Motr-mkfs is an independent and typically a
            # one-time operation. So avoid broadcasting a motr-mkfs state
            # to the peer motr-mkfs processes but hax still needs to be
            # notified in-order to disconnect the hax-motr halink when
            # motr-mkfs process stops.
            broadcast_hax_only = True
        LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                  broadcast_hax_only)
        motr.broadcast_ha_states(
            [HAState(fid=event.fid, status=svc_status)],
            broadcast_hax_only=broadcast_hax_only)
    self.consul.update_process_status(event)
    # If we are receiving M0_CONF_HA_PROCESS_STARTED for M0D processes
    # then we will check if all the M0D processes on the local node are
    # started. If yes then we are going to send node online event to
    # MessageBus
    if event.chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED:
        try:
            # Fix: reuse the ConsulUtil this object already holds
            # instead of constructing a fresh one per event.
            producer = get_producer(self.consul)
            if producer:
                producer.check_and_send(parent_resource_type=ObjT.NODE,
                                        fid=event.fid,
                                        resource_status='online')
            else:
                # Fix: corrected the warning wording ('sent' -> 'send').
                LOG.warning('Could not send an event as producer'
                            ' is not available')
        except Exception as e:
            LOG.warning("Send event failed due to '%s'", e)
def _do_work(self, planner: WorkPlanner, motr: Motr):
    """Main loop of a consumer thread (planner-based variant).

    Takes commands from the WorkPlanner and dispatches them by type:
    entrypoint requests, process events, nvec get/set, HA broadcasts,
    Stob IOQ errors and SNS repair/rebalance operations. A Die message
    terminates the loop; consumer #0 is responsible for stopping Motr.
    Every consumed command is acknowledged via planner.notify_finished.
    """
    LOG.info('Handler thread has started')
    try:
        while True:
            # Fix: pre-initialize `item` so the `finally` below neither
            # hits an unbound local (NameError on the first iteration)
            # nor re-acknowledges the previous command when
            # get_next_command() itself raises.
            item = None
            try:
                LOG.debug('Waiting for the next message')
                item = planner.get_next_command()
                LOG.debug('Got %s message from planner', item)
                if isinstance(item, FirstEntrypointRequest):
                    motr.send_entrypoint_request_reply(
                        EntrypointRequest(
                            reply_context=item.reply_context,
                            req_id=item.req_id,
                            remote_rpc_endpoint=item.remote_rpc_endpoint,
                            process_fid=item.process_fid,
                            git_rev=item.git_rev,
                            pid=item.pid,
                            is_first_request=item.is_first_request))
                elif isinstance(item, EntrypointRequest):
                    # While replying any Exception is catched. In such a
                    # case, the motr process will receive EAGAIN and
                    # hence will need to make new attempt by itself
                    motr.send_entrypoint_request_reply(item)
                elif isinstance(item, ProcessEvent):
                    self._update_process_status(planner, motr, item.evt)
                elif isinstance(item, HaNvecGetEvent):
                    fn = motr.ha_nvec_get_reply
                    # If a consul-related exception appears, it will
                    # be processed by repeat_if_fails.
                    #
                    # This thread will become blocked until that
                    # intermittent error gets resolved.
                    decorated = (repeat_if_fails(wait_seconds=5))(fn)
                    decorated(item)
                elif isinstance(item, HaNvecSetEvent):
                    fn = motr.ha_nvec_set_process
                    # If a consul-related exception appears, it will
                    # be processed by repeat_if_fails.
                    #
                    # This thread will become blocked until that
                    # intermittent error gets resolved.
                    decorated = (repeat_if_fails(wait_seconds=5))(fn)
                    decorated(item)
                elif isinstance(item, BroadcastHAStates):
                    LOG.info('HA states: %s', item.states)
                    ha_states = self.update_process_failure(
                        planner, item.states)
                    result: List[MessageId] = motr.broadcast_ha_states(
                        ha_states)
                    if item.reply_to:
                        item.reply_to.put(result)
                elif isinstance(item, StobIoqError):
                    LOG.info('Stob IOQ: %s', item.fid)
                    payload = dump_json(item)
                    LOG.debug('Stob IOQ JSON: %s', payload)
                    offset = self.eq_publisher.publish('stob-ioq', payload)
                    LOG.debug('Written to epoch: %s', offset)
                elif isinstance(item, SnsRepairStatus):
                    LOG.info('Requesting SNS repair status')
                    status = motr.get_repair_status(item.fid)
                    LOG.info('SNS repair status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStatus):
                    LOG.info('Requesting SNS rebalance status')
                    status = motr.get_rebalance_status(item.fid)
                    LOG.info('SNS rebalance status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStart):
                    LOG.info('Requesting SNS rebalance start')
                    motr.start_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceStop):
                    LOG.info('Requesting SNS rebalance stop')
                    motr.stop_rebalance(item.fid)
                elif isinstance(item, SnsRebalancePause):
                    LOG.info('Requesting SNS rebalance pause')
                    motr.pause_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceResume):
                    LOG.info('Requesting SNS rebalance resume')
                    motr.resume_rebalance(item.fid)
                elif isinstance(item, SnsRepairStart):
                    LOG.info('Requesting SNS repair start')
                    motr.start_repair(item.fid)
                elif isinstance(item, SnsRepairStop):
                    LOG.info('Requesting SNS repair stop')
                    motr.stop_repair(item.fid)
                elif isinstance(item, SnsRepairPause):
                    LOG.info('Requesting SNS repair pause')
                    motr.pause_repair(item.fid)
                elif isinstance(item, SnsRepairResume):
                    LOG.info('Requesting SNS repair resume')
                    motr.resume_repair(item.fid)
                elif isinstance(item, Die):
                    raise StopIteration()
                else:
                    LOG.warning('Unsupported event type received: %s', item)
            except StopIteration:
                raise
            except Exception:
                # no op, swallow the exception: one bad message must not
                # kill the consumer thread.
                LOG.exception('**ERROR**')
            finally:
                if item is not None:
                    planner.notify_finished(item)
    except StopIteration:
        LOG.info('Consumer Stopped')
        # Only one consumer (idx == 0) is in charge of stopping Motr.
        if self.idx == 0:
            motr.stop()
    finally:
        LOG.info('Handler thread has exited')
def main():
    """Entry point of the hax process (newest, DI-configured variant).

    Configures dependency injection, installs a SIGINT handler for
    early shutdown, wires the work planner / Motr bindings / Consul
    helpers, spawns consumer and updater threads, then blocks in the
    HTTP server until the process is terminated by a signal.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)
    state = inject.instance(HaxGlobalState)

    # [KN] The elements in the work planner will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by Python threads created by
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    def handle_signal(sig, frame):
        # Mark the global state as stopping and unblock the planner
        # consumers so the process can wind down.
        state.set_stopping()
        planner.shutdown()

    # This is necessary to allow hax to exit early if Consul is not available
    # (otherwise _get_motr_fids() may be retrying forever even if the hax
    # process needs to shutdown).
    signal.signal(signal.SIGINT, handle_signal)
    util: ConsulUtil = ConsulUtil()
    # Avoid removing session on hax start as this will happen
    # on every node, thus leader election will keep re-triggering
    # until the final hax node starts, this will delay further
    # bootstrapping operations.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    hax_http_port = util.get_hax_http_port()
    util.init_motr_processes_status()
    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')
    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that consumer thread must be started before we invoke motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply it.
    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i)
        for i in range(32)
    ]
    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        server = ServerRunner(planner,
                              herald,
                              consul_util=util,
                              hax_state=state)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, bc_updater, rconfc_starter,
            event_poller
        ],
                   port=hax_http_port)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        # Release the Motr-side resources no matter how we exit.
        motr.fini()
def test_process_failure(self):
    """Broadcasting a FAILED process state must cascade the failure to
    the node, enclosure, controller, drive and service: each component
    gets a Consul KV update and appears as FAILED in the outgoing hax
    broadcast."""
    consul_util = ConsulUtil()
    consul_cache = InvocationCache()
    ffi = Mock(spec=['init_motr_api'])
    motr = Motr(ffi, None, None, consul_util)

    # Setup for the test: notification of a process failure
    # - failure here is an ios service and a disk
    # - dummy Consul reports all processes on the node are failed
    # - expect the node, enclosure, controller, drive,
    #   process, and service to all be marked as failed
    #
    # Static names and fids for the setup are given here.
    node_name = 'testnode'
    hax_fid = Fid(0x7200000000000001, 0x6)
    site_fid = Fid(0x5300000000000001, 0x1)
    rack_fid = Fid(0x6100000000000001, 0x2)
    node_fid = Fid(0x6e00000000000001, 0x3)
    encl_fid = Fid(0x6500000000000001, 0x4)
    ctrl_fid = Fid(0x6300000000000001, 0x5)
    process_fid = Fid(0x7200000000000001, 0x15)
    service_fid = Fid(0x7300000000000001, 0xe)
    service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
    drive_fid = Fid(0x6b00000000000001, 0x11)
    ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
        site_fid, rack_fid, encl_fid, ctrl_fid)
    ctrl_state = '{"state": "M0_NC_FAILED"}'

    # Set mock return values for the necessary Consul calls
    motr._is_mkfs = Mock(return_value=False)
    consul_util.get_hax_fid = Mock(return_value=hax_fid)
    consul_util.is_proc_client = Mock(return_value=False)
    consul_util.get_services_by_parent_process = Mock(
        return_value=[service_fid_typed])
    consul_util.get_disks_by_parent_process = Mock(
        return_value=[drive_fid])
    consul_util.get_process_node = Mock(return_value=node_name)
    consul_util.get_node_name_by_fid = Mock(return_value=node_name)
    consul_util.get_node_fid = Mock(return_value=node_fid)
    consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
    consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])
    # These failure indications are here to trigger specific code paths for
    # node failure. Additional tests can cover different scenarios (e.g.
    # drive failure but node still up), which will set different results
    # for these calls.
    consul_util.all_io_services_failed = Mock(return_value=True)
    consul_util.get_sdev_state = Mock(
        return_value=HaNoteStruct.M0_NC_FAILED)
    consul_util.get_ctrl_state = Mock(
        return_value=m0HaObjState.M0_NC_FAILED)
    consul_util.get_ctrl_state_updates = Mock(
        return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

    # We'll use these mocks to check that expected updates are happening.
    consul_util.update_drive_state = Mock()
    consul_util.set_process_state = Mock()
    consul_util.set_node_state = Mock()
    consul_util.set_encl_state = Mock()
    motr._ha_broadcast = Mock()
    motr._write_updates = Mock()

    # Send the mock event.
    motr.broadcast_ha_states(
        [HAState(fid=process_fid, status=ObjHealth.FAILED)],
        notify_devices=True,
        broadcast_hax_only=False,
        kv_cache=consul_cache)

    # ConsulUtil is responsible for the actual KV updates, just check
    # here that the appropriate util function is called for each
    # component.
    consul_util.update_drive_state.assert_called_with([drive_fid],
                                                      ObjHealth.OFFLINE,
                                                      device_event=False)
    consul_util.set_process_state.assert_called_with(
        process_fid, ObjHealth.FAILED)
    consul_util.set_node_state.assert_called_with(node_fid,
                                                  ObjHealth.FAILED)
    consul_util.set_encl_state.assert_called_with(encl_fid,
                                                  ObjHealth.FAILED,
                                                  kv_cache=consul_cache)
    # This KV update is batched, so the check looks different.
    motr._write_updates.assert_any_call(
        [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

    # Check hax broadcast. We should see states updated to FAILED.
    broadcast_list = motr._ha_broadcast.call_args[0][0]
    self.assertTrue(_has_failed_note(broadcast_list, node_fid))
    self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, process_fid))
    self.assertTrue(_has_failed_note(broadcast_list, service_fid))
    self.assertTrue(_has_failed_note(broadcast_list, drive_fid))