def ha_nvec_set_process(self, event: HaNvecSetEvent) -> None:
    LOG.debug('Processing HaNvecSetEvent (nvec size = %s)', len(event.nvec))
    self.consul_util.get_all_nodes()
    ha_states: List[HAState] = []
    bcast_ss: List[HAState] = []
    for n in event.nvec:
        fid = Fid.from_struct(n.note.no_id)
        obj_health = ObjHealth.from_ha_note_state(n.note.no_state)
        ha_states.append(HAState(fid, obj_health))
        if n.note.no_state in {
                HaNoteStruct.M0_NC_REPAIRED, HaNoteStruct.M0_NC_ONLINE
        }:
            bcast_ss.append(HAState(fid, obj_health))
        # In case of failed repair, roll back to failed state.
        elif n.note.no_state == HaNoteStruct.M0_NC_REPAIR:
            obj_health = ObjHealth.from_ha_note_state(
                HaNoteStruct.M0_NC_FAILED)
            bcast_ss.append(HAState(fid, obj_health))
        # In case of failed rebalance, roll back to repaired state.
        elif n.note.no_state == HaNoteStruct.M0_NC_REBALANCE:
            obj_health = ObjHealth.from_ha_note_state(
                HaNoteStruct.M0_NC_REPAIRED)
            bcast_ss.append(HAState(fid, obj_health))
    LOG.debug('got ha_states %s', ha_states)
    if bcast_ss:
        self.broadcast_ha_states(bcast_ss)
def to_ha_states(data: Any, consul_util: ConsulUtil) -> List[HAState]:
    """Converts a dictionary, obtained from JSON data, into a list of
    HA states.

    Format of an HA state: HAState(fid=<service fid>, status=<state>),
    where <state> is either 'online' or 'offline'.
    """
    if not data:
        return []

    def get_svc_node(checks: List[Dict[str, Any]], svc_id: str) -> str:
        for x in checks:
            if x.get('ServiceID') == svc_id:
                return str(x.get('Node'))
        return ""

    def get_svc_health(item: Any) -> ServiceHealth:
        node = get_svc_node(item['Checks'], item['Service']['ID'])
        LOG.debug('Checking current state of the process %s',
                  item['Service']['ID'])
        status: ServiceHealth = consul_util.get_service_health(
            item['Service']['Service'], node, int(item['Service']['ID']))
        return status

    return [
        HAState(fid=create_process_fid(int(t['Service']['ID'])),
                status=get_svc_health(t)) for t in data
    ]
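For reference, here is a minimal sketch (not part of the module) of the Consul health-watch payload that to_ha_states() consumes and the result it would produce, assuming get_service_health() reports the service as healthy; the service id 12 and the process-fid container 0x7200000000000001 are borrowed from the test data later in this section:

# Hypothetical input, shaped like Consul's node-health JSON:
data = [{
    'Node': {'Node': 'localhost', 'Address': '10.1.10.12'},
    'Service': {'ID': '12', 'Service': 'ios', 'Tags': [], 'Port': 8000},
    'Checks': [{'CheckID': 'service:ios', 'Status': 'passing',
                'ServiceID': '12', 'ServiceName': 'ios',
                'Node': 'localhost'}],
}]
# to_ha_states(data, consul_util) would then evaluate to something like:
#   [HAState(fid=Fid(0x7200000000000001, 12), status=ServiceHealth.OK)]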
async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    service_health = [{
        'Node': {
            'Node': 'localhost',
            'Address': '10.1.10.12',
        },
        'Service': {
            'ID': '12',
            'Service': 'ios',
            'Tags': [],
            'Port': 8000,
        },
        'Checks': [
            {
                'Node': '12',
                'CheckID': 'service:ios',
                'Name': "Service 'ios' check",
                'Status': status,
                'Notes': '',
                'Output': '',
                'ServiceID': '12',
                'ServiceName': 'ios',
            },
        ],
    }]
    resp = await hax_client.post('/', json=service_health)
    assert resp.status == 200
    assert planner.add_command.called
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(
            states=[HAState(fid=Fid(0x7200000000000001, 12), status=health)],
            reply_to=None))
def _update_process_status(self, p: WorkPlanner, motr: Motr,
                           event: ConfHaProcess) -> None:
    # If a consul-related exception appears, it will
    # be processed by repeat_if_fails.
    #
    # This thread will become blocked until that
    # intermittent error gets resolved.
    motr_to_svc_status = {
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (ServiceHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        (ServiceHealth.OFFLINE),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (ServiceHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        (ServiceHealth.FAILED),
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (ServiceHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        (ServiceHealth.FAILED)
    }
    self.consul.update_process_status(event)
    if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                           m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
        motr.broadcast_ha_states(
            [HAState(fid=event.fid, status=svc_status)])
def fn():
    proc_state_to_objhealth = {
        'M0_CONF_HA_PROCESS_STARTING': ObjHealth.OFFLINE,
        'M0_CONF_HA_PROCESS_STARTED': ObjHealth.OK,
        'M0_CONF_HA_PROCESS_STOPPING': ObjHealth.OFFLINE,
        'M0_CONF_HA_PROCESS_STOPPED': ObjHealth.OFFLINE
    }
    # import pudb.remote
    # pudb.remote.set_trace(term_size=(80, 40), port=9998)
    ha_states: List[HAState] = []
    LOG.debug('process status: %s', data)
    for item in data:
        proc_val = base64.b64decode(item['Value'])
        proc_status = json.loads(str(proc_val.decode('utf-8')))
        LOG.debug('process update item key %s item val: %s',
                  item['Key'].split('/')[1], proc_status)
        proc_fid = Fid.parse(item['Key'].split('/')[1])
        proc_state = proc_status['state']
        proc_type = proc_status['type']
        if (proc_type != 'M0_CONF_HA_PROCESS_M0MKFS'
                and proc_state in ('M0_CONF_HA_PROCESS_STARTED',
                                   'M0_CONF_HA_PROCESS_STOPPED')):
            ha_states.append(
                HAState(fid=proc_fid,
                        status=proc_state_to_objhealth[proc_state]))
    planner.add_command(
        BroadcastHAStates(states=ha_states, reply_to=None))
def test_nonmkfs_process_stop_causes_drive_offline(mocker, motr,
                                                   consul_util):
    mocker.patch.object(consul_util.kv,
                        'kv_get',
                        side_effect=create_stub_get('M0_CONF_HA_PROCESS_M0D'))
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util, 'update_drive_state')
    mocker.patch.object(consul_util,
                        'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util,
                        'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))
    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15),
                status=ServiceHealth.FAILED)
    ])
    assert consul_util.update_drive_state.called, \
        'The drive state should be updated in Consul KV'
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'), contains_drive())).exists(traces), \
        'DRIVE must be broadcast when non-MKFS process is stopped'
def to_ha_state(self, objinfo: dict) -> Optional[HAState]:
    try:
        sdev_fid = self.confobjutil.drive_to_sdev_fid(
            objinfo['node'], objinfo['device'])
    except KeyError as error:
        logging.error('Invalid json payload, no key (%s) present', error)
        return None
    return HAState(sdev_fid, status=objinfo['state'])
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.OK:
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                    continue
            if current_status in (ServiceHealth.FAILED,
                                  ServiceHealth.STOPPED):
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                    # Consul may report failure of a process multiple
                    # times, so we don't want to send duplicate failure
                    # notifications; they may delay cleanup activities.
                    continue
            # XXX:
            # Sometimes a Consul event is delivered with a delay, so the
            # state reported by Consul for a given process may already be
            # in the past, e.g. Consul reported a process failure but by
            # the time hax received the event the process might have
            # already restarted. In this case the event still needs to be
            # handled. Also, it is possible that Consul reported a failure
            # but the process status is not yet updated in the Consul
            # services catalog; in such a case the reported status can be
            # true and cannot simply be dropped. These scenarios must be
            # re-visited.
            if current_status not in (ServiceHealth.UNKNOWN,
                                      ServiceHealth.OFFLINE):
                # We also need to account for and report the failure of
                # remote Motr processes to this node's hax and Motr
                # processes. When Consul reports a remote process failure,
                # hax confirms its current status from the Consul KV,
                # updates the list of failed services and also adds it to
                # the broadcast list.
                if current_status != ServiceHealth.OK:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                else:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                self.consul.update_process_status(
                    ConfHaProcess(
                        chp_event=event,
                        chp_type=int(
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                        chp_pid=0,
                        fid=state.fid))
            new_ha_states.append(
                HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states
def to_ha_state(self, objinfo: Dict[str, str]) -> Optional[HAState]:
    try:
        sdev_fid = self.confobjutil.drive_to_sdev_fid(
            objinfo['node'], objinfo['device'])
        state = (ServiceHealth.OK
                 if objinfo['state'] == 'online' else ServiceHealth.FAILED)
    except KeyError as error:
        LOG.error('Invalid json payload, no key (%s) present', error)
        return None
    return HAState(sdev_fid, status=state)
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    logging.info('fid=%s, chp_event=%s', fid, chp_event)
    self.queue.put(
        ProcessEvent(
            ConfHaProcess(chp_event=chp_event,
                          chp_type=chp_type,
                          chp_pid=chp_pid,
                          fid=fid)))
    if chp_event == 3:
        self.queue.put(
            BroadcastHAStates(states=[HAState(fid=fid, status='offline')],
                              reply_to=None))
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    fid = Fid.parse(payload['conf_sdev'])
    if fid.is_null():
        LOG.debug('Fid is 0:0. Skipping the message.')
        return
    q: Queue = Queue(1)
    self.planner.add_command(
        BroadcastHAStates(states=[HAState(fid, status=ObjHealth.FAILED)],
                          reply_to=q))
    ids: List[MessageId] = q.get()
    self.herald.wait_for_any(HaLinkMessagePromise(ids))
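A minimal sketch of how this handler might be driven, assuming the decoded BQ payload carries fid strings in the '0x..:0x..' form used by the tests later in this section (the concrete values are illustrative only):

# Hypothetical STOB_IOQ_ERROR payload as decoded from the BQ message:
payload = {'fid': '0x1:0x2', 'conf_sdev': '0x1:0x4'}
# handle_ioq_stob_error(payload) would parse conf_sdev into Fid(0x1, 0x4),
# enqueue
#   BroadcastHAStates(states=[HAState(Fid(0x1, 0x4),
#                                     status=ObjHealth.FAILED)],
#                     reply_to=q)
# and then block on q until the broadcast MessageIds are delivered.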
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    fid = Fid.parse(payload['conf_sdev'])
    if fid.is_null():
        logging.debug('Fid is 0:0. Skipping the message.')
        return
    q: Queue = Queue(1)
    self.queue.put(
        BroadcastHAStates(states=[HAState(fid, status='offline')],
                          reply_to=q))
    ids: List[MessageId] = q.get()
    self.herald.wait_for_any(HaLinkMessagePromise(ids))
async def test_bq_stob_message_deserialized(hax_client, planner, herald,
                                            consul_util, mocker):
    def fake_get(key, allow_null):
        # ret = {'bq-delivered/192.168.0.28': ''}
        ret = {'bq-delivered/localhost': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    #
    # InboxFilter will try to read epoch - let's mock KV operations
    stob = StobId(Fid(12, 13), Fid(14, 15))
    msg = StobIoqError(fid=Fid(5, 6),
                       conf_sdev=Fid(0x103, 0x204),
                       stob_id=stob,
                       fd=42,
                       opcode=4,
                       rc=2,
                       offset=0xBF,
                       size=100,
                       bshift=4)
    # Here we make sure that a real StobIoqError can be used as the payload
    # for a STOB_IOQ_ERROR bq message.
    stob_payload = dump_json(msg)
    parsed_stob = simplejson.loads(stob_payload)
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {'message_type': 'STOB_IOQ_ERROR', 'payload': parsed_stob}
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()
    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x103, 0x204), status=ObjHealth.FAILED)]))
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    self.queue.put(
        ProcessEvent(
            ConfHaProcess(chp_event=chp_event,
                          chp_type=chp_type,
                          chp_pid=chp_pid,
                          fid=fid)))
    if chp_event == 3:
        self.queue.put(
            BroadcastHAStates(
                states=[HAState(fid=fid, status=ServiceHealth.FAILED)],
                reply_to=None))
def update_process_failure(self, q: Queue,
                           ha_states: List[HAState]) -> List[HAState]:
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.FAILED:
                self.consul.service_health_to_m0dstatus_update(
                    state.fid, current_status)
            elif current_status == ServiceHealth.UNKNOWN:
                # We got the service status as UNKNOWN, which means hax was
                # notified about a process failure but could not confirm
                # whether the process is in a failed state or has failed
                # and restarted. So we do not lose the event and try again
                # to confirm the real-time process status by enqueueing a
                # broadcast event specific to this process.
                # It is expected that the process status eventually gets
                # confirmed as either failed or passing (OK).
                # This situation typically arises due to a delay in
                # receiving the failure notification, during which the
                # corresponding process might be restarting or may have
                # already restarted. Thus it is important to confirm the
                # real-time status of the process before broadcasting a
                # failure.
                current_status = ServiceHealth.OK
                q.put(
                    BroadcastHAStates(states=[
                        HAState(fid=state.fid, status=ServiceHealth.FAILED)
                    ],
                                      reply_to=None))
            new_ha_states.append(
                HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states
def handle(self, msg: Event) -> None:
    node_fid = self.cns.get_node_fid(msg.node_id)
    if not node_fid:
        LOG.warn('Unknown [node_id=%s] provided. HA event is ignored',
                 msg.node_id)
        return
    get_health = self._get_status_by_text
    self.planner.add_command(
        BroadcastHAStates(states=[
            HAState(fid=node_fid, status=get_health(msg.event_type))
        ],
                          reply_to=None))
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    self.queue.put(
        ProcessEvent(
            ConfHaProcess(chp_event=chp_event,
                          chp_type=chp_type,
                          chp_pid=chp_pid,
                          fid=fid)))
    if chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D:
        if chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED:
            self.queue.put(
                BroadcastHAStates(
                    states=[HAState(fid=fid, status=ServiceHealth.OK)],
                    reply_to=None))
def get_local_service_health(self, service_name: str) -> HAState:
    local_nodename = self.get_local_nodename()
    srv_data: List[Dict[str, Any]] = self._get_service_health(service_name)
    local_services = [
        srv for srv in srv_data if srv['Node']['Node'] == local_nodename
    ]
    if not local_services:
        raise RuntimeError(
            f'Node {local_nodename} has no service {service_name}')
    service = local_services[0]
    ok = all(x.get('Status') == 'passing' for x in service['Checks'])
    status = ServiceHealth.OK if ok else ServiceHealth.FAILED
    fid = create_process_fid(int(service['Service']['ID']))
    return HAState(fid=fid, status=status)
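A hedged usage sketch: the service name 'hax' and the fid values below are illustrative assumptions, not taken from the snippet above.

# state = consul_util.get_local_service_health('hax')
# If every health check of the local 'hax' service instance is 'passing',
# this returns something like
#   HAState(fid=Fid(0x7200000000000001, 6), status=ServiceHealth.OK)
# otherwise the status is ServiceHealth.FAILED; a missing local service
# instance raises RuntimeError.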
def _update_process_status(self, q: Queue, event: ConfHaProcess) -> None:
    # If a consul-related exception appears, it will
    # be processed by repeat_if_fails.
    #
    # This thread will become blocked until that
    # intermittent error gets resolved.
    self.consul.update_process_status(event)
    svc_status = m0HaProcessEvent.event_to_svchealth(event.chp_event)
    if event.chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D:
        # Broadcast the received motr process status to other motr
        # processes in the cluster.
        q.put(
            BroadcastHAStates(
                states=[HAState(fid=event.fid, status=svc_status)],
                reply_to=None))
def to_ha_state(self, objinfo: Dict[str, str]) -> Optional[HAState]:
    hastate_to_objstate = {
        'online': ObjHealth.OK,
        'failed': ObjHealth.FAILED,
        'offline': ObjHealth.OFFLINE,
        'repair': ObjHealth.REPAIR,
        'repaired': ObjHealth.REPAIRED,
        'rebalance': ObjHealth.REBALANCE
    }
    try:
        sdev_fid = self.confobjutil.drive_to_sdev_fid(
            objinfo['node'], objinfo['device'])
        state = hastate_to_objstate[objinfo['state']]
    except KeyError as error:
        LOG.error('Invalid json payload, no key (%s) present', error)
        return None
    return HAState(sdev_fid, status=state)
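A minimal sketch of the input this converter expects, assuming only the payload keys visible in the code above; the node and device names are placeholders:

# Hypothetical drive-event payload:
objinfo = {'node': 'srvnode-1', 'device': '/dev/sdb', 'state': 'failed'}
# to_ha_state(objinfo) would look up the sdev fid via
# confobjutil.drive_to_sdev_fid('srvnode-1', '/dev/sdb') and return
#   HAState(<sdev fid>, status=ObjHealth.FAILED)
# A missing key or an unrecognized 'state' value raises KeyError, which is
# logged and results in None.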
def to_ha_states(data: Any, consul_util: ConsulUtil) -> List[HAState]:
    """Converts a dictionary, obtained from JSON data, into a list of
    HA states.

    Format of an HA state: HAState(fid=<service fid>, status=<state>),
    where <state> is either 'online' or 'offline'.
    """
    if not data:
        return []

    def get_status(checks: List[Dict[str, Any]]) -> ServiceHealth:
        ok = all(x.get('Status') == 'passing' for x in checks)
        return ServiceHealth.OK if ok else ServiceHealth.FAILED

    return [
        HAState(fid=create_process_fid(int(t['Service']['ID'])),
                status=get_status(t['Checks'])) for t in data
    ]
def handle(self, msg: Event) -> None:
    node_name = self.cns.get_node_name_by_machineid(msg.resource_id,
                                                    allow_null=True)
    if not node_name:
        LOG.warn('Unknown [resource_id=%s] provided. HA event is ignored',
                 msg.resource_id)
        return
    node_fid = self.cns.get_node_fid(node_name)
    if not node_fid:
        LOG.warn('Unknown [node_name=%s] provided. HA event is ignored',
                 node_name)
        return
    get_health = self._get_status_by_text
    self.planner.add_command(
        BroadcastHAStates(states=[
            HAState(fid=node_fid, status=get_health(msg.event_type))
        ],
                          reply_to=None))
async def test_bq_stob_message_type_recognized(hax_client, planner, herald,
                                               consul_util, mocker):
    def fake_get(key):
        ret = {'bq-delivered/192.168.0.28': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    #
    # InboxFilter will try to read epoch - let's mock KV operations
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {
        'message_type': 'STOB_IOQ_ERROR',
        'payload': {
            'fid': '0x1:0x2',
            'conf_sdev': '0x1:0x4'
        }
    }
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()
    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x1, 0x4), status=ServiceHealth.FAILED)]))
def to_ha_states(data: Any, consul_util: ConsulUtil) -> List[HAState]:
    """Converts a dictionary, obtained from JSON data, into a list of
    HA states.

    Format of an HA state: HAState(fid=<service fid>, status=<state>),
    where <state> is either 'online' or 'offline'.
    """
    if not data:
        return []

    ha_states = []
    for node in data:
        svc_status = ObjHealth.OK
        for check in node['Checks']:
            if check.get('Status') != 'passing':
                svc_status = ObjHealth.FAILED
            svc_id = check.get('ServiceID')
            if svc_id:
                ha_states.append(
                    HAState(fid=create_process_fid(int(svc_id)),
                            status=svc_status))
    LOG.debug('Reporting ha states: %s', ha_states)
    return ha_states
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.OK:
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                    continue
            if current_status in (ServiceHealth.FAILED,
                                  ServiceHealth.STOPPED):
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                    # Consul may report failure of a process multiple
                    # times, so we don't want to send duplicate failure
                    # notifications; they may delay cleanup activities.
                    continue
            if current_status == ServiceHealth.UNKNOWN:
                # We got the service status as UNKNOWN, which means hax was
                # notified about a process failure but could not confirm
                # whether the process is in a failed state or has failed
                # and restarted. So we do not lose the event and try again
                # to confirm the real-time process status by enqueueing a
                # broadcast event specific to this process.
                # It is expected that the process status eventually gets
                # confirmed as either failed or passing (OK).
                # This situation typically arises due to a delay in
                # receiving the failure notification, during which the
                # corresponding process might be restarting or may have
                # already restarted. Thus it is important to confirm the
                # real-time status of the process before broadcasting a
                # failure.
                current_status = ServiceHealth.UNKNOWN
                planner.add_command(
                    BroadcastHAStates(states=[
                        HAState(fid=state.fid, status=ServiceHealth.FAILED)
                    ],
                                      reply_to=None))
            if current_status not in (ServiceHealth.UNKNOWN,
                                      ServiceHealth.OFFLINE):
                # We also need to account for and report the failure of
                # remote Motr processes to this node's hax and Motr
                # processes. When Consul reports a remote process failure,
                # hax confirms its current status from the Consul KV,
                # updates the list of failed services and also adds it to
                # the broadcast list.
                if current_status != ServiceHealth.OK:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                else:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                self.consul.update_process_status(
                    ConfHaProcess(
                        chp_event=event,
                        chp_type=int(
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                        chp_pid=0,
                        fid=state.fid))
            new_ha_states.append(
                HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states
def test_process_failure(self):
    consul_util = ConsulUtil()
    consul_cache = InvocationCache()
    ffi = Mock(spec=['init_motr_api'])
    motr = Motr(ffi, None, None, consul_util)

    # Setup for the test: notification of a process failure
    # - failure here is an ios service and a disk
    # - dummy Consul reports all processes on the node are failed
    # - expect the node, enclosure, controller, drive,
    #   process, and service to all be marked as failed
    #
    # Static names and fids for the setup are given here.
    node_name = 'testnode'
    hax_fid = Fid(0x7200000000000001, 0x6)
    site_fid = Fid(0x5300000000000001, 0x1)
    rack_fid = Fid(0x6100000000000001, 0x2)
    node_fid = Fid(0x6e00000000000001, 0x3)
    encl_fid = Fid(0x6500000000000001, 0x4)
    ctrl_fid = Fid(0x6300000000000001, 0x5)
    process_fid = Fid(0x7200000000000001, 0x15)
    service_fid = Fid(0x7300000000000001, 0xe)
    service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
    drive_fid = Fid(0x6b00000000000001, 0x11)
    ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
        site_fid, rack_fid, encl_fid, ctrl_fid)
    ctrl_state = '{"state": "M0_NC_FAILED"}'

    # Set mock return values for the necessary Consul calls.
    motr._is_mkfs = Mock(return_value=False)
    consul_util.get_hax_fid = Mock(return_value=hax_fid)
    consul_util.is_proc_client = Mock(return_value=False)
    consul_util.get_services_by_parent_process = Mock(
        return_value=[service_fid_typed])
    consul_util.get_disks_by_parent_process = Mock(
        return_value=[drive_fid])
    consul_util.get_process_node = Mock(return_value=node_name)
    consul_util.get_node_name_by_fid = Mock(return_value=node_name)
    consul_util.get_node_fid = Mock(return_value=node_fid)
    consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
    consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

    # These failure indications are here to trigger specific code paths
    # for node failure. Additional tests can cover different scenarios
    # (e.g. drive failure but node still up), which will set different
    # results for these calls.
    consul_util.all_io_services_failed = Mock(return_value=True)
    consul_util.get_sdev_state = Mock(
        return_value=HaNoteStruct.M0_NC_FAILED)
    consul_util.get_ctrl_state = Mock(
        return_value=m0HaObjState.M0_NC_FAILED)
    consul_util.get_ctrl_state_updates = Mock(
        return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

    # We'll use these mocks to check that expected updates are happening.
    consul_util.update_drive_state = Mock()
    consul_util.set_process_state = Mock()
    consul_util.set_node_state = Mock()
    consul_util.set_encl_state = Mock()
    motr._ha_broadcast = Mock()
    motr._write_updates = Mock()

    # Send the mock event.
    motr.broadcast_ha_states(
        [HAState(fid=process_fid, status=ObjHealth.FAILED)],
        notify_devices=True,
        broadcast_hax_only=False,
        kv_cache=consul_cache)

    # ConsulUtil is responsible for the actual KV updates; just check
    # here that the appropriate util function is called for each
    # component.
    consul_util.update_drive_state.assert_called_with([drive_fid],
                                                      ObjHealth.OFFLINE,
                                                      device_event=False)
    consul_util.set_process_state.assert_called_with(
        process_fid, ObjHealth.FAILED)
    consul_util.set_node_state.assert_called_with(node_fid,
                                                  ObjHealth.FAILED)
    consul_util.set_encl_state.assert_called_with(encl_fid,
                                                  ObjHealth.FAILED,
                                                  kv_cache=consul_cache)
    # This KV update is batched, so the check looks different.
    motr._write_updates.assert_any_call(
        [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

    # Check hax broadcast. We should see states updated to FAILED.
    broadcast_list = motr._ha_broadcast.call_args[0][0]
    self.assertTrue(_has_failed_note(broadcast_list, node_fid))
    self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, process_fid))
    self.assertTrue(_has_failed_note(broadcast_list, service_fid))
    self.assertTrue(_has_failed_note(broadcast_list, drive_fid))
def test_broadcast_io_service_failure(mocker, planner, motr, consumer,
                                      consul_util):
    def new_kv(key: str, val: str):
        return {
            'Key': key,
            'CreateIndex': 1793,
            'ModifyIndex': 1793,
            'LockIndex': 0,
            'Flags': 0,
            'Value': val,
            'Session': ''
        }

    def my_get(key: str, recurse: bool = False, **kwds):
        if key == 'm0conf/nodes' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "offline"
                  })),
                 ('m0conf/nodes/cmu/processes/6/services/rm', '16'),
                 ('m0conf/nodes/localhost/processes/7/services/rms', '17'),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services'
                  '/0x7300000000000001:0x17',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001'
                  ':0xc',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif key == 'm0conf/sites' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x5',
                  json.dumps({"state": "M0_NC_UNKNOWN"})),
                 ('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x6',
                  json.dumps({"state": "M0_NC_UNKNOWN"}))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3'
              '/processes' and recurse):
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001:0xc',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15' and recurse):
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15'):
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0x15',
                json.dumps({
                    "name": "m0_server",
                    "state": "failed"
                }))
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0xa'):
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0xa',
                json.dumps({
                    "name": "m0_server",
                    "state": "online"
                }))
        elif key == 'm0conf/nodes/localhost/processes/7/services/rms':
            return new_kv('m0conf/nodes/localhost/processes/7/services/rms',
                          '17')
        elif key == 'localhost/processes/0x7200000000000001:0x15':
            return new_kv(
                'localhost/processes/0x7200000000000001',
                json.dumps({
                    'type': 'M0_CONF_HA_PROCESS_OTHER',
                    'state': 'Unknown'
                }))
        elif key == 'm0conf/nodes/0x6e00000000000001:0x3':
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3',
                json.dumps({
                    "name": "localhost",
                    "state": "M0_NC_UNKNOWN"
                }))
        raise RuntimeError(f'Unexpected call: key={key}, recurse={recurse}')

    mocker.patch.object(consul_util.kv, 'kv_get', side_effect=my_get)
    # TODO: Handle 'kv_put' by updating kv returned by 'kv_get'
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util,
                        'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util,
                        'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))
    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15),
                status=ServiceHealth.FAILED)
    ])
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               io_service_failed())).exists(traces), \
        'IOservice failure not broadcast'
    assert AssertionPlan(tr_and(tr_method('ha_broadcast'),
                                node_fid_failed())).not_exists(traces), \
        'Node failure should not be broadcast'
def _do_work(self, q: Queue, motr: Motr):
    ffi = motr._ffi
    LOG.info('Handler thread has started')
    ffi.adopt_motr_thread()

    def pull_msg():
        try:
            return q.get(block=False)
        except Empty:
            return None

    try:
        while True:
            try:
                LOG.debug('Waiting for the next message')
                item = pull_msg()
                while item is None:
                    time.sleep(0.2)
                    if self.is_stopped:
                        raise StopIteration()
                    item = pull_msg()
                LOG.debug('Got %s message from queue', item)
                if isinstance(item, FirstEntrypointRequest):
                    LOG.debug('first entrypoint request, broadcast FAILED')
                    ids: List[MessageId] = motr.broadcast_ha_states([
                        HAState(fid=item.process_fid,
                                status=ServiceHealth.FAILED)
                    ])
                    LOG.debug('waiting for broadcast of %s for ep: %s',
                              ids, item.remote_rpc_endpoint)
                    self.herald.wait_for_all(HaLinkMessagePromise(ids))
                    motr.send_entrypoint_request_reply(
                        EntrypointRequest(
                            reply_context=item.reply_context,
                            req_id=item.req_id,
                            remote_rpc_endpoint=item.remote_rpc_endpoint,
                            process_fid=item.process_fid,
                            git_rev=item.git_rev,
                            pid=item.pid,
                            is_first_request=item.is_first_request))
                elif isinstance(item, EntrypointRequest):
                    # While replying, any exception is caught. In such a
                    # case, the motr process will receive EAGAIN and
                    # hence will need to make a new attempt by itself.
                    motr.send_entrypoint_request_reply(item)
                elif isinstance(item, ProcessEvent):
                    self._update_process_status(q, item.evt)
                elif isinstance(item, HaNvecGetEvent):
                    fn = motr.ha_nvec_get_reply
                    # If a consul-related exception appears, it will
                    # be processed by repeat_if_fails.
                    #
                    # This thread will become blocked until that
                    # intermittent error gets resolved.
                    decorated = (repeat_if_fails(wait_seconds=5))(fn)
                    decorated(item)
                elif isinstance(item, BroadcastHAStates):
                    LOG.info('HA states: %s', item.states)
                    ha_states = self.update_process_failure(q, item.states)
                    result: List[MessageId] = motr.broadcast_ha_states(
                        ha_states)
                    if item.reply_to:
                        item.reply_to.put(result)
                elif isinstance(item, StobIoqError):
                    LOG.info('Stob IOQ: %s', item.fid)
                    payload = dump_json(item)
                    LOG.debug('Stob IOQ JSON: %s', payload)
                    offset = self.eq_publisher.publish('stob-ioq', payload)
                    LOG.debug('Written to epoch: %s', offset)
                elif isinstance(item, SnsRepairStatus):
                    LOG.info('Requesting SNS repair status')
                    status = motr.get_repair_status(item.fid)
                    LOG.info('SNS repair status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStatus):
                    LOG.info('Requesting SNS rebalance status')
                    status = motr.get_rebalance_status(item.fid)
                    LOG.info('SNS rebalance status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStart):
                    LOG.info('Requesting SNS rebalance start')
                    motr.start_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceStop):
                    LOG.info('Requesting SNS rebalance stop')
                    motr.stop_rebalance(item.fid)
                elif isinstance(item, SnsRebalancePause):
                    LOG.info('Requesting SNS rebalance pause')
                    motr.pause_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceResume):
                    LOG.info('Requesting SNS rebalance resume')
                    motr.resume_rebalance(item.fid)
                elif isinstance(item, SnsRepairStart):
                    LOG.info('Requesting SNS repair start')
                    motr.start_repair(item.fid)
                elif isinstance(item, SnsRepairStop):
                    LOG.info('Requesting SNS repair stop')
                    motr.stop_repair(item.fid)
                elif isinstance(item, SnsRepairPause):
                    LOG.info('Requesting SNS repair pause')
                    motr.pause_repair(item.fid)
                elif isinstance(item, SnsRepairResume):
                    LOG.info('Requesting SNS repair resume')
                    motr.resume_repair(item.fid)
                else:
                    LOG.warning('Unsupported event type received: %s', item)
            except StopIteration:
                raise
            except Exception:
                # No-op: swallow the exception and keep the loop alive.
                LOG.exception('**ERROR**')
    except StopIteration:
        ffi.shun_motr_thread()
    finally:
        LOG.info('Handler thread has exited')
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    new_ha_states: List[HAState] = []
    proc_Health_to_status = {
        ObjHealth.OFFLINE: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ObjHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ObjHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
    }
    try:
        for state in ha_states:
            if state.fid.container == ObjT.PROCESS.value:
                current_status = self.consul.get_process_current_status(
                    state.status, state.fid)
                if current_status == ObjHealth.UNKNOWN:
                    continue
                proc_status_remote = self.consul.get_process_status(
                    state.fid)
                proc_status: Any = None
                # MKFS states are updated by the node corresponding to a
                # given process. So we ignore notifications for mkfs
                # processes.
                if proc_status_remote.proc_type in (
                        'Unknown',
                        m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name):
                    continue
                proc_type = m0HaProcessType.str_to_Enum(
                    proc_status_remote.proc_type)
                # The following cases are handled here:
                # 1. Delayed Consul service failure notification:
                #    - We re-confirm the current process state before
                #      notifying the process as offline/failed.
                # 2. Consul reported a process failure and the current
                #    process state is offline (this means the corresponding
                #    node is online, i.e. hax and Consul are online):
                #    - The process's status in the Consul KV might not be
                #      updated, as the process died abruptly. In this case
                #      we handle it as a local process failure, update the
                #      process status in the Consul KV and notify Motr.
                # 3. Consul reported a process failure and the current
                #    process state is failed (this means the node
                #    corresponding to the process also failed, i.e. hax and
                #    Consul are no more):
                #    - The process's status in the Consul KV might not be
                #      updated, as the node went down abruptly. In this
                #      case, when Consul reports failures for the
                #      corresponding node's processes, Hare verifies the
                #      node status and the Hare RC node processes the
                #      failures accordingly. This may take some time if the
                #      Consul servers lose quorum and need time to sync up
                #      their state.
                # 4. Consul reported a process failure, probably due to
                #    mkfs process completion (m0tr mkfs and m0ds share the
                #    same fid), which got delayed while the process is
                #    starting now:
                #    - Hare checks the current status of the process, but
                #      it is possible that the process state is not yet
                #      synced up within the quorum. In this case we
                #      continue processing the failure event; once the
                #      process starts successfully, Hare will eventually
                #      update and notify the process state.
                # 5. For some reason Consul may report a process as offline
                #    and subsequently report it as online; this may happen
                #    due to an intermittent monitor failure:
                #    - Hare must handle the change in process states
                #      accordingly in order to maintain the eventual
                #      consistency of the cluster state.
                proc_status = proc_Health_to_status.get(current_status)
                LOG.debug('current_status: %s proc_status_remote: %s',
                          current_status, proc_status_remote.proc_status)
                if proc_status is not None:
                    LOG.debug('proc_status: %s', proc_status.name)
                    if proc_status_remote.proc_status != proc_status.name:
                        if (self.consul.am_i_rc()
                                or self.consul.is_proc_local(state.fid)):
                            # Probably the process's node failed; in such a
                            # case only the RC must be allowed to update
                            # the process's persistent state.
                            # Or, if the node is alive, then allow the node
                            # to update the local process's state.
                            self.consul.update_process_status(
                                ConfHaProcess(chp_event=proc_status,
                                              chp_type=proc_type,
                                              chp_pid=0,
                                              fid=state.fid))
                        # RC or not RC, i.e. even without a persistent
                        # state update, it is important that the
                        # notification to local motr processes is still
                        # sent.
                        new_ha_states.append(
                            HAState(fid=state.fid, status=current_status))
                    if not self.consul.is_proc_local(state.fid):
                        proc_status_local = (
                            self.consul.get_process_status_local(
                                state.fid))
                        # Consul monitors a process every 1 second and
                        # this notification is sent to every node. Thus,
                        # to avoid notifying about the same process status
                        # multiple times, every node maintains a local
                        # copy of the remote process status, which is
                        # checked every time a Consul notification is
                        # received; the status is then notified locally to
                        # all the local motr processes accordingly.
                        if (proc_status_local.proc_status !=
                                proc_status.name):
                            self.consul.update_process_status_local(
                                ConfHaProcess(chp_event=proc_status,
                                              chp_type=proc_type,
                                              chp_pid=0,
                                              fid=state.fid))
                            new_ha_states.append(
                                HAState(fid=state.fid,
                                        status=current_status))
                else:
                    continue
            else:
                new_ha_states.append(state)
    except Exception as e:
        raise HAConsistencyException('failed to process ha states') from e
    return new_ha_states
def _update_process_status(self, p: WorkPlanner, motr: Motr,
                           event: ConfHaProcess) -> None:
    LOG.info('Updating process status: %s', event.fid)
    # If a consul-related exception appears, it will
    # be processed by repeat_if_fails.
    #
    # This thread will become blocked until that
    # intermittent error gets resolved.
    motr_to_svc_status = {
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (ObjHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (ObjHealth.FAILED),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (ObjHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (ObjHealth.FAILED),
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (ObjHealth.OK),
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (ObjHealth.FAILED)
    }
    if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                           m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
        broadcast_hax_only = False
        if ((event.chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS)
                or (event.fid == self.consul.get_hax_fid())):
            # Motr-mkfs processes do not require updates on their peer
            # mkfs processes. Motr-mkfs is an independent and typically a
            # one-time operation. So avoid broadcasting a motr-mkfs state
            # to the peer motr-mkfs processes, but hax still needs to be
            # notified in order to disconnect the hax-motr halink when
            # the motr-mkfs process stops.
            broadcast_hax_only = True

        LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                  broadcast_hax_only)
        motr.broadcast_ha_states(
            [HAState(fid=event.fid, status=svc_status)],
            broadcast_hax_only=broadcast_hax_only)
    self.consul.update_process_status(event)

    # If we receive M0_CONF_HA_PROCESS_STARTED for an M0D process, check
    # whether all the M0D processes on the local node have started. If so,
    # send a node online event to the MessageBus.
    if event.chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED:
        try:
            util: ConsulUtil = ConsulUtil()
            producer = get_producer(util)
            if producer:
                producer.check_and_send(parent_resource_type=ObjT.NODE,
                                        fid=event.fid,
                                        resource_status='online')
            else:
                LOG.warning('Could not send an event as the producer'
                            ' is not available')
        except Exception as e:
            LOG.warning("Send event failed due to '%s'", e)