def prepare_svc(self, svc_id: str, name: str):
    ep = self.provider.get_service_ep(svc_id)
    if not ep:
        raise RuntimeError('Cannot get service endpoint.')
    addr = self.get_service_addr(ep)
    port = self.get_service_port(ep)
    checks: Dict[str, Any] = {}
    checks['args'] = ['/opt/seagate/cortx/hare/libexec/check-service']
    checks['interval'] = '1s'
    checks['status'] = 'warning'
    # Select the service check args according to the service name.
    if name == 'hax':
        checks['args'].append('--hax')
    elif name in ('confd', 'ios'):
        fid = Fid(ObjT.PROCESS.value, int(svc_id))
        checks['args'].extend(['--fid', str(fid)])
    elif name == 's3service':
        fid = Fid(ObjT.PROCESS.value, int(svc_id))
        s3svc = 's3server@' + str(fid)
        checks['args'].extend(['--svc', s3svc])
    return Service(id=svc_id,
                   name=name,
                   address=addr,
                   port=port,
                   checks=[checks])
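# Illustrative sketch (not part of the source): for a hypothetical 'ios'
# service with svc_id='43', prepare_svc() above yields a Consul service
# definition roughly like the following; the address and port are made up.
#
#   {
#       'id': '43',
#       'name': 'ios',
#       'address': '192.168.0.28',
#       'port': 8000,
#       'checks': [{
#           'args': ['/opt/seagate/cortx/hare/libexec/check-service',
#                    '--fid', '0x7200000000000001:0x2b'],
#           'interval': '1s',
#           'status': 'warning'
#       }]
#   }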
def test_nonmkfs_process_stop_causes_drive_offline(mocker, motr, consul_util):
    mocker.patch.object(consul_util.kv,
                        'kv_get',
                        side_effect=create_stub_get('M0_CONF_HA_PROCESS_M0D'))
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util, 'update_drive_state')
    mocker.patch.object(consul_util,
                        'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util,
                        'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))
    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15),
                status=ServiceHealth.FAILED)
    ])
    assert consul_util.update_drive_state.called, \
        'The drive state should be updated in Consul KV'
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               contains_drive())).exists(traces), \
        'DRIVE must be broadcast when non-MKFS process is stopped'
def node_to_drive_fid(self, node_name: str, drive: str):
    sdev_fid: Fid = Fid(0, 0)
    # We extract the sdev fid as follows,
    # e.g. node_name=ssc-vm-c-0553.colo.seagate.com
    #      drive=/dev/vdf
    # 1. m0conf/nodes/ssc-vm-c-0553.colo.seagate.com/processes/41/
    #    services/ios:43
    # 2. Create the ioservice Motr fid.
    # 3. Fetch the Consul KV entry for the ios fid,
    #    m0conf/nodes/0x6e00000000000001:0x20/processes/
    #    0x7200000000000001:0x29/services/0x7300000000000001:0x2b/
    #    sdevs/0x6400000000000001:0x2c:
    #    {"path": "/dev/vdf", "state": "M0_NC_UNKNOWN"}
    # 4. Find the drive name in the JSON value and extract the sdev fid
    #    from the key, 0x6400000000000001:0x2c.
    # 5. Create the sdev fid from the sdev fid key.
    process_items = self.kv.kv_get(f'm0conf/nodes/{node_name}/processes',
                                   recurse=True)
    for x in process_items:
        if '/ios' in x['Key']:
            fidk_ios = x['Value']
    ios_fid = create_service_fid(int(fidk_ios))
    sdev_items = self.kv.kv_get('m0conf/nodes', recurse=True)
    for x in sdev_items:
        if f'/{ios_fid}/' in x['Key']:
            if json.loads(x['Value'])['path'] == drive:
                # Using constant index 8 for the sdev fid.
                # Fix this by changing the Consul schema to provide a
                # direct mapping of drive path to sdev.
                sdev_fid_item = x['Key'].split('/')[8]
                sdev_fidk = Fid.parse(sdev_fid_item).key
                sdev_fid = create_sdev_fid(sdev_fidk)
                break
    return self.sdev_to_drive_fid(sdev_fid)
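# A minimal sketch of the index-8 split used in node_to_drive_fid() above,
# applied to the hypothetical key from the comment: splitting on '/' puts
# the sdev fid at position 8.
#
#   key = ('m0conf/nodes/0x6e00000000000001:0x20/processes/'
#          '0x7200000000000001:0x29/services/0x7300000000000001:0x2b/'
#          'sdevs/0x6400000000000001:0x2c')
#   key.split('/')[8]  # -> '0x6400000000000001:0x2c'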
async def test_bq_stob_message_deserialized(hax_client, planner, herald,
                                            consul_util, mocker):
    def fake_get(key, allow_null):
        # ret = {'bq-delivered/192.168.0.28': ''}
        ret = {'bq-delivered/localhost': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')

    # InboxFilter will try to read epoch - let's mock KV operations
    stob = StobId(Fid(12, 13), Fid(14, 15))
    msg = StobIoqError(fid=Fid(5, 6),
                       conf_sdev=Fid(0x103, 0x204),
                       stob_id=stob,
                       fd=42,
                       opcode=4,
                       rc=2,
                       offset=0xBF,
                       size=100,
                       bshift=4)
    # Here we make sure that a real StobIoqError can be used as the payload
    # for the STOB_IOQ_ERROR bq message.
    stob_payload = dump_json(msg)
    parsed_stob = simplejson.loads(stob_payload)
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {'message_type': 'STOB_IOQ_ERROR', 'payload': parsed_stob}
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()
    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x103, 0x204), status=ObjHealth.FAILED)]))
def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
          rm_service: Fid):
    LOG.debug('Starting m0_halon_interface')
    self._process_fid = process
    result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                             process.to_c(), ha_service.to_c(),
                             rm_service.to_c())
    if result:
        LOG.error(
            'Cannot start Motr API. m0_halon_interface::start'
            ' returned non-zero code (%s)', result)
        raise RuntimeError('Cannot start m0_halon_interface. '
                           'Please check Motr logs for more details.')
def get_node_encl_fid(self, node: str) -> Optional[Fid]:
    """
    Returns the fid of the enclosure for the given node.

    Parameters:
        node : hostname of the node.
    """
    # Example,
    # {
    #   "key": "m0conf/sites/0x5300000000000001:0x1/
    #           racks/0x6100000000000001:0x2/encls/
    #           0x6500000000000001:0x4",
    #   "value": "{\"node\": \"0x6e00000000000001:0x3\",
    #              \"state\": \"M0_NC_UNKNOWN\"}"
    # },
    node_fid = self.get_node_fid(node)
    if not node_fid:
        return None
    encl_items = self.kv.kv_get('m0conf/sites', recurse=True)
    regex = re.compile('^m0conf\\/.*\\/racks\\/.*\\/encls\\/([^/]+)$')
    for encl in encl_items:
        match_result = re.match(regex, encl['Key'])
        if not match_result:
            continue
        encl_value = json.loads(encl['Value'])
        if 'node' in encl_value and encl_value['node'] == str(node_fid):
            encl_fid: str = match_result.group(1)
            return Fid.parse(encl_fid)
    return None
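# Illustrative sketch: the regex above captures the enclosure fid as the
# final path segment of an 'encls' key; the key below is the hypothetical
# one from the comment.
#
#   regex = re.compile('^m0conf\\/.*\\/racks\\/.*\\/encls\\/([^/]+)$')
#   key = ('m0conf/sites/0x5300000000000001:0x1/racks/'
#          '0x6100000000000001:0x2/encls/0x6500000000000001:0x4')
#   regex.match(key).group(1)  # -> '0x6500000000000001:0x4'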
def get_node_ctrl_fid(self, node: str) -> Optional[Fid]:
    """
    Returns the fid of the controller for the given node.

    Parameters:
        node : hostname of the node.
    """
    # Example,
    # {
    #   "key": "m0conf/sites/0x5300000000000001:0x1/
    #           racks/0x6100000000000001:0x2/encls/
    #           0x6500000000000001:0x4/ctrls/0x6300000000000001:0x5",
    # },
    encl_fid = self.get_node_encl_fid(node)
    if not encl_fid:
        return None
    ctrl_items = self.kv.kv_get('m0conf/sites', recurse=True)
    regex = re.compile(
        f'^m0conf\\/.*\\/racks\\/.*\\/encls\\/{encl_fid}\\/ctrls\\/'
        '([^/]+)$')
    for ctrl in ctrl_items:
        match_result = re.match(regex, ctrl['Key'])
        if not match_result:
            continue
        ctrl_fid: str = match_result.group(1)
        return Fid.parse(ctrl_fid)
    return None
def get_disks_by_parent_process(self,
                                process_fid: Fid,
                                svc_fid: Fid) -> List[Fid]:
    node_items = self.kv.kv_get('m0conf/nodes', recurse=True)
    # This is the RegExp to match the keys in Consul KV that describe
    # the Motr processes and services that are enclosed into the Motr
    # process that has the given process_fid.
    #
    # Note: we assume that process_fid uniquely identifies the given
    # process within the whole cluster (that's why we are not interested
    # in the hostnames here).
    #
    # Example of a key prefix that will match (the matching key itself
    # continues with the sdev segment, '/sdevs/<sdev_fid>'):
    # m0conf/nodes/0x6e00000000000001:0x3b/processes/
    #     0x7200000000000001:0x44/services/0x7300000000000001:0x46
    regex = re.compile(
        f'^m0conf\\/.*\\/processes\\/{process_fid}\\/services\\/'
        f'{svc_fid}\\/(.+)$')
    disks = []
    for node in node_items:
        match_result = re.match(regex, node['Key'])
        if not match_result:
            continue
        sdev_fid_item = node['Key'].split('/')[8]
        sdev_fidk = Fid.parse(sdev_fid_item).key
        sdev_fid = create_sdev_fid(sdev_fidk)
        disk_fid = self.sdev_to_drive_fid(sdev_fid)
        disks.append(disk_fid)
    return disks
def _generate_sub_disks(self,
                        note: HaNoteStruct,
                        services: List[FidWithType],
                        cns: ConsulUtil,
                        kv_cache=None) -> List[HaNoteStruct]:
    disk_list = []
    new_state = note.no_state
    proc_fid = Fid.from_struct(note.no_id)
    state = (ObjHealth.OK if new_state == HaNoteStruct.M0_NC_ONLINE
             else ObjHealth.OFFLINE)
    is_mkfs = self._is_mkfs(proc_fid)
    mkfs_down = is_mkfs and state != ObjHealth.OK
    if not mkfs_down:
        for svc in services:
            disk_list += cns.get_disks_by_parent_process(proc_fid, svc.fid)
    if disk_list:
        # XXX: Need to check the current state of the device, transition
        # to ONLINE only in case of an explicit request or iff the prior
        # state of the device is UNKNOWN/OFFLINE.
        if not mkfs_down:
            # We don't mark the devices as failed if the process is MKFS
            # and if its effective status is STOPPED (see EOS-24124).
            cns.update_drive_state(disk_list, state, device_event=False)
    LOG.debug('proc fid=%s encloses %d disks as follows: %s', proc_fid,
              len(disk_list), disk_list)
    drive_ha_notes: List[HaNoteStruct] = []
    for drive_id in disk_list:
        # Get the drive state from Consul KV.
        dstate = cns.get_sdev_state(ObjT.DRIVE, drive_id.key)
        drive_ha_notes.append(
            HaNoteStruct(no_id=drive_id.to_c(), no_state=dstate))
    return drive_ha_notes
def fn():
    proc_state_to_objhealth = {
        'M0_CONF_HA_PROCESS_STARTING': ObjHealth.OFFLINE,
        'M0_CONF_HA_PROCESS_STARTED': ObjHealth.OK,
        'M0_CONF_HA_PROCESS_STOPPING': ObjHealth.OFFLINE,
        'M0_CONF_HA_PROCESS_STOPPED': ObjHealth.OFFLINE
    }
    # import pudb.remote
    # pudb.remote.set_trace(term_size=(80, 40), port=9998)
    ha_states: List[HAState] = []
    LOG.debug('process status: %s', data)
    for item in data:
        proc_val = base64.b64decode(item['Value'])
        proc_status = json.loads(str(proc_val.decode('utf-8')))
        LOG.debug('process update item key %s item val: %s',
                  item['Key'].split('/')[1], proc_status)
        proc_fid = Fid.parse(item['Key'].split('/')[1])
        proc_state = proc_status['state']
        proc_type = proc_status['type']
        if (proc_type != 'M0_CONF_HA_PROCESS_M0MKFS'
                and proc_state in ('M0_CONF_HA_PROCESS_STARTED',
                                   'M0_CONF_HA_PROCESS_STOPPED')):
            ha_states.append(
                HAState(fid=proc_fid,
                        status=proc_state_to_objhealth[proc_state]))
    planner.add_command(
        BroadcastHAStates(states=ha_states, reply_to=None))
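# A minimal sketch (hypothetical values) of one watch item consumed by fn()
# above: Consul delivers 'Value' as base64-encoded JSON describing the
# process state.
#
#   item = {
#       'Key': 'processes/0x7200000000000001:0x15',
#       'Value': base64.b64encode(b'{"state": "M0_CONF_HA_PROCESS_STARTED",'
#                                 b' "type": "M0_CONF_HA_PROCESS_M0D"}')
#   }
#   json.loads(base64.b64decode(item['Value']).decode('utf-8'))['state']
#   # -> 'M0_CONF_HA_PROCESS_STARTED'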
def get_pver_status(self, pver_fid: Fid) -> PverInfo:
    status: PverInfo = self._ffi.pver_status_fetch(self._ha_ctx,
                                                   pver_fid.to_c())
    if not status:
        raise BytecountException('Pool version status unavailable')
    LOG.debug('Pver status for pver %s: %s', pver_fid, status.state)
    return status
def is_node_failed(self, proc_note: HaNoteStruct, kv_cache=None):
    proc_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == proc_fid.container
    node = self.consul_util.get_process_node(proc_fid, kv_cache=kv_cache)
    return self.consul_util.all_io_services_failed(node, kv_cache=kv_cache)
async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    service_health = [{
        'Node': {
            'Node': 'localhost',
            'Address': '10.1.10.12',
        },
        'Service': {
            'ID': '12',
            'Service': 'ios',
            'Tags': [],
            'Port': 8000,
        },
        'Checks': [
            {
                'Node': '12',
                'CheckID': 'service:ios',
                'Name': "Service 'ios' check",
                'Status': status,
                'Notes': '',
                'Output': '',
                'ServiceID': '12',
                'ServiceName': 'ios',
            },
        ],
    }]
    resp = await hax_client.post('/', json=service_health)
    assert resp.status == 200
    assert planner.add_command.called
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(
            states=[HAState(fid=Fid(0x7200000000000001, 12), status=health)],
            reply_to=None))
def notify_node_status_by_process(
        self, proc_note: HaNoteStruct) -> List[HaNoteStruct]:
    # proc_note.no_state is of int type
    new_state = ServiceHealth.from_ha_note_state(proc_note.no_state)
    proc_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == proc_fid.container
    LOG.debug('Notifying node status for process_fid=%s state=%s',
              proc_fid, new_state)
    node = self.consul_util.get_process_node(proc_fid)

    if new_state == ServiceHealth.OK:
        # A node can have multiple controllers, and the node can be online
        # with a single controller running.
        # If we receive process 'OK', only the process state is updated,
        # so we need to update the corresponding controller state as well.
        ctrl_fid = self.consul_util.get_ioservice_ctrl_fid(proc_fid)
        if ctrl_fid:
            self.consul_util.set_ctrl_state(ctrl_fid, new_state)

    node_fid = self.consul_util.get_node_fid(node)
    notes = self.add_node_state_by_fid(node_fid, new_state)
    notes += self.add_enclosing_devices_by_node(node_fid, new_state,
                                                node=node)
    return notes
def ha_nvec_set_process(self, event: HaNvecSetEvent) -> None:
    LOG.debug('Processing HaNvecSetEvent (nvec size = %s)',
              len(event.nvec))
    self.consul_util.get_all_nodes()
    ha_states: List[HAState] = []
    bcast_ss: List[HAState] = []
    for n in event.nvec:
        fid = Fid.from_struct(n.note.no_id)
        obj_health = ObjHealth.from_ha_note_state(n.note.no_state)
        ha_states.append(HAState(fid, obj_health))
        if n.note.no_state in {HaNoteStruct.M0_NC_REPAIRED,
                               HaNoteStruct.M0_NC_ONLINE}:
            bcast_ss.append(HAState(fid, obj_health))
        # In case of failed repair, roll back to failed state.
        elif n.note.no_state == HaNoteStruct.M0_NC_REPAIR:
            obj_health = ObjHealth.from_ha_note_state(
                HaNoteStruct.M0_NC_FAILED)
            bcast_ss.append(HAState(fid, obj_health))
        # In case of failed rebalance, roll back to repaired state.
        elif n.note.no_state == HaNoteStruct.M0_NC_REBALANCE:
            obj_health = ObjHealth.from_ha_note_state(
                HaNoteStruct.M0_NC_REPAIRED)
            bcast_ss.append(HAState(fid, obj_health))
    LOG.debug('got ha_states %s', ha_states)
    if bcast_ss:
        self.broadcast_ha_states(bcast_ss)
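# Summary of the rollback logic in ha_nvec_set_process() above
# (incoming nvec state -> state included in the broadcast):
#
#   M0_NC_ONLINE    -> broadcast as-is
#   M0_NC_REPAIRED  -> broadcast as-is
#   M0_NC_REPAIR    -> rolled back to M0_NC_FAILED (failed repair)
#   M0_NC_REBALANCE -> rolled back to M0_NC_REPAIRED (failed rebalance)
#
# Any other state is recorded in ha_states but not broadcast.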
def pause_repair(self, pool_fid: Fid):
    LOG.debug('Pausing repair for pool %s', pool_fid)
    result: int = self._ffi.pause_repair(self._ha_ctx, pool_fid.to_c())
    if result:
        raise RepairRebalanceException(
            'Failed to send SPIEL request "sns_repair_pause", please'
            ' check Motr logs for more details.')
    LOG.debug('Repairing paused for pool %s', pool_fid)
def get_proc_bytecount(self, proc_fid: Fid) -> ByteCountStats:
    bytecount: ByteCountStats = self._ffi.proc_bytecount_fetch(
        self._ha_ctx, proc_fid.to_c())
    if not bytecount:
        raise BytecountException('Bytecount stats unavailable')
    LOG.debug('Bytecount status for proc fid: %s, stats = %s',
              str(bytecount.proc_fid), bytecount.pvers)
    return bytecount
def resume_rebalance(self, pool_fid: Fid):
    LOG.debug('Resuming rebalance for pool %s', pool_fid)
    result: int = self._ffi.resume_rebalance(self._ha_ctx, pool_fid.to_c())
    if result:
        raise RepairRebalanceException(
            'Failed to send SPIEL request "sns_rebalance_resume", please'
            ' check Motr logs for more details.')
    LOG.debug('Rebalancing resumed for pool %s', pool_fid)
def stop_rebalance(self, pool_fid: Fid):
    logging.debug('Stopping rebalance for pool %s', pool_fid)
    result: int = self._ffi.stop_rebalance(self._ha_ctx, pool_fid.to_c())
    if result:
        raise RepairRebalanceException(
            'Failed to send SPIEL request "sns_rebalance_stop", please'
            ' check Motr logs for more details.')
    logging.debug('Rebalancing stopped for pool %s', pool_fid)
def start_repair(self, pool_fid: Fid):
    logging.debug('Initiating repair for pool %s', pool_fid)
    result: int = self._ffi.start_repair(self._ha_ctx, pool_fid.to_c())
    if result:
        raise RepairRebalanceException(
            'Failed to send SPIEL request "sns_repair_start", please'
            ' check Motr logs for more details.')
    logging.debug('Repairing started for pool %s', pool_fid)
def get_service_process_fid(self, svc_fid: Fid) -> Fid:
    assert ObjT.SERVICE.value == svc_fid.container
    node_items = self.kv.kv_get('m0conf/nodes', recurse=True)
    keys = self.get_service_keys(node_items, svc_fid.key)
    assert len(keys) == 1
    process_fid: str = keys[0].split('/')[4]
    pfid = Fid.parse(process_fid)
    return pfid
def entrypoint():
    return EntrypointRequest(reply_context='test',
                             req_id=Uint128(1, 2),
                             remote_rpc_endpoint='endpoint',
                             process_fid=Fid(1, 2),
                             git_rev='HEAD',
                             pid=123,
                             is_first_request=False)
def get_rebalance_status(self, pool_fid: Fid) -> List[ReprebStatus]:
    LOG.debug('Fetching rebalance status for pool %s', pool_fid)
    status: List[ReprebStatus] = self._ffi.rebalance_status(
        self._ha_ctx, pool_fid.to_c())
    if status is None:
        raise RepairRebalanceException('rebalance status unavailable')
    LOG.debug('rebalance status for pool %s: %s', pool_fid, status)
    return status
def add_node_state_by_fid(self, node_fid: Fid,
                          new_state: ObjHealth) -> List[HaNoteStruct]:
    # Update the node state in Consul KV.
    self.consul_util.set_node_state(node_fid, new_state)
    state_int = new_state.to_ha_note_status()
    return [HaNoteStruct(no_id=node_fid.to_c(), no_state=state_int)]
def generate_confd(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
    fid = Fid(ObjT.PROCESS.value, int(svc_id))
    ep = self.provider.get_service_ep(svc_id)
    filename = f'm0d-{fid}'
    contents = (f"MOTR_M0D_EP='{ep}'\n"
                f"MOTR_HA_EP='{hax_ep}'\n"
                f"MOTR_PROCESS_FID='{fid}'\n"
                f"MOTR_CONF_XC='{motr_conf_dir}/confd.xc'\n")
    self._write_file(motr_conf_dir + self.sysconf_dir + filename, contents)
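# Illustrative sketch (hypothetical endpoints and paths): for svc_id='44',
# generate_confd() above writes a sysconfig file named
# 'm0d-0x7200000000000001:0x2c' with contents like:
#
#   MOTR_M0D_EP='192.168.0.28@tcp:12345:2:1'
#   MOTR_HA_EP='192.168.0.28@tcp:12345:1:1'
#   MOTR_PROCESS_FID='0x7200000000000001:0x2c'
#   MOTR_CONF_XC='/etc/motr/confd.xc'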
def get_svc_fids(self, svc_name: str) -> List[str]:
    IDs = self.get_all_svc_ids()
    id_map = {
        'hax': IDs['HAX_ID'],
        'confd': IDs['CONFD_IDs'],
        'ios': IDs['IOS_IDs'],
        's3': IDs['S3_IDs']
    }
    return [str(Fid(ObjT.PROCESS.value, int(x))) for x in id_map[svc_name]]
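# Illustrative usage (hypothetical ids): with CONFD_IDs == ['7', '8'],
# get_svc_fids('confd') returns
# ['0x7200000000000001:0x7', '0x7200000000000001:0x8'].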
def generate_ios(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
    fid = Fid(ObjT.PROCESS.value, int(svc_id))
    ep = self.provider.get_service_ep(svc_id)
    meta_data = self.provider.get_ios_meta_data(svc_id)
    filename = f'm0d-{fid}'
    contents = (f"MOTR_M0D_EP='{ep}'\n"
                f"MOTR_HA_EP='{hax_ep}'\n"
                f"MOTR_PROCESS_FID='{fid}'\n")
    if meta_data:
        contents += f'MOTR_BE_SEG_PATH={meta_data}\n'
    self._write_file(motr_conf_dir + self.sysconf_dir + filename, contents)
def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
          profile: Profile):
    LOG.debug('Starting m0_halon_interface')
    self._process_fid = process
    self._profile = profile

    @repeat_if_fails()
    def _get_rm_fid() -> Fid:
        return self.consul_util.get_rm_fid()

    rm_fid = _get_rm_fid()
    result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                             process.to_c(), ha_service.to_c(),
                             rm_fid.to_c())
    if result:
        LOG.error(
            'Cannot start Motr API. m0_halon_interface::start'
            ' returned non-zero code (%s)', result)
        raise RuntimeError('Cannot start m0_halon_interface. '
                           'Please check Motr logs for more details.')
def _generate_sub_services(self, note: HaNoteStruct,
                           cns: ConsulUtil) -> List[HaNoteStruct]:
    new_state = note.no_state
    fid = Fid.from_struct(note.no_id)
    service_list = cns.get_services_by_parent_process(fid)
    LOG.debug('Process fid=%s encloses %s services as follows: %s', fid,
              len(service_list), service_list)
    return [
        HaNoteStruct(no_id=x.fid.to_c(), no_state=new_state)
        for x in service_list
    ]
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    fid = Fid.parse(payload['conf_sdev'])
    if fid.is_null():
        LOG.debug('Fid is 0:0. Skipping the message.')
        return
    q: Queue = Queue(1)
    self.planner.add_command(
        BroadcastHAStates(states=[HAState(fid, status=ObjHealth.FAILED)],
                          reply_to=q))
    ids: List[MessageId] = q.get()
    self.herald.wait_for_any(HaLinkMessagePromise(ids))
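# A minimal sketch of the payload consumed by handle_ioq_stob_error() above;
# only 'conf_sdev' is read here, the remaining StobIoqError fields are
# elided. The fid value is hypothetical.
#
#   payload = {'conf_sdev': '0x6400000000000001:0x2c', ...}
#   Fid.parse(payload['conf_sdev'])  # -> Fid(0x6400000000000001, 0x2c)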