def _get_motr_fids(util: ConsulUtil) -> HL_Fids:
    """
    Collect the hax endpoint, the hax fid, the HA fid and the profiles
    from the given ConsulUtil and bundle them into an HL_Fids value.

    Raises HAConsistencyException ('failed to get motr fids') if any of
    the lookups raises or if any of the four values is empty/falsy.
    """
    try:
        endpoint: str = util.get_hax_endpoint()
        hax: Fid = util.get_hax_fid()
        ha: Fid = util.get_ha_fid()
        profile_list = util.get_profiles()
        # Every piece is mandatory; the error raised here is itself
        # re-wrapped by the generic handler below.
        if not all((endpoint, hax, ha, profile_list)):
            raise HAConsistencyException('fids and profiles unavailable')
    except Exception as err:
        raise HAConsistencyException('failed to get motr fids') from err
    return HL_Fids(endpoint, hax, ha, profile_list)
def get_session_node(self, session_id: str) -> str:
    """
    Return the name of the node recorded in the given Consul session.

    The session record is taken from element [1] of the value returned
    by `self.cns.session.info()` and its 'Node' field is returned as str.

    Raises HAConsistencyException if the Consul client reports a
    communication problem. Transport-level errors (HTTPError,
    RequestException) are wrapped too, in line with the other Consul
    helpers, so that callers only need to handle one exception type.
    """
    try:
        session = self.cns.session.info(session_id)[1]
        return str(session['Node'])  # principal RM
    except (ConsulException, HTTPError, RequestException) as e:
        raise HAConsistencyException('Failed to communicate to'
                                     ' Consul Agent') from e
def kv_put_in_transaction(self, tx_payload: List[TxPutKV]) -> bool:
    """
    Write all the given key/value pairs through a single call to
    `self.cns.txn.put()`.

    Returns True when the call succeeds and False when the client raises
    ClientError (i.e. the transaction was rejected). Other Consul or
    transport errors are re-raised as HAConsistencyException.
    """
    def as_txn_op(item: TxPutKV) -> Dict[str, Any]:
        # Values are sent base64-encoded.
        encoded = b64encode(item.value.encode()).decode()
        op: Dict[str, Any] = {
            'Key': item.key,
            'Value': encoded,
            'Verb': 'set'
        }
        if item.cas:
            # A truthy `cas` turns the plain set into a check-and-set
            # against that index.
            op['Verb'] = 'cas'
            op['Index'] = item.cas
        return {'KV': op}

    try:
        operations = [as_txn_op(item) for item in tx_payload]
        self.cns.txn.put(operations)
    except ClientError:
        # If a transaction fails, Consul returns HTTP 409 with the
        # JSON payload describing the reason why the transaction
        # was rejected.
        # The library transforms HTTP 409 into generic ClientException.
        # Unfortunately, we can't easily extract the payload from it.
        return False
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to put value to KV') from err
    return True
def get_node_health(self, node: str) -> str:
    """
    Return the 'Status' field of the first health record that Consul
    reports for the given node, converted to str.

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        health_records = self.cns.health.node(node)[1]
        first_record = health_records[0]
        return str(first_record['Status'])
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException(
            f'Failed to get {node} node health') from err
def processes_node(cns: Consul, node_name: str) -> Dict[str, List[Process]]:
    """
    Processes of the given node grouped by Consul service name.

    Every health record with a non-empty 'ServiceName' becomes a Process
    entry under that name; in addition one synthetic 'consul' entry is
    appended per matching catalog node, describing the Consul agent.

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        grouped: Dict[str, List[Process]] = {}
        local_name = ConsulUtil(raw_client=cns).get_local_nodename()
        runs_here = node_name == local_name
        for entry in cns.catalog.nodes()[1]:
            current = entry['Node']
            if node_name != current:
                continue
            for record in cns.health.node(current)[1]:
                service = record['ServiceName']
                if not service:
                    # Records without a service name are skipped.
                    continue
                fid_key = int(record['ServiceID'])
                grouped.setdefault(service, []).append(
                    Process(node=current,
                            consul_name=service,
                            systemd_name=get_systemd_name(fid_key, service),
                            fidk=fid_key,
                            is_local=runs_here,
                            status=record['Status']))
            if consul_is_active_at(current):
                agent_status = 'passing'
            else:
                agent_status = 'offline'
            grouped.setdefault('consul', []).append(
                Process(node=current,
                        consul_name='consul',
                        systemd_name='hare-consul-agent',
                        fidk=0,
                        is_local=runs_here,
                        status=agent_status))
        return grouped
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Could not access Consul services')\
            from err
def _service_by_name(self, hostname: str, svc_name: str) -> Dict[str, Any]:
    """
    Return the first catalog record of service `svc_name` whose 'Node'
    equals `hostname`.

    Raises HAConsistencyException when no record matches.
    """
    records = self.catalog.get_services(svc_name)
    match = next((rec for rec in records if rec['Node'] == hostname), None)
    if match is None:
        raise HAConsistencyException(
            f'No {svc_name!r} Consul service found at node {hostname!r}')
    return match
def get_kv(cns: Consul, key: str) -> str:
    """
    Read `key` from the Consul KV and return its value decoded to str.

    An empty string is returned when the lookup yields no record or when
    the record's 'Value' is None. Consul and transport errors are
    re-raised as HAConsistencyException.
    """
    try:
        record: Dict[str, bytes] = cns.kv.get(key)[1]
        if not record:
            return ''
        raw = record['Value']
        return '' if raw is None else raw.decode()
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Could not access Consul KV')\
            from err
def _start(self, port: int) -> None:
    """
    Run `self.app` as a web server bound to this node's own address
    (as returned by `self._get_my_hostname()`) and the given port.

    Any exception raised on the way is re-raised as
    HAConsistencyException.
    """
    try:
        web_address = self._get_my_hostname()
        LOG.info(f'Starting HTTP server at {web_address}:{port} ...')
        listen_on = {'host': web_address, 'port': port}
        web.run_app(self.app, **listen_on)
    except Exception as err:
        raise HAConsistencyException(
            'Failed to start web server, trying again...') from err
def get_local_nodename(self) -> str:
    """
    Return the name of the current node.

    A non-empty HARE_HAX_NODE_NAME environment variable takes
    precedence; otherwise the name is read from the Consul agent's
    self-description (Config -> NodeName).

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        from_env = os.environ.get('HARE_HAX_NODE_NAME')
        if from_env:
            return from_env
        return self.cns.agent.self()['Config']['NodeName']
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from err
def kv_put(self, key: str, data: str, **kwargs) -> bool:
    """
    Helper method that should be used by default in this class
    whenever we want to invoke Consul.kv.put().

    Extra keyword arguments are forwarded to the client unchanged and
    the client's result is returned as is. Consul and transport errors
    are re-raised as HAConsistencyException.
    """
    assert key
    try:
        outcome = self.cns.kv.put(key, data, **kwargs)
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to put value to KV') from err
    return outcome
def kv_get_raw(self, key: str, **kwargs) -> Tuple[int, Any]:
    """
    Helper method that should be used by default in this class
    whenever we want to invoke Consul.kv.get().

    Extra keyword arguments are forwarded to the client unchanged and
    the client's result is returned untouched. Consul and transport
    errors are re-raised as HAConsistencyException.
    """
    assert key
    try:
        reply = self.cns.kv.get(key, **kwargs)
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Could not access Consul KV') from err
    return reply
def _catalog_service_names(self) -> List[str]:
    """
    Return the names of all the services that the Consul catalog
    currently lists.

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        by_name: Dict[str, List[Any]] = self.cns.catalog.services()[1]
        return [name for name in by_name.keys()]
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Cannot access Consul catalog') from err
def destroy_session(self, session: str) -> None:
    """
    Destroys the given Consul Session by name. The method doesn't raise
    any exception if the session doesn't exist.

    Consul and transport errors are re-raised as HAConsistencyException.
    The message carries the text of the original error and, like in the
    other Consul helpers, the original exception is attached as the
    explicit cause.
    """
    try:
        self.cns.session.destroy(session)
    except (ConsulException, HTTPError, RequestException) as e:
        raise HAConsistencyException('Failed to communicate to'
                                     ' Consul Agent: ' + str(e)) from e
def get_services(self, svc_name: str) -> List[Dict[str, Any]]:
    """
    Return the catalog record(s) of the service(s) registered in Consul
    under the given name.

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        # TODO refactor catalog operations into a separate class
        reply = self.cns.catalog.service(service=svc_name)
        return reply[1]
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException(
            'Could not access Consul Catalog') from err
def get_leader_session_no_wait(self) -> str:
    """
    Returns the RC leader session. HAConsistencyException is raised
    immediately if there is no RC leader selected at the moment.

    The session is read from the 'Session' field of the 'leader' KV
    record obtained through `self.kv.kv_get('leader')`.
    """
    leader = self.kv.kv_get('leader')
    try:
        return str(leader['Session'])
    except (KeyError, TypeError) as e:
        # KeyError: the record exists but no session holds it.
        # TypeError: the lookup produced a non-subscriptable value
        # (presumably None when the 'leader' key is absent - confirm
        # against kv_get). Both mean "no leader right now".
        raise HAConsistencyException(
            'Could not get the leader from Consul') from e
def _local_service_by_name(self, name: str) -> Dict[str, Any]:
    """
    Look up the record of service `name` on the node this hax process
    runs at.

    The local node name comes from a non-empty HARE_HAX_NODE_NAME
    environment variable or, failing that, from the Consul agent's
    self-description. Consul and transport errors raised while
    resolving the node name are re-raised as HAConsistencyException.
    """
    try:
        node = os.environ.get('HARE_HAX_NODE_NAME')
        if not node:
            node = self.cns.agent.self()['Config']['NodeName']
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from err
    return self._service_by_name(node, name)
def get_local_nodename(self) -> str:
    """
    Returns the logical name of the current node, i.e. the name that
    Consul itself uses whenever it references this node.

    A non-empty HARE_HAX_NODE_NAME environment variable wins; otherwise
    the name is taken from the Consul agent's self-description
    (Config -> NodeName). Consul and transport errors are re-raised as
    HAConsistencyException.
    """
    try:
        configured = os.getenv('HARE_HAX_NODE_NAME')
        if not configured:
            agent_info = self.cns.agent.self()
            return agent_info['Config']['NodeName']
        return configured
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from err
def get_service_health(self, node: str, svc_id: int) -> ServiceHealth:
    """
    Returns current status of a Consul service identified by the given
    svc_id for a given node.

    Outcomes:
      * FAILED  - Consul returns no health records for the node, or the
                  first record's status is not 'passing';
      * OK      - the service record is 'passing';
      * UNKNOWN - no record matches svc_id, or the record is 'warning'
                  while the status from `self.get_svc_status()` is still
                  STARTING/STARTED;
      * FAILED  - any other status of the service record.

    If several records match svc_id, the last one determines the result.
    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        node_data: List[Dict[str, Any]] = self.cns.health.node(node)[1]
        if not node_data:
            return ServiceHealth.FAILED
        # The first record is used as the status of the node itself.
        node_status = str(node_data[0]['Status'])
        if node_status != 'passing':
            return ServiceHealth.FAILED

        wanted_id = str(svc_id)
        status = ServiceHealth.UNKNOWN
        for item in node_data:
            if item['ServiceID'] == wanted_id:
                if item['Status'] == 'passing':
                    status = ServiceHealth.OK
                elif item['Status'] == 'warning':
                    fid = create_process_fid(svc_id)
                    svc_consul_status = self.get_svc_status(fid)
                    if (svc_consul_status in ('M0_CONF_HA_PROCESS_STARTING',
                                              'M0_CONF_HA_PROCESS_STARTED')):
                        # We are returning unknown status for the service
                        # as we cannot confirm the actual status of the
                        # service from the available data. The node status
                        # is 'passing' but service status is 'warning' and
                        # the service status in Consul is either
                        # M0_CONF_HA_PROCESS_STARTING or
                        # M0_CONF_HA_PROCESS_STARTED. So we are not sure
                        # because it is possible that the service has
                        # failed but consul status is not updated yet.
                        # So we return unknown now, the caller can retry
                        # after some time and once all the 3 statuses are
                        # either passing or at least the node status itself
                        # is failed, we will return OK or FAILED
                        # accordingly.
                        status = ServiceHealth.UNKNOWN
                    else:
                        status = ServiceHealth.FAILED
                else:
                    status = ServiceHealth.FAILED
        return status
    except (ConsulException, HTTPError, RequestException) as e:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from e
def _configure(self) -> None:
    """
    Create the web application, register all the HTTP routes on it and
    store the result in `self.app`.

    Any exception raised on the way is re-raised as
    HAConsistencyException.
    """
    try:
        # We can't use broad 0.0.0.0 IP address to make it possible to run
        # multiple hax instances at the same machine (i.e. in failover
        # situation).
        # Instead, every hax will use a private IP only.
        node_address = self._get_my_hostname()

        # Note that bq-delivered mechanism must use a unique
        # node name rather than broad '0.0.0.0' that doesn't
        # identify the node from outside.
        delivered_offsets = OffsetStorage(node_address,
                                          key_prefix='bq-delivered',
                                          kv=self.consul_util.kv)
        inbox_filter = InboxFilter(delivered_offsets)

        conf_obj = ConfObjUtil(self.consul_util)
        planner, herald, consul_util = (self.planner, self.herald,
                                        self.consul_util)
        app = self._create_server()
        routes = [
            # Read-only status endpoints.
            web.get('/', hello_reply),
            web.get('/v1/cluster/status', hctl_stat),
            web.get('/v1/cluster/status/bytecount', bytecount_stat),
            web.get('/v1/cluster/fetch-fids', hctl_fetch_fids),
            # Incoming notifications.
            web.post('/', process_ha_states(planner, consul_util)),
            web.post(
                '/watcher/bq',
                process_bq_update(inbox_filter,
                                  BQProcessor(planner, herald, conf_obj))),
            web.post('/watcher/processes', process_state_update(planner)),
            # SNS operations and their status.
            web.post('/api/v1/sns/{operation}',
                     process_sns_operation(planner)),
            web.get('/api/v1/sns/repair-status',
                    get_sns_status(planner, SnsRepairStatus)),
            web.get('/api/v1/sns/rebalance-status',
                    get_sns_status(planner, SnsRebalanceStatus)),
        ]
        app.add_routes(routes)
        self.app = app
    except Exception as err:
        raise HAConsistencyException('Failed to configure hax') from err
def is_node_alive(self, node: str) -> bool:
    """
    Checks via Consul Members API whether the given node is alive.

    The first member whose 'Name' equals `node` decides the result: the
    node counts as alive when that member's 'Status' equals 1. A node
    that is not listed among the members at all is reported as alive.

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        # Returns data of the following kind:
        # [{
        #   'Name': 'localhost',
        #   'Addr': '192.168.6.214',
        #   'Port': 8301,
        #   'Tags': {
        #     'acls': '0',
        #     'bootstrap': '1',
        #     'build': '1.7.8:9a5a1218',
        #     'dc': 'dc1',
        #     'id': 'dd8a91f6-ca32-30e0-983c-8f309d653045',
        #     'port': '8300',
        #     'raft_vsn': '3',
        #     'role': 'consul',
        #     'segment': '',
        #     'vsn': '2',
        #     'vsn_max': '3',
        #     'vsn_min': '2',
        #     'wan_join_port': '8302'
        #   },
        #   'Status': 1,
        #   'ProtocolMin': 1,
        #   'ProtocolMax': 5,
        #   'ProtocolCur': 2,
        #   'DelegateMin': 2,
        #   'DelegateMax': 5,
        #   'DelegateCur': 4
        # }]
        members_data = self.cns.agent.members()
        LOG.log(TRACE, "members: %s", members_data)
        listed = next((m for m in members_data if m['Name'] == node), None)
        if listed is None:
            return True
        return int(listed['Status']) == 1
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException(
            'Failed to members data from Consul') from err
def run(
    self,
    threads_to_wait: List[StoppableThread] = [],
    port=8008,
):
    """
    Configure and run the web server, then shut everything down.

    After `self._start(port)` returns or fails, the hax state is marked
    as stopping, the planner is shut down and every thread from
    `threads_to_wait` is first asked to stop and then joined.

    A failure of `self._start()` is re-raised as HAConsistencyException
    (after the shutdown sequence has run). Errors from
    `self._configure()` propagate before the shutdown sequence is armed.
    """
    # NOTE: the list default is only iterated here, never mutated.
    self._configure()
    try:
        self._start(port)
        LOG.debug('Server stopped normally')
    except Exception as err:
        raise HAConsistencyException(
            'Failed to start web server, trying again...') from err
    finally:
        self.hax_state.set_stopping()
        LOG.debug('Stopping the threads')
        self.planner.shutdown()
        # Signal all the threads first and only then wait for them.
        for worker in threads_to_wait:
            worker.stop()
        for worker in threads_to_wait:
            worker.join()

        LOG.info('The http server has stopped')
def kv_delete_in_transaction(self, tx_payload: List[KeyDelete]) -> bool:
    """
    Delete all the given keys through a single call to
    `self.cns.txn.put()`.

    An entry with a truthy `recurse` is sent with the 'delete-tree'
    verb, any other entry with the 'delete' verb.

    Returns True when the call succeeds and False when the client raises
    ClientError (i.e. the transaction was rejected). Other Consul or
    transport errors are re-raised as HAConsistencyException with the
    original exception attached as the cause.
    """
    def to_payload(v: KeyDelete) -> Dict[str, Any]:
        return {
            'KV': {
                'Key': v.name,
                'Verb': 'delete-tree' if v.recurse else 'delete'
            }
        }

    try:
        self.cns.txn.put([to_payload(i) for i in tx_payload])
        return True
    except ClientError:
        # If a transaction fails, Consul returns HTTP 409 with the
        # JSON payload describing the reason why the transaction
        # was rejected.
        # The library transforms HTTP 409 into generic ClientException.
        # Unfortunately, we can't easily extract the payload from it.
        return False
    except (ConsulException, HTTPError, RequestException) as e:
        raise HAConsistencyException('Failed to delete key(s)'
                                     f' from KV, error: {e}') from e
def get_service_health(self, service_name: str, node: str,
                       svc_id: int) -> ServiceHealth:
    """
    Return the health of the service identified by both `service_name`
    and `svc_id` among the health records Consul reports for `node`.

    Outcomes:
      * UNKNOWN - no record matches, or the matching record is neither
                  'passing' nor 'warning';
      * OK      - the record is 'passing', or it is 'warning' while the
                  status from `self.get_svc_status()` is
                  STARTING/STARTED;
      * FAILED  - the record is 'warning' with any other such status.

    If several records match, the last one that sets a status wins.
    Consul and transport errors are re-raised as HAConsistencyException.
    """
    starting_or_started = ('M0_CONF_HA_PROCESS_STARTING',
                           'M0_CONF_HA_PROCESS_STARTED')
    try:
        node_data: List[Dict[str, Any]] = self.cns.health.node(node)[1]
        LOG.debug('Node Data: %s', node_data)
        status = ServiceHealth.UNKNOWN
        for record in node_data:
            if (record['ServiceName'] != service_name
                    or record['ServiceID'] != str(svc_id)):
                continue
            reported = record['Status']
            if reported == 'passing':
                status = ServiceHealth.OK
            elif reported == 'warning':
                kv_state = self.get_svc_status(create_process_fid(svc_id))
                if kv_state in starting_or_started:
                    status = ServiceHealth.OK
                else:
                    status = ServiceHealth.FAILED
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from err
    return status
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    """
    Re-validate the process-related HA states against the state kept in
    Consul and return the list of states that still have to be
    delivered.

    States whose fid is not a process fid are passed through unchanged.
    For a process state the current health is re-read through
    `self.consul`; depending on it the process status stored in Consul
    (and/or the node-local copy of it) is updated and a new HAState
    carrying the current health is added to the result.

    `planner` is not used by this method's body.

    Any exception raised on the way is re-raised as
    HAConsistencyException.

    NOTE(review): the nesting of the conditionals below was
    reconstructed from a flattened copy of the source - confirm it
    against the original file.
    """
    new_ha_states: List[HAState] = []
    # Process status event to persist for every known health value.
    # Health values missing from this map produce no status update.
    proc_Health_to_status = {
        ObjHealth.OFFLINE: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ObjHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
        ObjHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
    }
    try:
        for state in ha_states:
            if state.fid.container == ObjT.PROCESS.value:
                current_status = self.consul.get_process_current_status(
                    state.status, state.fid)
                # Nothing can be decided for an unknown health; drop
                # the state.
                if current_status == ObjHealth.UNKNOWN:
                    continue
                proc_status_remote = self.consul.get_process_status(
                    state.fid)
                proc_status: Any = None
                # MKFS states are updated by the node corresponding to a
                # given process. So we ignore notifications for mkfs
                # processes.
                if proc_status_remote.proc_type in (
                        'Unknown',
                        m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name):
                    continue
                proc_type = m0HaProcessType.str_to_Enum(
                    proc_status_remote.proc_type)
                # Following cases are handled here,
                # 1. Delayed consul service failure notification:
                # -  We re-confirm the current process state before
                #    notifying the process as offline/failed.
                # 2. Consul reported process failure, current process
                #    state is offline (this means the corresponding node
                #    is online, i.e. hax and consul are online):
                # -  So process's status in consul kv might not be updated
                #    as the process died abruptly. In this case we handle
                #    it as local process failure, update the process
                #    status in consul kv and notify motr.
                # 3. Consul reported process failure, current process
                #    state is failed (this means the node corresponding to
                #    the process also failed, i.e. hax and consul are no
                #    more):
                # -  Process's status in consul kv might not be updated as
                #    the node went down abruptly. In this case, when
                #    consul reports failure for corresponding node
                #    processes, Hare verifies the node status and
                #    accordingly Hare RC node processes the failures.
                #    This may take some time if Consul server loses
                #    the quorum and takes time to sync up the state.
                # 4. Consul reported process failure, probably due to mkfs
                #    process completion (m0tr mkfs and m0ds share the same
                #    fid), which got delayed and the process is starting
                #    now:
                # -  Hare checks the current status of the process but it
                #    is possible that the process state is not synced up
                #    yet within the quorum. In this case, we continue
                #    processing the failure event but once the process
                #    starts successfully Hare will update and notify the
                #    process state eventually.
                # 5. For some reason Consul may report a process as
                #    offline and subsequently report it as online, this
                #    may happen due to intermittent monitor failure:
                # -  Hare must handle the change in process states
                #    accordingly in-order to maintain the eventual
                #    consistency of the cluster state.
                proc_status = proc_Health_to_status.get(current_status)
                LOG.debug('current_status: %s proc_status_remote: %s',
                          current_status, proc_status_remote.proc_status)
                if proc_status is not None:
                    LOG.debug('proc_status: %s', proc_status.name)
                    # The status stored in Consul differs from the one
                    # derived from the current health.
                    if proc_status_remote.proc_status != proc_status.name:
                        if (self.consul.am_i_rc() or
                                self.consul.is_proc_local(state.fid)):
                            # Probably process node failed, in such a
                            # case, only RC must be allowed to update
                            # the process's persistent state.
                            # Or, if the node's alive then allow the node
                            # to update the local process's state.
                            self.consul.update_process_status(
                                ConfHaProcess(chp_event=proc_status,
                                              chp_type=proc_type,
                                              chp_pid=0,
                                              fid=state.fid))
                        # RC or not RC, i.e. even without persistent state
                        # update, it is important that the notification to
                        # local motr processes must still be sent.
                        new_ha_states.append(
                            HAState(fid=state.fid, status=current_status))
                    if not self.consul.is_proc_local(state.fid):
                        proc_status_local = (
                            self.consul.get_process_status_local(
                                state.fid))
                        # Consul monitors a process every 1 second and
                        # this notification is sent to every node. Thus
                        # to avoid notifying about a process multiple
                        # times about the same status every node
                        # maintains a local copy of the remote process
                        # status, which is checked every time a consul
                        # notification is received and accordingly
                        # the status is notified locally to all the local
                        # motr processes.
                        if (proc_status_local.proc_status !=
                                proc_status.name):
                            self.consul.update_process_status_local(
                                ConfHaProcess(chp_event=proc_status,
                                              chp_type=proc_type,
                                              chp_pid=0,
                                              fid=state.fid))
                            new_ha_states.append(
                                HAState(fid=state.fid,
                                        status=current_status))
                        else:
                            continue
            else:
                # Not a process: pass the state through untouched.
                new_ha_states.append(state)
    except Exception as e:
        raise HAConsistencyException('failed to process ha states') from e
    return new_ha_states
def _get_service_health(self, service_name: str) -> List[Dict[str, Any]]:
    """
    Return the health records that Consul reports for the service with
    the given name (element [1] of the client's reply).

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        reply = self.cns.health.service(service_name)
        return reply[1]
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from err
def _catalog_service_get(self, svc_name: str) -> List[Dict[str, Any]]:
    """
    Return the catalog records of the service registered in Consul
    under the given name (element [1] of the client's reply).

    Consul and transport errors are re-raised as HAConsistencyException.
    """
    try:
        reply = self.cns.catalog.service(service=svc_name)
        return reply[1]
    except (ConsulException, HTTPError, RequestException) as err:
        raise HAConsistencyException(
            'Could not access Consul Catalog') from err
def get_service_health(self, node: str, svc_id: int) -> ServiceHealth:
    """
    Returns current status of a Consul service identified by the given
    svc_id for a given node.

    FAILED is returned right away when Consul reports no health records
    for the node, when the first record is not 'passing', when
    `self.is_node_alive()` is false, or when the service record itself
    is 'critical'. UNKNOWN is returned when no record matches svc_id.
    Otherwise the result is looked up in the table below by the pair
    (Consul check status, process status from
    `self.get_process_status()`), picking the "local" or the "remote"
    column depending on whether the process runs on this node.

    Consul and transport errors are re-raised as HAConsistencyException.
    A status pair missing from the table results in KeyError.
    """
    # Maps consul service status and motr process status to the
    # corresponding ha status to be notified.
    # Respective values are for local and remote nodes.
    # {(consul_svc_status, motr_process_status):(local_node_ha_status,
    #                                            remote_node_ha_status)}
    cur_consul_status = MotrConsulProcStatus
    local_remote_health_ret = MotrProcStatusLocalRemote
    svc_to_motr_status_map = {
        cur_consul_status('passing', 'M0_CONF_HA_PROCESS_STARTING'):
        local_remote_health_ret(ServiceHealth.OFFLINE, ServiceHealth.OK),
        cur_consul_status('passing', 'M0_CONF_HA_PROCESS_STOPPING'):
        local_remote_health_ret(ServiceHealth.OFFLINE,
                                ServiceHealth.UNKNOWN),
        cur_consul_status('passing', 'M0_CONF_HA_PROCESS_STARTED'):
        local_remote_health_ret(ServiceHealth.OK, ServiceHealth.OK),
        cur_consul_status('passing', 'M0_CONF_HA_PROCESS_STOPPED'):
        local_remote_health_ret(ServiceHealth.OFFLINE,
                                ServiceHealth.UNKNOWN),
        cur_consul_status('passing', 'Unknown'):
        local_remote_health_ret(ServiceHealth.UNKNOWN,
                                ServiceHealth.UNKNOWN),
        cur_consul_status('warning', 'M0_CONF_HA_PROCESS_STOPPING'):
        local_remote_health_ret(ServiceHealth.OFFLINE,
                                ServiceHealth.STOPPED),
        cur_consul_status('warning', 'M0_CONF_HA_PROCESS_STARTED'):
        local_remote_health_ret(ServiceHealth.OFFLINE,
                                ServiceHealth.FAILED),
        cur_consul_status('warning', 'M0_CONF_HA_PROCESS_STOPPED'):
        local_remote_health_ret(ServiceHealth.STOPPED,
                                ServiceHealth.STOPPED),
        cur_consul_status('warning', 'M0_CONF_HA_PROCESS_STARTING'):
        local_remote_health_ret(ServiceHealth.OFFLINE,
                                ServiceHealth.OFFLINE),
        cur_consul_status('warning', 'Unknown'):
        local_remote_health_ret(ServiceHealth.UNKNOWN,
                                ServiceHealth.UNKNOWN)
    }
    try:
        node_data: List[Dict[str, Any]] = self.cns.health.node(node)[1]
        if not node_data:
            return ServiceHealth.FAILED
        # The first record is used as the status of the node itself.
        node_status = str(node_data[0]['Status'])
        if node_status != 'passing' or (not self.is_node_alive(node)):
            return ServiceHealth.FAILED

        wanted_id = str(svc_id)
        status = ServiceHealth.UNKNOWN
        for item in node_data:
            if item['ServiceID'] == wanted_id:
                LOG.debug('item.status %s', item['Status'])
                if item['Status'] == 'critical':
                    return ServiceHealth.FAILED
                pfid = create_process_fid(svc_id)
                cns_status = self.get_process_status(pfid)
                svc_health = svc_to_motr_status_map[MotrConsulProcStatus(
                    item['Status'], cns_status.proc_status)]
                LOG.debug('consul.status %s svc_health: %s', cns_status,
                          svc_health)
                local_node = self.get_local_nodename()
                proc_node = self.get_process_node(pfid)
                if proc_node == local_node:
                    status = svc_health.motr_proc_status_local
                else:
                    status = svc_health.motr_proc_status_remote
                # A "failed" mkfs process is reported as stopped.
                if (status == ServiceHealth.FAILED
                        and cns_status.proc_type ==
                        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name)):
                    status = ServiceHealth.STOPPED
                # This situation is not expected but we handle
                # the same. Hax may end up here if the process has stopped
                # already and its current status is also reported as
                # 'unknown' by Consul. Hax will do nothing in this case
                # and will report OFFLINE for that process.
                if (item['Status'] == 'warning'
                        and cns_status.proc_status == 'Unknown'
                        and status == ServiceHealth.UNKNOWN):
                    status = ServiceHealth.OFFLINE
        return status
    except (ConsulException, HTTPError, RequestException) as e:
        raise HAConsistencyException('Failed to communicate '
                                     'to Consul Agent') from e