def ws_request(self, url, method, retry_count=MAX_RETRIES, post_data=""):
    """Make a webservice request, retrying on recoverable failures.

    The request is attempted at most ``retry_count`` times.  Two recovery
    actions are each tried at most once per call:

    * re-login, on HTTP 403 or when an HTTP 200 reply carries a CLI API
      failure whose response text contains the invalid-session marker;
    * switching to the alternate management controller, on timeout,
      connection-refused or no-route-to-host statuses.

    Side effects: updates ``self.ws_response_status`` with the last HTTP
    status seen, resets ``self.mc_timeout_counter`` on HTTP 200 and
    increments it when the alternate controller is selected.

    Args:
        url: Request URL.  After a controller switch it is rebuilt from
            the part of the URL following '/api'.
        method: HTTP method, passed through to the webservice helper.
        retry_count: Maximum number of attempts.
        post_data: Request body, passed through to the webservice helper.

    Returns:
        The last response obtained, or None if no attempt produced one.
    """
    response = None
    retried_login = False
    tried_alt_ip = False

    while retry_count:
        if tried_alt_ip:
            # Extract show fru name from old URL to update alternative IP.
            url = self.build_url(url[url.index('/api/'):].replace('/api', ''))

        response = self.ws.ws_request(method, url, self.common_reqheaders,
                                      post_data, self.WEBSERVICE_TIMEOUT)
        retry_count -= 1

        if response is None:
            continue

        status_code = response.status_code
        self.ws_response_status = status_code
        need_relogin = False

        if status_code == self.ws.HTTP_OK:
            self.mc_timeout_counter = 0
            try:
                jresponse = json.loads(response.content)

                # TODO: Need a way to check return-code 2 in more optimal
                # way if possible, currently being checked for all http 200
                # responses
                if jresponse:
                    api_status = jresponse['status'][0]
                    if api_status['return-code'] == self.CLIAPI_RESP_FAILURE:
                        # if call fails with invalid session key request
                        # seen in G280 fw version
                        if self.CLIAPI_RESP_INVSESSION in api_status['response']:
                            need_relogin = True
            except ValueError as badjson:
                logger.error("%s returned mal-formed json:\n%s" % (url, badjson))

        # http 403 forbidden request, login & retry.
        # This is deliberately an independent `if` rather than an `elif` of
        # the HTTP 200 check above: an HTTP 200 reply can still report an
        # invalid session, and that case must reach the re-login path too.
        if (status_code == self.ws.HTTP_FORBIDDEN or need_relogin) \
                and not retried_login:
            logger.info("%s failed, retrying after login " % (url))
            self.login()
            retried_login = True
            continue

        if (status_code == self.ws.HTTP_TIMEOUT or
                status_code == self.ws.HTTP_CONN_REFUSED or
                status_code == self.ws.HTTP_NO_ROUTE_TO_HOST) \
                and not tried_alt_ip:
            self.switch_to_alt_mc()
            tried_alt_ip = True
            self.mc_timeout_counter += 1
            continue

        break

    return response
def __init__(self):
    """Load enclosure settings from configuration and log in to the MC.

    Reads controller addresses and ports, credentials, the management
    interface name, the polling frequency and the site/rack/node/cluster
    identifiers through ``self.conf_reader``, then decrypts the controller
    password with a key generated from the cluster id.  If the configured
    management interface is not in ``self.realstor_supported_interfaces``
    an error is logged and the login step is skipped.
    """
    super(RealStorEnclosure, self).__init__()

    def from_common(section, key_name, fallback):
        # The effective config key is resolved through COMMON_CONFIGS.
        return self.conf_reader._get_value_with_default(
            section, COMMON_CONFIGS.get(section).get(key_name), fallback)

    def from_enclosure(key_name, fallback):
        return from_common(self.encl_conf, key_name, fallback)

    def from_sysinfo(key_name):
        return from_common(self.SYSTEM_INFORMATION, key_name, '001')

    # WS Request common headers
    self.ws = WebServices()
    self.common_reqheaders = {}

    self.encl_conf = self.CONF_SECTION_MC

    self.system_persistent_cache = self.encl_cache + "system/"
    self.faults_persistent_cache = \
        self.system_persistent_cache + "faults.json"

    # Read in mc value from configuration file
    self.mc1 = from_enclosure("primary_controller_ip", self.DEFAULT_MC_IP)
    self.mc1_wsport = from_enclosure("primary_controller_port", '')
    self.mc2 = from_enclosure("secondary_controller_ip", self.DEFAULT_MC_IP)
    self.mc2_wsport = from_enclosure("secondary_controller_port", '')

    # The primary controller is the one addressed initially.
    self.active_ip = self.mc1
    self.active_wsport = self.mc1_wsport

    self.user = from_enclosure("user", self.DEFAULT_USER)
    self.passwd = from_enclosure("password", self.DEFAULT_PASSWD)
    self.mc_interface = from_enclosure("mgmt_interface", "cliapi")

    # Polling frequency is read with a literal key, not via COMMON_CONFIGS.
    poll_value = self.conf_reader._get_value_with_default(
        self.CONF_REALSTORSENSORS, "polling_frequency", self.DEFAULT_POLL)
    self.pollfreq = int(poll_value)

    self.site_id = from_sysinfo(self.SITE_ID)
    self.rack_id = from_sysinfo(self.RACK_ID)
    self.node_id = from_sysinfo(self.NODE_ID)
    # Need to keep cluster_id string here to generate decryption key
    self.cluster_id = from_sysinfo(self.CLUSTER_ID)

    # Decrypt MC Password
    decryption_key = encryptor.gen_key(
        self.cluster_id, ServiceTypes.STORAGE_ENCLOSURE.value)
    self.passwd = encryptor.decrypt(
        decryption_key, self.passwd.encode('ascii'), "RealStoreEncl")

    if self.mc_interface not in self.realstor_supported_interfaces:
        logger.error("Unspported Realstor interface configured,"
                     " monitoring and alerts generation may hamper")
        return

    # login to mc to get session key, required for querying resources
    # periodically
    self.login()
def run(self):
    """Monitor the configured services until the thread is stopped.

    First tries to register every service in ``self.services_to_monitor``
    for the 'PropertiesChanged' signal; services that register are removed
    from that list, the others get an alert raised.  Then loops while
    ``self.is_running()``: each pass dispatches pending GLib events and
    sleeps ``self.thread_sleep``; every ``self.polling_frequency`` it
    retries registration of the remaining services and checks the services
    that are in a not-active state.

    Raises:
        ThreadException: wrapping any GLib, DBus or other error that
            escapes the loop.
    """
    logger.info(f"Monitoring Services : {self.services_to_monitor}")

    try:
        # Register all the services to signal of 'PropertiesChanged' and
        # raise an alert if some service is not active on initially or if
        # Unit is not found for the service.  Iterate over a copy because
        # the monitored list is modified inside the loop.
        for svc in copy.deepcopy(self.services_to_monitor):
            failure = self.connect_to_prop_changed_signal(svc)
            if not failure:
                self.services_to_monitor.remove(svc)
                continue
            self.raise_alert(svc, "N/A", "N/A", "N/A", "N/A",
                             "N/A", "N/A", 0)
            logger.error(
                f"{svc} is not active initially. \n Error {failure}")

        logger.debug(f"failed_services : {self.failed_services}")
        logger.debug(f"services_to_monitor : {self.services_to_monitor}")

        # Retrieve the main loop which will be called in the run method
        self._loop = GLib.MainLoop()

        # Initialize the gobject threads and get its context
        GLib.threads_init()
        loop_context = self._loop.get_context()

        next_list_check = self.current_time() + self.polling_frequency

        # Every 'thread_sleep' seconds process pending property-change
        # events; every 'polling_frequency' seconds revisit services that
        # are unregistered or in an intermediate state.
        while self.is_running():
            # Dispatch events for registered services (on_prop_changed()).
            loop_context.iteration(False)
            time.sleep(self.thread_sleep)

            if next_list_check > self.current_time():
                continue
            next_list_check = self.current_time() + self.polling_frequency

            # Try to bind the enabled services on the node to the signal
            # whose Unit was earlier not found.  Once registered, drop the
            # service from the local list so it is not registered again.
            for svc in copy.deepcopy(self.services_to_monitor):
                if not self.connect_to_prop_changed_signal(svc):
                    self.services_to_monitor.remove(svc)

            # Check for services in intermediate state (not active)
            self.check_notactive_services()

        logger.info("ServiceMonitor gracefully breaking out "
                    "of dbus Loop, not restarting.")
    except GLib.Error as err:
        raise ThreadException(
            self.SENSOR_NAME,
            "Ungrecefully breaking out of GLib.MainLoop() with error: %s"
            % err)
    except DBusException as err:
        raise ThreadException(
            self.SENSOR_NAME,
            "Ungracefully breaking out of dbus loop with error: %s" % err)
    except Exception as err:
        raise ThreadException(
            self.SENSOR_NAME,
            "Ungracefully breaking out of ServiceMonitor:run() "
            "with error: %s" % err)
def _put_enclosure_action(self, ctrl_action, ctrl_type, resource, enclosure_request):
    """Send a controller shutdown/restart command and describe the outcome.

    The command is built from the action, the controller type (restart
    only) and the controller named by ``resource`` ("controller_a...",
    "controller_b..." or "*" for both), e.g. ``shutdown a`` or
    ``restart sc both``, and is issued as a GET below
    ``self.rssencl.URI_CLIAPI_BASE`` with spaces replaced by '/'.

    Args:
        ctrl_action: "shutdown" or "restart".
        ctrl_type: Controller type for restart requests ('sc' or 'mc').
        resource: Target controller resource string.
        enclosure_request: Original request string, used in the error text.

    Returns:
        dict with 'message', 'description', 'command', 'alert_type',
        'severity' and 'resource_type' for the performed action.

    Raises:
        Exception: if the resource does not fit the requested action.
        KeyError: if the action/type pair has no response template.
    """
    # Work out which controller(s) the request targets.
    known_target = True
    if resource.startswith(("controller_a", "controller_b")):
        ctrl_name = resource.split("controller_")[1].split(':')[0]
    elif resource == "*":
        ctrl_name = "both"
    else:
        ctrl_name = ""
        known_target = False

    # Shutdown takes no controller type; every other action does.
    if ctrl_action == "shutdown":
        action_type = ctrl_action
    else:
        action_type = f"{ctrl_action} {ctrl_type}"
    ctrl_cmd = f"{action_type} {ctrl_name}"

    # Validate that an appropriate controller is passed with the command.
    # Example : shutdown <a|b|both> or restart sc <a|b|both>
    restart_without_type = (ctrl_action == "restart"
                            and not resource.endswith(('sc', 'mc', "*")))
    shutdown_with_type = (ctrl_action == "shutdown"
                          and resource.endswith(('sc', 'mc')))
    bad_controller = not ctrl_cmd.endswith((' a', ' b', ' both'))

    if (not known_target or restart_without_type or shutdown_with_type
            or bad_controller):
        # Invalid resource 'shutdown abc' for an
        # 'ENCL: enclosure:fru:controller:shutdown' actuator request
        err_msg = "Invalid resource '{}' for an '{}' actuator request".format(
            resource, enclosure_request)
        logger.error(err_msg)
        raise Exception(err_msg)

    encl_response = self._get_encl_response(
        self.rssencl.URI_CLIAPI_BASE + ctrl_cmd.replace(' ', '/'),
        self.rssencl.ws.HTTP_GET)

    if encl_response == {}:
        # The control request not performed successfully.
        # Got HTTP Status != 200 or return_code != 0.
        severity = "warning"
        message = "request failed. Please try again...!"
    else:
        severity = "informational"
        message = "request performed successfully."

    # Per action: (message prefix, description, alert type, resource type).
    templates = {
        'shutdown': (
            'Shutdown',
            'Shuts down the Storage Controller in a controller '
            'module. This ensures that a proper failover sequence is used, '
            'which includes stopping all I/O operations and writing any '
            'data in write cache to disk.',
            'control:shutdown',
            'enclosure:fru:controller',
        ),
        'restart sc': (
            'Restart / Start Storage Controller',
            'Restarts the Storage Controller in a controller '
            'module. When you restart a Storage Controller, it attempts '
            'to shut down with a proper failover sequence, which includes '
            'stopping all I/O operations and flushing the write cache to '
            'disk, and then the Storage Controller restarts. Restarting a '
            'Storage Controller restarts the corresponding Management '
            'Controller.',
            'control:restart',
            'enclosure:fru:controller:sc',
        ),
        'restart mc': (
            'Restart / Start Management Controller',
            'Restarts the Management Controller in a '
            'controller module. When you restart a Management Controller,'
            ' communication with it is lost until it successfully '
            'restarts. If the restart fails, the partner Management '
            'Controller remains active with full ownership of operations '
            'and configuration information.',
            'control:restart',
            'enclosure:fru:controller:mc',
        ),
    }
    title, description, alert_type, resource_type = templates[action_type]

    # Some of this info is only kept so that "make_response" can decide
    # certain fields; it removes what is not required further.
    return {
        'message': '%s %s' % (title, message),
        'description': description,
        'command': ctrl_cmd,
        'alert_type': alert_type,
        'severity': severity,
        'resource_type': resource_type,
    }
def perform_request(self, jsonMsg):
    """Perform the RealStor enclosure request described by ``jsonMsg``.

    The request string is read from
    ``actuator_request_type.storage_enclosure.enclosure_request`` and split
    on ':' into request type, component, component type and, for
    controller shutdown/restart requests, the controller type and action.
    Control actions, FRU, sensor, interface and system-info requests are
    dispatched to their handlers.

    @return: The response from performing the request; "N/A" when the
             request is not handled, or {"Error": <exception>} when
             handling raised.
    """
    response = "N/A"
    try:
        encl_section = jsonMsg.get("actuator_request_type").get(
            "storage_enclosure")
        enclosure_request = encl_section.get("enclosure_request")
        parts = [piece.strip() for piece in enclosure_request.split(":")]

        ctrl_action = ""
        ctrl_type = ""
        last_field = parts[-1]
        if last_field == "shutdown":
            # "ENCL: enclosure:fru:controller:shutdown"
            request_type, _, component, component_type, ctrl_action = parts
        elif last_field == "restart":
            # "ENCL: enclosure:fru:controller:sc:restart"
            # "ENCL: enclosure:fru:controller:mc:restart"
            (request_type, _, component, component_type,
             ctrl_type, ctrl_action) = parts
        else:
            # "ENCL: enclosure:fru:controller"
            request_type, _, component, component_type = parts

        resource = encl_section.get("resource")

        if ctrl_action in self.CTRL_ACTION_LST:
            action_result = self._put_enclosure_action(
                ctrl_action, ctrl_type, resource.strip(), enclosure_request)
            response = self.make_response(
                action_result, component, component_type, resource,
                ctrl_action=ctrl_action)
        elif component == "fru":
            fru_handler = self.request_fru_func[request_type][component_type]
            response = self.make_response(
                fru_handler(resource), component, component_type, resource)
        elif component == "sensor":
            sensor_data = self._get_sensor_data(
                sensor_type=component_type, sensor_name=resource)
            response = self.make_response(
                sensor_data, component, component_type, resource)
        elif component == "interface":
            # Taken from the raw (unstripped) request string.
            enclosure_type = enclosure_request.split(":")[2]
            if enclosure_type != ResourceTypes.INTERFACE.value:
                logger.error("Some unsupported interface passed, interface:{}".format(enclosure_type))
            else:
                response = self._handle_ports_request(
                    enclosure_request, resource)
        elif component == "system":
            if component_type != 'info':
                logger.error("Unsupported system request :{}".format(enclosure_request))
            else:
                response = self.make_response(
                    self._get_system_info(), component, component_type,
                    resource)
    except Exception as e:
        logger.exception("Error while getting details for JSON: {}".format(jsonMsg))
        response = {"Error": e}

    return response
def _read_config(self):
    """Configure the RabbitMQ exchange with defaults available.

    Every setting is read from the ``self.RABBITMQPROCESSOR`` section with
    a fallback default, the RabbitMQ password is decrypted using a key
    generated from the cluster id, and IEM routing is logged when a route
    address is configured.  Any exception is logged and not propagated.
    """
    def read(key, fallback):
        # Single lookup in the RabbitMQ processor section.
        return self._conf_reader._get_value_with_default(
            self.RABBITMQPROCESSOR, key, fallback)

    try:
        self._virtual_host = read(self.VIRT_HOST, 'SSPL')

        # Read common RabbitMQ configuration
        self._primary_rabbitmq_host = read(
            self.PRIMARY_RABBITMQ_HOST, 'localhost')

        # Read RabbitMQ configuration for sensor messages
        self._queue_name = read(self.QUEUE_NAME, 'sensor-queue')
        self._exchange_name = read(self.EXCHANGE_NAME, 'sspl-out')
        self._routing_key = read(self.ROUTING_KEY, 'sensor-key')

        # Read RabbitMQ configuration for Ack messages
        self._ack_queue_name = read(self.ACK_QUEUE_NAME, 'sensor-queue')
        self._ack_routing_key = read(self.ACK_ROUTING_KEY, 'sensor-key')

        # Credentials and message-signature settings
        self._username = read(self.USER_NAME, 'sspluser')
        self._password = read(self.PASSWORD, '')
        self._signature_user = read(self.SIGNATURE_USERNAME, 'sspl-ll')
        self._signature_token = read(self.SIGNATURE_TOKEN, 'FAKETOKEN1234')
        self._signature_expires = read(self.SIGNATURE_EXPIRES, "3600")

        # IEM routing settings
        self._iem_route_addr = read(self.IEM_ROUTE_ADDR, '')
        self._iem_route_exchange_name = read(
            self.IEM_ROUTE_EXCHANGE_NAME, 'sspl-in')

        cluster_id_key = COMMON_CONFIGS.get(
            self.SYSTEM_INFORMATION_KEY).get(self.CLUSTER_ID_KEY)
        cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY, cluster_id_key, '')

        # Decrypt RabbitMQ Password
        decryption_key = encryptor.gen_key(
            cluster_id, ServiceTypes.RABBITMQ.value)
        self._password = encryptor.decrypt(
            decryption_key, self._password.encode('ascii'),
            "RabbitMQegressProcessor")

        if self._iem_route_addr != "":
            logger.info(" Routing IEMs to host: %s" % self._iem_route_addr)
            logger.info(" Using IEM exchange: %s" % self._iem_route_exchange_name)
    except Exception as ex:
        logger.error("RabbitMQegressProcessor, _read_config: %r" % ex)
def connect_to_prop_changed_signal(self, service):
    """Bind the service to the 'PropertiesChanged' signal.

    Fetches the service unit, state, substate and pid via
    ``self.get_service_status``, refreshes the local status cache and
    connects ``self.on_prop_changed`` to the unit's 'PropertiesChanged'
    signal, which is triggered whenever the service changes its
    state/substate.  Depending on the current state the service is then
    tracked as not-active, an alert is raised for a newly failed service,
    or an alert is raised for a service that returned to active.

    Returns:
        None on success, or the DBusException that was raised.
    """
    try:
        unit, _, state, substate, pid = self.get_service_status(
            service=service)

        # Previous values come from the local cache when available.
        if service in self.service_status:
            cached = self.service_status[service]
            prev_state = cached['state']
            prev_substate = cached['substate']
            prev_pid = cached['pid']
        else:
            prev_state = prev_substate = prev_pid = "N/A"

        self.update_status_local_cache(service, state, substate, pid)

        unit_iface = Interface(
            unit, dbus_interface='org.freedesktop.systemd1.Manager')
        # `p=unit` binds the unit at definition time for the callback.
        unit_iface.connect_to_signal(
            'PropertiesChanged',
            lambda a, b, c, p=unit: self.on_prop_changed(a, b, c, p),
            dbus_interface=PROPERTIES_IFACE)

        logger.debug(f"{service}({pid}) state is {state}:{substate}")

        if state in ("activating", "reloading", "deactivating"):
            # Intermediate state: remember when it was first seen.
            if service not in self.not_active_services:
                self.not_active_services[service] = \
                    [self.current_time(), state, substate]
        elif state == "active":
            if service in self.failed_services:
                self.raise_alert(service, prev_state, state, prev_substate,
                                 substate, prev_pid, pid, 2)
                self.failed_services.remove(service)
                logger.info(
                    f"{service} returned to good state. state = {state}:{substate}"
                )
            if service in self.not_active_services:
                self.not_active_services.pop(service)
        elif service not in self.failed_services:
            # Any other state: alert once, then track the service as failed.
            self.raise_alert(service, prev_state, state, prev_substate,
                             substate, prev_pid, pid, 0)
            if service in self.not_active_services:
                self.not_active_services.pop(service)
            self.failed_services.append(service)
            logger.error(
                f"{service} is not active initially. state = {state}:{substate}"
            )

        self.update_persistent_cache()
        return None
    except DBusException as err:
        return err
def _generate_host_update(self):
    """Create & transmit a host update message as defined by the sensor
    response json schema.

    Refreshes the node sensor data, normalises
    ``self._host_memory_usage_threshold`` to an int or float (falling back
    to ``DEFAULT_HOST_MEMORY_USAGE_THRESHOLD`` when it cannot be parsed)
    and compares the memory usage percentage against it.  A fault message
    is sent when usage reaches the threshold and no fault is active; a
    fault-resolved message is sent when usage is below the threshold while
    a fault is active.
    """
    sensor = self._node_sensor

    # Notify the node sensor to update its data required for the host_update message
    if not sensor.read_data("host_update", self._get_debug(), self._units):
        logger.error("NodeDataMsgHandler, _generate_host_update was NOT successful.")

    # The threshold may come from config as text; whole numbers become int,
    # everything else is parsed as float.
    threshold_text = str(self._host_memory_usage_threshold)
    try:
        if threshold_text.isdigit():
            parsed_threshold = int(threshold_text)
        else:
            parsed_threshold = float(threshold_text)
    except ValueError:
        logger.warning("Host Memory Alert, Invalid host_memory_usage_threshold value are entered in config.")
        # Fall back to the default host memory usage threshold
        parsed_threshold = self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD
    self._host_memory_usage_threshold = parsed_threshold

    def transmit(alert_type, event):
        # Log the event, build the host update message and hand it over
        # to the egress processor to transmit.
        logger.warning(event)
        host_update_msg = HostUpdateMsg(
            sensor.host_id,
            self._epoch_time,
            sensor.boot_time,
            sensor.up_time,
            sensor.uname,
            self._units,
            self.site_id,
            self.rack_id,
            self.node_id,
            self.cluster_id,
            sensor.total_memory,
            sensor.logged_in_users,
            sensor.process_count,
            sensor.running_process_count,
            alert_type,
            event)

        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            host_update_msg.set_uuid(self._uuid)
        json_msg = host_update_msg.getJson()

        # Transmit it out over rabbitMQ channel
        self.host_sensor_data = json_msg
        self.os_sensor_type["memory_usage"] = self.host_sensor_data
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)

    if sensor.total_memory["percent"] >= self._host_memory_usage_threshold:
        if not self.host_fault:
            self.host_fault = True
            fault_event = "Host memory usage increased to %s, beyond configured threshold of %s" \
                % (sensor.total_memory["percent"], self._host_memory_usage_threshold)
            transmit(self.FAULT, fault_event)

    if (sensor.total_memory["percent"] < self._host_memory_usage_threshold) \
            and (self.host_fault == True):
        fault_resolved_event = "Host memory usage decreased to %s, lesser than configured threshold of %s" \
            % (sensor.total_memory["percent"], self._host_memory_usage_threshold)
        transmit(self.FAULT_RESOLVED, fault_resolved_event)
        self.host_fault = False
def _generate_cpu_data(self):
    """Create & transmit a cpu_data message as defined by the sensor
    response json schema.

    Refreshes the node sensor data, normalises ``self._cpu_usage_threshold``
    to an int or float (falling back to ``DEFAULT_CPU_USAGE_THRESHOLD``
    when it cannot be parsed) and compares the CPU usage against it:

    * usage >= threshold and no fault active: a fault message is sent and
      ``self.cpu_fault`` is set;
    * usage < threshold and a fault active: a fault-resolved message is
      sent and ``self.cpu_fault`` is cleared.

    Usage exactly equal to the threshold counts as a fault only, so a
    fault is never raised and resolved within the same call.
    """
    sensor = self._node_sensor

    # Notify the node sensor to update its data required for the cpu_data message
    if not sensor.read_data("cpu_data", self._get_debug()):
        logger.error("NodeDataMsgHandler, _generate_cpu_data was NOT successful.")

    # The threshold may come from config as text; whole numbers become int,
    # everything else is parsed as float.
    threshold_text = str(self._cpu_usage_threshold)
    try:
        if threshold_text.isdigit():
            parsed_threshold = int(threshold_text)
        else:
            parsed_threshold = float(threshold_text)
    except ValueError:
        logger.warning("CPU Usage Alert, Invalid cpu_usage_threshold value are entered in config.")
        # Assigning default value to _cpu_usage_threshold
        parsed_threshold = self.DEFAULT_CPU_USAGE_THRESHOLD
    self._cpu_usage_threshold = parsed_threshold

    def transmit(alert_type, event):
        # Log the event, build the cpu usage data message and hand it
        # over to the egress processor to transmit.
        logger.warning(event)
        cpu_data_msg = CPUdataMsg(
            sensor.host_id,
            self._epoch_time,
            sensor.csps,
            sensor.idle_time,
            sensor.interrupt_time,
            sensor.iowait_time,
            sensor.nice_time,
            sensor.softirq_time,
            sensor.steal_time,
            sensor.system_time,
            sensor.user_time,
            sensor.cpu_core_data,
            sensor.cpu_usage,
            self.site_id,
            self.rack_id,
            self.node_id,
            self.cluster_id,
            alert_type,
            event)

        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            cpu_data_msg.set_uuid(self._uuid)
        json_msg = cpu_data_msg.getJson()
        self.cpu_sensor_data = json_msg
        self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data

        # Transmit it out over rabbitMQ channel
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)

    if sensor.cpu_usage >= self._cpu_usage_threshold:
        if not self.cpu_fault:
            self.cpu_fault = True
            fault_event = "CPU usage increased to %s, beyond configured threshold of %s" \
                % (sensor.cpu_usage, self._cpu_usage_threshold)
            transmit(self.FAULT, fault_event)

    # Strictly "less than": with "<=" a usage equal to the threshold would
    # resolve the fault that the check above has just raised.
    if sensor.cpu_usage < self._cpu_usage_threshold and self.cpu_fault:
        fault_resolved_event = "CPU usage decreased to %s, lesser than configured threshold of %s" \
            % (sensor.cpu_usage, self._cpu_usage_threshold)
        transmit(self.FAULT_RESOLVED, fault_resolved_event)
        self.cpu_fault = False
def rss_cliapi_poll_disks(self, disk):
    """Retrieve realstor disk info using cli api /show/disks.

    Rebuilds ``self.latest_disks`` (slot -> serial number and health) from
    the enclosure reply and mirrors each drive into the persistent cache
    at ``<disks_prcache>disk_<slot>.json``.  When the cached serial number
    or health of a slot differs from the polled one, the old entry is
    first preserved as ``disk_<slot>.json.prev``.  If the persistent store
    reports an error, ``self.latest_disks`` is reset to empty.  Finally,
    when no in-memory disk cache exists it is built from the persistent
    cache or, failing that, from the latest poll.

    Args:
        disk: ``self.RSS_DISK_GET_ALL`` to poll every disk, or a disk id
            whose part after "0." is numeric to poll a single disk.

    Returns:
        None.  The method returns early, leaving the caches untouched,
        when the request fails, the HTTP status is not OK or the reply is
        not valid JSON.
    """
    # make ws request
    url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKS)

    if disk != self.RSS_DISK_GET_ALL:
        disk_id = disk.partition("0.")[2]
        if disk_id.isdigit():
            url = f"{url}/{disk}"
    url = f"{url}/detail"

    response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

    if not response:
        logger.warn(
            f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable as ws request {url} failed"
        )
        return

    if response.status_code != self.rssencl.ws.HTTP_OK:
        # Failures against the loopback address are not logged.
        if url.find(self.rssencl.ws.LOOPBACK) == -1:
            logger.error(
                f"{self.rssencl.LDR_R1_ENCL}:: http request {url} to poll disks failed with "
                f"err {response.status_code}")
        return

    try:
        jresponse = json.loads(response.content)
    except ValueError as badjson:
        logger.error(f"{url} returned mal-formed json:\n{badjson}")
        # Nothing usable was received; without this return the code below
        # would reference `jresponse` before assignment.
        return

    if jresponse:
        api_resp = self.rssencl.get_api_status(jresponse['status'])

        if api_resp == -1 and \
                response.status_code == self.rssencl.ws.HTTP_OK:
            logger.warn("/show/disks api response unavailable, "
                        "marking success as http code is 200")
            api_resp = 0

        if api_resp == 0:
            drives = jresponse['drives']

            # reset latest drive cache to build new
            self.latest_disks = {}
            self.invalidate_latest_disks_info = False

            for drive in drives:
                slot = drive.get("slot", -1)
                sn = drive.get("serial-number", "NA")
                health = drive.get("health", "NA")

                if slot == -1:
                    continue

                self.latest_disks[slot] = {
                    "serial-number": sn,
                    "health": health
                }

                # dump drive data to persistent cache
                dcache_path = f"{self.disks_prcache}disk_{slot}.json"

                # If drive is replaced, previous drive info needs
                # to be retained in disk_<slot>.json.prev file and
                # then only dump new data to disk_<slot>.json
                path_exists, ret_val = store.exists(dcache_path)
                if path_exists and ret_val == "Success":
                    prevdrive = store.get(dcache_path)

                    if prevdrive is not None:
                        prevsn = prevdrive.get("serial-number", "NA")
                        prevhealth = prevdrive.get("health", "NA")

                        if prevsn != sn or prevhealth != health:
                            # Rename path
                            store.put(store.get(dcache_path),
                                      dcache_path + ".prev")
                            store.delete(dcache_path)

                            store.put(drive, dcache_path)
                elif not path_exists and ret_val == "Success":
                    store.put(drive, dcache_path)
                else:
                    # Invalidate latest disks info if persistence store
                    # error encountered
                    logger.warn(
                        f"store.exists {dcache_path} return value {ret_val}"
                    )
                    self.invalidate_latest_disks_info = True
                    break

            if self.invalidate_latest_disks_info is True:
                # Reset latest disks info
                self.latest_disks = {}

        # If no in-memory cache, build from persistent cache
        if not self.memcache_disks:
            self._rss_build_disk_cache_from_persistent_cache()

        # if no memory cache still
        if not self.memcache_disks:
            self.memcache_disks = self.latest_disks
def _process_msg(self, body):
    """Parse the incoming message and hand it off to the appropriate module.

    The message is decoded from JSON unless it is already a dict, checked
    for username/signature/message fields, authenticated, and validated
    against the actuator or sensor response schema.  Requests, drive
    manager disk status messages and messages whose 'host_id' differs from
    this host's FQDN are dropped; everything else is written to the
    internal test queue.  Any exception is logged and not propagated.
    """
    self._log_debug("_process_msg, body: %s" % body)

    ingress_msg = {}
    try:
        ingress_msg = body if isinstance(body, dict) else json.loads(body)

        # Authenticate message using username and signature fields
        username = ingress_msg.get("username")
        signature = ingress_msg.get("signature")
        message = ingress_msg.get("message")

        assert (username is not None)
        assert (signature is not None)
        assert (message is not None)

        msg_len = len(message) + 1
        verified = SSPL_SEC.sspl_verify_message(
            msg_len, str(message), username, signature)
        if verified != 0:
            logger.error(
                "Authentication failed on message: %s" % ingress_msg)
            return

        # We're acting as HAlon so ignore actuator_requests
        # and sensor_requests messages
        request_keys = ("actuator_request_type", "sensor_request_type")
        if any(message.get(key) is not None for key in request_keys):
            return

        # Get the message type
        msg_type = message.get("actuator_response_type")

        if msg_type is not None:
            # An incoming actuator msg is validated against the
            # Actuator Response schema
            validate(ingress_msg, self._actuator_schema)
        else:
            msg_type = message.get("sensor_response_type")
            validate(ingress_msg, self._sensor_schema)

            # Ignore drive status messages when thread starts up during tests
            if msg_type.get("disk_status_drivemanager") is not None:
                return

        # If the message comes from other SSPL hosts, do not pass that
        # message to internal queue. This happens as SSPL instances are
        # listening to common queues in a RabbitMQ cluster.
        if 'host_id' in msg_type and socket.getfqdn() != msg_type['host_id']:
            return

        # Write to the msg queue so the lettuce tests can
        # retrieve it and examine for accuracy during automated testing
        self._write_internal_msgQ("RabbitMQingressProcessorTests", message)
    except Exception:
        logger.exception(
            "_process_msg unrecognized message: %r" % ingress_msg)
def _rss_check_disks_presence(self):
    """Match cached realstor disk info with latest retrieved disks info.

    Polls all disks, then compares the slots in ``self.memcache_disks``
    (previous poll) with those in ``self.latest_disks`` (this poll).  A
    slot present only in the old cache is a removed disk, a slot present
    only in the new one is an inserted disk, and a slot whose serial
    number changed is treated as both.  A FRU_MISSING alert is raised for
    each removed disk and a FRU_INSERTION alert for each inserted disk,
    after which the in-memory caches are updated with the latest data.

    Returns early, raising no alerts, when either cache is empty or when
    no difference is found.
    """
    self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

    if not self.memcache_disks:
        # The warning is skipped when the active IP is the loopback address.
        if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn(
                "Last polled drives info in-memory cache "
                "unavailable , unable to check drive presence change")
            return

    if not self.latest_disks:
        if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn(
                "Latest polled drives info in-memory cache "
                "unavailable, unable to check drive presence change")
        return

    # keys are disk slot numbers
    removed_disks = set(self.memcache_disks.keys()) - set(
        self.latest_disks.keys())
    inserted_disks = set(self.latest_disks.keys()) - set(
        self.memcache_disks.keys())

    # get populated slots in both caches
    populated = set(self.memcache_disks.keys()) & set(
        self.latest_disks.keys())

    # check for replaced disks: same slot, different serial number counts
    # as one removal plus one insertion
    for slot in populated:
        if self.memcache_disks[slot]['serial-number'] != self.latest_disks[
                slot]['serial-number']:

            if slot not in removed_disks:
                removed_disks.add(slot)

            if slot not in inserted_disks:
                inserted_disks.add(slot)

    # If no difference seen between cached & latest set of disk list,
    # means no disk removal or insertion happened
    if not (removed_disks or inserted_disks):
        return

    # NOTE(review): presumably this event is set elsewhere once the alert
    # has been sent or queued for resending; confirm against the
    # alert-sending code, which is not visible here.
    self._event = Event()
    for slot in removed_disks:
        # get removed drive data from disk cache; the ".prev" file is
        # preferred and the current file is used when it does not exist
        disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"

        path_exists, _ = store.exists(disk_datafile)
        if not path_exists:
            disk_datafile = f"{self.disks_prcache}disk_{slot}.json"

        disk_info = store.get(disk_datafile)

        # raise alert for missing drive
        self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, disk_info)
        # Wait till msg is sent to rabbitmq or added in consul for resending.
        # If timed out, do not update cache
        if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
            store.delete(disk_datafile)
        # Reset the event so the next removed disk waits afresh.
        self._event.clear()
    self._event = None

    for slot in inserted_disks:
        # get inserted drive data from disk cache
        disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json")

        # raise alert for added drive
        self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info)

        # Update health status for inserted disk in memfault cache,
        # to raise fault alert after insertion if inserted disk status is not OK.
        if disk_info["health"] != "OK":
            for id_fault, cached_fault in enumerate(
                    self.rssencl.memcache_faults):
                # fetch disk slot from component_id present in memcache_faults.
                try:
                    component_id = cached_fault["component-id"]
                    if component_id.startswith('Disk 0'):
                        # The slot is the number after the '.' in the
                        # second whitespace-separated token.
                        disk_id = int(cached_fault["component-id"].split()
                                      [1].split('.')[1])
                        if disk_id == slot:
                            self.rssencl.memcache_faults[id_fault][
                                'health'] = "OK"
                except Exception as e:
                    logger.error(f"Error in updating health status for \
                    inserted disk in memfault cache {e}")

    # Update cached disk data after comparison
    self.memcache_disks = self.latest_disks
    self.rssencl.memcache_frus.update({"disks": self.memcache_disks})

    return
def _send_msg(self, iem_components, log_timestamp):
    """Build a JSON message from IEM components and hand it to egress.

    ``iem_components`` is indexed 0..5 as severity, source id, component
    id, module id, event id and description. The message is dropped
    (``None`` is returned, nothing is queued) when the severity or source
    id is unknown, when the timestamp cannot be converted to epoch time,
    or when the component/module/event ids are out of range.

    IEM format is IEC:DESCRIPTION
    IEC format is SEVERITY|SOURCEID|COMPONENTID|MODULEID|EVENTID
    Field lengths ----1---|---1----|------3----|----3---|---4---
    Example IEM -> "IEC: BO1001000001:Error in connecting to controller"
    Actual IEC doesn't contain separator between fields. It is shown
    here just for readability. Each field has fixed length.
    """
    (severity, source_id, component_id,
     module_id, event_id, description) = (
        iem_components[idx] for idx in range(6))

    # Validate the individual fields, cheapest checks first.
    if severity not in self.SEVERITY_LEVELS:
        logger.warn(f"Invalid Severity level: {severity}")
        return

    if source_id not in self.SOURCE_IDS:
        logger.warn(f"Invalid Source ID level: {source_id}")
        return

    event_time = self._get_epoch_time_from_timestamp(log_timestamp)
    if not event_time:
        logger.error(
            "Timestamp is not in required format, discarding the message")
        return

    in_range = self._are_components_in_range(
        _comp_id=component_id, _module_id=module_id, _event_id=event_id)
    if not in_range:
        return

    # Impact and recommendation are only looked up for component-id
    # "005" (sspl); every other component keeps "NA".
    impact = "NA"
    recommendation = "NA"
    if component_id == "005":
        event_code = component_id + module_id + event_id
        impact = Iem().EVENT_STRING[event_code][1]
        recommendation = Iem().EVENT_STRING[event_code][2]

    # Translate the raw severity / source codes; unknown codes pass
    # through unchanged (alert_type becomes None for unknown severity).
    alert_type = iem_severity_to_alert_mapping.get(severity)
    severity = iem_severity_types.get(severity, severity)
    source_id = iem_source_types.get(source_id, source_id)

    # Decode component_id, module_id and event_id
    raw_code = f"{component_id}{module_id}{event_id}"
    component_id, module_id, event_id = self._decode_msg(raw_code)

    info = {
        "site_id": self._site_id,
        "rack_id": self._rack_id,
        "node_id": self._node_id,
        "cluster_id": self._cluster_id,
        "source_id": source_id,
        "component_id": component_id,
        "module_id": module_id,
        "event_id": event_id,
        "severity": severity,
        "description": description,
        "impact": impact,
        "recommendation": recommendation,
        "alert_type": alert_type,
        "event_time": event_time,
        # Everything except the trailing description, joined verbatim.
        "IEC": "".join(iem_components[:-1])
    }
    json_msg = IEMDataMsg(info).getJson()
    self._write_internal_msgQ(EgressProcessor.name(), json_msg)
def initialize(self, conf_reader, msgQlist, product):
    """Initialize configuration reader, internal msg queues and phy state.

    Reads site/rack/node/cluster ids and the sensor settings from the
    configuration, obtains the SAS port utility from ToolFactory, rewrites
    ``self.phy_dir_to_linkrate_mapping`` so every phy maps to
    ``(link_rate, 'up' | 'fault')``, loads the previously stored alert
    info and calls ``check_and_send_alert()``.

    Args:
        conf_reader: passed through to the parent ``initialize``.
        msgQlist: passed through to the parent ``initialize_msgQ``.
        product: not used by this method.

    Returns:
        True in every case, including when the sensor was shut down
        because setting up or reading the phy data failed.
    """
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(SASPortSensor, self).initialize(conf_reader)
    super(SASPortSensor, self).initialize_msgQ(msgQlist)

    self._site_id = Conf.get(
        GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{self.SITE_ID}", 'DC01')
    self._rack_id = Conf.get(
        GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{self.RACK_ID}", 'RC01')
    self._node_id = Conf.get(
        GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{self.NODE_ID}", 'SN01')
    self._cluster_id = Conf.get(
        GLOBAL_CONF, f"{CLUSTER}>{self.CLUSTER_ID}", 'CC01')

    # Get the sas port implementor from configuration
    sas_port_utility = Conf.get(
        SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

    self.polling_interval = int(Conf.get(
        SSPL_CONF,
        f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
        self.DEFAULT_POLLING_INTERVAL))

    # Creating the instance of ToolFactory class
    self.tool_factory = ToolFactory()

    cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.SAS_PORT_SENSOR_DATA = os.path.join(
        cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

    try:
        # Get the instance of the utility using ToolFactory, unless one
        # has already been set on this sensor.
        self._utility_instance = self._utility_instance or \
            self.tool_factory.get_instance(sas_port_utility)
        self._utility_instance.initialize()

        # Call to sas phy directory which will return a dictionary
        # which has phy_name to negotiated link rate mapping
        # Ex: {"phy-0:0": "<12.0, Unknown>"}
        self.phy_dir_to_linkrate_mapping = \
            self._utility_instance.get_phy_negotiated_link_rate()

        # Restructure the mapping in place:
        # {"phy-0:0": ("link_rate", <up/fault>)}
        # A link rate mentioning "Gbit" (any case) is considered up.
        # Only existing keys are reassigned, so iterating .items() while
        # writing is safe.
        for phy, value in self.phy_dir_to_linkrate_mapping.items():
            if 'gbit' in value.lower():
                phy_status = 'up'
                # Increment global phy_link count for UP status
                self.phy_link_count += 1
            else:
                phy_status = 'fault'
            self.phy_dir_to_linkrate_mapping[phy] = (value, phy_status)

        # Get the stored previous alert info
        self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
        self.check_and_send_alert()

    except KeyError:
        # NOTE(review): this handler covers the whole try body, so a
        # KeyError raised while processing phy data is reported with the
        # same "unable to get the instance" message.
        logger.error(
            "Unable to get the instance of {} "
            "Utility. Hence shutting down the sensor".format(
                sas_port_utility))
        self.shutdown()
    except Exception as e:
        # Compare the exception's errno attribute, not the exception
        # object itself; exceptions without an errno fall through to the
        # generic message.
        err_no = getattr(e, "errno", None)
        if err_no == errno.ENOENT:
            logger.error(
                "Problem occured while reading from sas_phy "
                "directory. directory path doesn't exist. Hence "
                "shuting down the sensor")
        elif err_no == errno.EACCES:
            logger.error(
                "Problem occured while reading from sas_phy directory. "
                "Not enough permission to read from the directory. "
                "Hence shuting down the sensor")
        else:
            logger.error(
                "Problem occured while reading from sas_phy directory. "
                "{0}. Hence shuting down the sensor".format(e))
        self.shutdown()

    return True
def get_system_status(self):
    """Retrieve realstor system state info using cli api /show/system.

    The request is skipped when less than ``self.pollfreq`` seconds have
    passed since the last successful poll, because several realstor
    sensors invoke this method. On success ``self.memcache_system`` and
    ``self.latest_faults`` are refreshed, and the in-memory fault cache is
    built (from the persistent cache, else from the latest faults) if it
    does not exist yet.

    Returns:
        None in every case; results are stored on ``self``.
    """
    # poll system would get invoked through multiple realstor sensors
    # with less frequency compared to configured polling frequency
    # adding check to comply with polling frequency
    elapsed = time.time() - self.poll_system_ts
    if elapsed < self.pollfreq:
        logger.warn("/show/system request came in {0} seconds,"
                    "while configured polling frequency is {1} seconds,"
                    "ignoring".format(elapsed, self.pollfreq))
        return

    system = None

    # make ws request
    url = self.build_url(self.URI_CLIAPI_SHOWSYSTEM)
    response = self.ws_request(url, self.ws.HTTP_GET)

    if not response:
        logger.warn("System status unavailable as ws request failed")
        return

    if response.status_code != self.ws.HTTP_OK:
        logger.info("{0}:: http request {1} polling system status failed"
                    " with http err {2}".format(
                        self.LDR_R1_ENCL, url, response.status_code))
        return

    self.poll_system_ts = time.time()

    try:
        jresponse = json.loads(response.content)
    except ValueError as badjson:
        logger.error("%s returned mal-formed json:\n%s" % (url, badjson))
        # Nothing to process; without this return the code below would
        # read an unbound jresponse.
        return

    if not jresponse:
        return

    api_resp = self.get_api_status(jresponse['status'])

    # The http status is known to be 200 at this point, so a missing api
    # response (-1) is treated as success.
    if api_resp == -1:
        logger.warn("/show/system api response unavailable, "
                    "marking success as http code is 200")
        api_resp = 0

    if api_resp == 0:
        system = jresponse['system'][0]
        self.memcache_system = system

    if not system:
        logger.error("poll system failed with err %d" % api_resp)
        return

    # Check if fault exists
    if self.FAULT_KEY not in system:
        logger.debug("{0} Healthy, no faults seen".format(self.LDR_R1_ENCL))
        self.latest_faults = {}
        return

    # Extract system faults
    self.latest_faults = system[self.FAULT_KEY]

    if self.memcache_faults:
        # Reset flag as existing faults processed by now
        # and cached faults are built already
        self.existing_faults = False
        return

    # No in-memory fault cache built yet: build from persistent cache
    # if available.
    logger.info(
        "No cached faults, building from persistent cache {0}".format(
            self.faults_persistent_cache))
    self.memcache_faults = store.get(self.faults_persistent_cache)

    # still if none, build from latest faults & persist
    if not self.memcache_faults:
        logger.info("No persistent faults cache, building "
                    "cache from latest faults")
        self.memcache_faults = self.latest_faults

        # On SSPL boot, run through existing faults as no cache to
        # verify with for new faults
        self.existing_faults = True
        store.put(self.memcache_faults, self.faults_persistent_cache)
def _generate_if_data(self):
    """Create & transmit network interface data messages as defined
    by the sensor response json schema.

    Refreshes the node sensor's interface data, then sends one message
    per cable alert followed by one message per interface alert.
    Interface alerts are skipped for interfaces recorded in
    ``self.interface_fault_state`` (fault already reported together with
    a cable fault).
    """
    # Notify the node sensor to update its data required for the
    # if_data message. A failed refresh is only logged; processing
    # continues with whatever if_data currently holds.
    successful = self._node_sensor.read_data("if_data", self._get_debug())
    if not successful:
        logger.error(
            "NodeDataMsgHandler, _generate_if_data was NOT successful.")

    interfaces = self._node_sensor.if_data
    nw_alerts = self._get_nwalert(interfaces)

    # Get all cable connections state and generate alert on
    # cables identified for fault detected and resolved state
    nw_cable_alerts = self._nw_cable_alert_exists(interfaces)
    for nw_cable_resource_id, state in nw_cable_alerts.items():
        severity = self.severity_reader.map_severity(state)
        # Start every cable alert with an empty description so text from
        # a previous iteration is never reused.
        event_field = ""

        # Check if any nw interface fault is there because of cable pull.
        # The membership test avoids a KeyError when the cable alert has
        # no matching interface alert.
        if (nw_alerts and nw_cable_resource_id in nw_alerts
                and nw_alerts[nw_cable_resource_id] == state):
            if state == self.FAULT:
                self.INTERFACE_FAULT_DETECTED = True
                # Mark the fault as detected for the respective interface
                self.interface_fault_state[nw_cable_resource_id] = \
                    self.INTERFACE_FAULT_DETECTED
                event_field = (
                    f'Network interface: {nw_cable_resource_id} '
                    'is also down because of cable fault')
            else:
                event_field = (
                    f'Network interface: {nw_cable_resource_id} '
                    'is also up after cable insertion')

        # Send the cable alert
        self._send_ifdata_json_msg(
            "nw", nw_cable_resource_id, self.NW_CABLE_RESOURCE_TYPE,
            state, severity, event_field)

    # Check for Nw interface fault
    for nw_resource_id, nw_state in nw_alerts.items():
        # Check if nw interface fault is resolved. If resolved, check
        # whether its resolved by cable insertion by checking the
        # self.interface_fault_state dictionary.
        if (self.interface_fault_state
                and nw_state == self.FAULT_RESOLVED
                and not self.interface_fault_state.get(nw_resource_id)):
            # Drop the tracking entry for that interface so a later fault
            # on the same interface is reported again. pop() with a
            # default is used because this branch is also reached when
            # the interface has no entry at all.
            # NOTE(review): the condition requires a falsy/missing entry,
            # while the entries stored above are truthy - confirm the
            # intended condition.
            self.interface_fault_state.pop(nw_resource_id, None)
            continue
        elif self.interface_fault_state.get(nw_resource_id):
            # Fault already reported with the cable alert; don't repeat.
            continue

        if nw_state == self.FAULT:
            event_field = f'Network interface {nw_resource_id} is down'
        else:
            event_field = f'Network interface {nw_resource_id} is up'

        # Otherwise, or for another interface, send the alert
        severity = self.severity_reader.map_severity(nw_state)
        self._send_ifdata_json_msg(
            "nw", nw_resource_id, self.NW_RESOURCE_TYPE,
            nw_state, severity, event_field)
def get_systemd_service_info(self, service_name):
    """Get info of specified service using dbus API.

    Loads the systemd unit for ``service_name``, reads its properties and
    fills a health template with status, description, recommendation and
    a single-entry ``specifics`` list.

    Returns:
        The populated health template, or None when the unit or its
        properties interface could not be obtained (DBusException).
    """
    try:
        bus = Service()._bus
        unit_path = Service()._manager.LoadUnit(service_name)
        unit = bus.get_object(const.SYSTEMD_BUS, unit_path)
        properties_iface = Interface(unit, dbus_interface=PROPERTIES_IFACE)
    except DBusException as err:
        logger.error(
            self.log.svc_log(
                f"Unable to initialize {service_name} due to {err}"))
        return None

    # First element of the first ExecStart entry; "NA" when the property
    # is empty.
    path_array = properties_iface.Get(const.SERVICE_IFACE, 'ExecStart')
    try:
        command_line_path = str(path_array[0][0])
    except IndexError as err:
        logger.error(
            self.log.svc_log(
                f"Unable to find {service_name} path due to {err}"))
        command_line_path = "NA"

    # UnitFileState is only queried when no command line path was found.
    is_installed = (
        command_line_path != "NA"
        or 'invalid' in properties_iface.Get(
            const.UNIT_IFACE, 'UnitFileState'))
    uid = str(properties_iface.Get(const.UNIT_IFACE, 'Id'))

    # Defaults describe a service that is not installed; the installed
    # case overrides them below (key order is preserved by update()).
    details = {
        "service_name": uid,
        "description": "NA",
        "installed": str(is_installed).lower(),
        "pid": "NA",
        "state": "NA",
        "substate": "NA",
        "status": "NA",
        "license": "NA",
        "version": "NA",
        "command_line_path": "NA"
    }
    health_status = "NA"
    health_description = f"Software enabling {uid} is not installed"
    recommendation = "NA"

    if is_installed:
        service_description = str(
            properties_iface.Get(const.UNIT_IFACE, 'Description'))
        state = str(properties_iface.Get(const.UNIT_IFACE, 'ActiveState'))
        substate = str(properties_iface.Get(const.UNIT_IFACE, 'SubState'))

        unit_file_state = properties_iface.Get(
            const.UNIT_IFACE, 'UnitFileState')
        if 'disabled' in unit_file_state:
            service_status = 'disabled'
        else:
            service_status = 'enabled'

        if state == "inactive":
            pid = "NA"
        else:
            pid = str(
                properties_iface.Get(const.SERVICE_IFACE, 'ExecMainPID'))

        # Version and license stay "NA" when the rpm lookup fails.
        version = "NA"
        try:
            version = Service().get_service_info_from_rpm(uid, "VERSION")
        except ServiceError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to get service version due to {err}"))

        service_license = "NA"
        try:
            service_license = Service().get_service_info_from_rpm(
                uid, "LICENSE")
        except ServiceError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to get service license due to {err}"))

        details.update({
            "description": service_description,
            "pid": pid,
            "state": state,
            "substate": substate,
            "status": service_status,
            "license": service_license,
            "version": version,
            "command_line_path": command_line_path
        })

        healthy = (service_status == 'enabled' and state == 'active'
                   and substate == 'running')
        if healthy:
            health_status = 'OK'
            health_description = f"{uid} is in good health"
            recommendation = "NA"
        else:
            health_status = state
            health_description = f"{uid} is not in good health"
            recommendation = const.DEFAULT_RECOMMENDATION

    specifics = [details]
    service_info = self.get_health_template(uid, is_fru=False)
    self.set_health_data(service_info, health_status, health_description,
                         recommendation, specifics)
    return service_info
def _generate_disk_space_alert(self):
    """Create & transmit a disk_space_alert message as defined
    by the sensor response json schema.

    A fault message is sent once when disk usage reaches or exceeds the
    configured threshold, and a fault_resolved message is sent once when
    usage later drops strictly below it. ``self.disk_fault`` tracks which
    of the two states was reported last.
    """
    # Notify the node sensor to update its data required for the
    # disk_space_data message
    successful = self._node_sensor.read_data(
        "disk_space_alert", self._get_debug(), self._units)
    if not successful:
        logger.error(
            "NodeDataMsgHandler, _generate_disk_space_alert "
            "was NOT successful.")
        return

    # Normalise the configured threshold to int or float depending on
    # how it was written in the config file; fall back to the default
    # when it is not numeric.
    raw_threshold = str(self._disk_usage_threshold)
    try:
        if raw_threshold.isdigit():
            self._disk_usage_threshold = int(raw_threshold)
        else:
            self._disk_usage_threshold = float(raw_threshold)
    except ValueError:
        logger.warning(
            "Disk Space Alert, Invalid disk_usage_threshold value "
            "are entered in config.")
        self._disk_usage_threshold = self.DEFAULT_DISK_USAGE_THRESHOLD

    def _publish(alert_type, event):
        """Log the event, build the alert message and queue it for egress."""
        logger.warning(event)
        disk_space_alert_msg = DiskSpaceAlertMsg(
            self._node_sensor.host_id,
            self._epoch_time,
            self._node_sensor.total_space,
            self._node_sensor.free_space,
            self._node_sensor.disk_used_percentage,
            self._units,
            self.site_id, self.rack_id,
            self.node_id, self.cluster_id,
            alert_type, event)

        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            disk_space_alert_msg.set_uuid(self._uuid)
        json_msg = disk_space_alert_msg.getJson()
        self.disk_sensor_data = json_msg
        self.os_sensor_type["disk_space"] = self.disk_sensor_data

        # Transmit it out over rabbitMQ channel
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)

    used = self._node_sensor.disk_used_percentage
    threshold = self._disk_usage_threshold

    if used >= threshold:
        # Report the fault only on the transition into the fault state.
        if not self.disk_fault:
            self.disk_fault = True
            _publish(
                self.FAULT,
                "Disk usage increased to %s, beyond configured "
                "threshold of %s" % (used, threshold))
    elif self.disk_fault:
        # Strictly below the threshold: usage exactly at the threshold
        # stays in the fault state, so a fault is never followed by an
        # immediate fault_resolved in the same call.
        _publish(
            self.FAULT_RESOLVED,
            "Disk usage decreased to %s, lesser than configured "
            "threshold of %s" % (used, threshold))
        self.disk_fault = False
def _transmit_msg_on_exchange(self):
    """Transmit json message onto RabbitMQ exchange.

    Routes ``self._jsonMsg`` by content: ack and thread_controller
    responses go to the ack connection, IEM routing requests go to the
    IEM connection, everything else goes to the default connection. If
    publishing on the default connection fails with a connection error,
    the encoded message is put on ``store_queue`` instead. Any other
    exception is logged and swallowed.
    """
    self._log_debug(
        "_transmit_msg_on_exchange, jsonMsg: %s" % self._jsonMsg)

    try:
        message = self._jsonMsg.get("message")
        response_type = message.get("actuator_response_type")
        thread_controller = None
        if response_type is not None:
            thread_controller = response_type.get("thread_controller")

        # Check for shut down message from sspl_ll_d and set a flag to
        # shutdown once our message queue is empty
        if thread_controller is not None and \
                thread_controller.get("thread_response") == \
                "SSPL-LL is shutting down":
            logger.info(
                "RabbitMQegressProcessor, _transmit_msg_on_exchange, "
                "received global shutdown message from sspl_ll_d")
            self._request_shutdown = True

        msg_props = pika.BasicProperties()
        msg_props.content_type = "text/plain"

        # Publish json message to the correct channel
        # NOTE: We need to route ThreadController messages to ACK channel.
        # We can't modify schema as it will affect other modules too. As a
        # temporary solution we have added a extra check to see if
        # actuator_response_type is "thread_controller".
        # TODO: Find a proper way to solve this issue. Avoid changing
        #       core egress processor code
        if response_type is not None and \
                (response_type.get("ack") is not None or
                 thread_controller is not None):
            self._add_signature()
            jsonMsg = json.dumps(self._jsonMsg).encode('utf8')
            self._ack_connection.publish(
                exchange=self._exchange_name,
                routing_key=self._ack_routing_key,
                properties=msg_props,
                body=jsonMsg)

        # Routing requests for IEM msgs sent from the LoggingMsgHandler
        elif message.get("IEM_routing") is not None:
            log_msg = message.get("IEM_routing").get("log_msg")
            self._log_debug("Routing IEM: %s" % log_msg)
            if self._iem_route_addr != "":
                self._iem_connection.publish(
                    exchange=self._iem_route_exchange_name,
                    routing_key=self._routing_key,
                    properties=msg_props,
                    body=str(log_msg))
            else:
                logger.warn(
                    "RabbitMQegressProcessor, Attempted to route IEM "
                    "without a valid 'iem_route_addr' set.")

        else:
            self._add_signature()
            jsonMsg = json.dumps(self._jsonMsg).encode('utf8')
            try:
                self._connection.publish(
                    exchange=self._exchange_name,
                    routing_key=self._routing_key,
                    properties=msg_props,
                    body=jsonMsg)
            except connection_exceptions:
                logger.error(
                    "RabbitMQegressProcessor, _transmit_msg_on_exchange, "
                    "rabbitmq connectivity lost, adding message to "
                    "consul %s" % self._jsonMsg)
                store_queue.put(jsonMsg)

        # Reached when the message was published, and also when it was
        # queued on store_queue after a connection failure.
        self._log_debug(
            "_transmit_msg_on_exchange, Successfully Sent: %s"
            % self._jsonMsg)

        # If event is added by sensors, set it
        if self._event:
            self._event.set()

    except Exception as ex:
        logger.error(
            "RabbitMQegressProcessor, _transmit_msg_on_exchange: %r" % ex)
def _process_msg(self, ch, method, properties, body):
    """Parses the incoming message and hands off to the appropriate module.

    ``body`` may be an already decoded dict or a JSON document. The
    message is authenticated (when the security lib is in use), validated
    against the actuator or sensor schema, forwarded to the first matching
    internal message handler and then acknowledged. Messages that are
    neither actuator nor sensor requests are ignored. Any exception
    results in an error ack being queued for egress.

    ``properties`` is not used by this method.
    """
    ingressMsg = {}
    uuid = None

    # Request key -> internal handler name, checked in this order.
    # Actuator handlers come first, then sensor handlers.
    handlers = (
        ("logging", "LoggingMsgHandler"),
        ("thread_controller", "ThreadController"),
        ("service_controller", "ServiceMsgHandler"),
        ("node_controller", "NodeControllerMsgHandler"),
        ("storage_enclosure", "RealStorActuatorMsgHandler"),
        ("node_data", "NodeDataMsgHandler"),
        ("enclosure_alert", "RealStorEnclMsgHandler"),
    )

    try:
        if isinstance(body, dict):
            ingressMsg = body
        else:
            ingressMsg = json.loads(body)

        # Authenticate message using username and signature fields
        username = ingressMsg.get("username")
        signature = ingressMsg.get("signature")
        message = ingressMsg.get("message")
        uuid = ingressMsg.get("uuid")
        # A missing "message" raises here and is reported through the
        # error ack in the except block below.
        msg_len = len(message) + 1

        if uuid is None:
            uuid = "N/A"

        if use_security_lib and \
                SSPL_SEC.sspl_verify_message(
                    msg_len, str(message), username, signature) != 0:
            logger.warn(
                "RabbitMQingressProcessor, Authentication failed on "
                "message: %s" % ingressMsg)
            return

        # Get the incoming message type and validate it against the
        # matching schema
        if message.get("actuator_request_type") is not None:
            msgType = message.get("actuator_request_type")
            validate(ingressMsg, self._actuator_schema)
        elif message.get("sensor_request_type") is not None:
            msgType = message.get("sensor_request_type")
            validate(ingressMsg, self._sensor_schema)
        else:
            # We only handle incoming actuator and sensor requests,
            # ignore everything else.
            return

        # Check for debugging being activated in the message header
        self._check_debug(message)
        self._log_debug("_process_msg, ingressMsg: %s" % ingressMsg)

        # Hand off to the first handler whose key is present
        for request_key, handler_name in handlers:
            if msgType.get(request_key) is not None:
                self._write_internal_msgQ(handler_name, message)
                break
        else:
            # Send ack about not finding a msg handler
            ack_msg = AckResponseMsg(
                "Error Processing Message",
                "Message Handler Not Found", uuid).getJson()
            self._write_internal_msgQ(
                RabbitMQegressProcessor.name(), ack_msg)

        # Acknowledge message was received
        self._connection.ack(ch, delivery_tag=method.delivery_tag)

    except Exception:
        logger.error(
            "RabbitMQingressProcessor, _process_msg unrecognized "
            "message: %r" % ingressMsg)
        ack_msg = AckResponseMsg(
            "Error Processing Msg", "Msg Handler Not Found", uuid).getJson()
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), ack_msg)