Example #1
    def ws_request(self, url, method, retry_count=MAX_RETRIES,
            post_data=""):
        """Make webservice requests using common utils"""
        response = None
        retried_login = False
        need_relogin = False
        tried_alt_ip = False

        while retry_count:
            if tried_alt_ip:
                # Rebuild the URL to point at the alternative controller IP,
                # preserving the API path from the old URL.
                url = self.build_url(url[url.index('/api/'):].replace('/api',''))

            response = self.ws.ws_request(method, url,
                       self.common_reqheaders, post_data,
                       self.WEBSERVICE_TIMEOUT)

            retry_count -= 1

            if response is None:
                continue

            self.ws_response_status = response.status_code

            if response.status_code == self.ws.HTTP_OK:

                self.mc_timeout_counter = 0

                try:
                    jresponse = json.loads(response.content)

                    # TODO: Find a more optimal way to check return-code 2;
                    # currently it is checked for every HTTP 200 response
                    if jresponse:

                        if jresponse['status'][0]['return-code'] == self.CLIAPI_RESP_FAILURE:
                            response_status = jresponse['status'][0]['response']

                            # If the call fails with an invalid session key
                            # (seen in G280 fw version), a re-login is needed
                            if self.CLIAPI_RESP_INVSESSION in response_status:
                                need_relogin = True

                except ValueError as badjson:
                    logger.error("%s returned mal-formed json:\n%s" % (url, badjson))

            # http 403 forbidden request, login & retry
            elif (response.status_code == self.ws.HTTP_FORBIDDEN or \
                need_relogin) and retried_login is False:
                logger.info("%s failed, retrying after login " % (url))

                self.login()
                retried_login = True
                need_relogin = False
                continue

            elif (response.status_code == self.ws.HTTP_TIMEOUT or \
                     response.status_code == self.ws.HTTP_CONN_REFUSED or \
                     response.status_code == self.ws.HTTP_NO_ROUTE_TO_HOST) \
                     and tried_alt_ip is False:
                self.switch_to_alt_mc()
                tried_alt_ip = True
                self.mc_timeout_counter += 1
                continue

            break

        return response
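
A minimal sketch of the retry pattern used above, in plain Python with hypothetical helper callables (do_request, do_login and switch_endpoint are stand-ins, not the project's API): retry up to a fixed count, re-login once on HTTP 403, and fail over once to the alternative controller on connection-level errors.

MAX_RETRIES = 2
CONNECTION_ERRORS = (408, 503)   # hypothetical "can't reach controller" codes

def request_with_retries(do_request, do_login, switch_endpoint,
                         retry_count=MAX_RETRIES):
    retried_login = False
    tried_alt = False
    response = None
    while retry_count:
        response = do_request()
        retry_count -= 1
        if response is None:
            continue
        if response["status"] == 200:
            break
        if response["status"] == 403 and not retried_login:
            do_login()            # refresh the session key, then retry
            retried_login = True
            continue
        if response["status"] in CONNECTION_ERRORS and not tried_alt:
            switch_endpoint()     # switch to the alternative controller IP
            tried_alt = True
            continue
        break
    return response
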
    def __init__(self):
        super(RealStorEnclosure, self).__init__()

        # WS Request common headers
        self.ws = WebServices()
        self.common_reqheaders = {}

        self.encl_conf = self.CONF_SECTION_MC

        self.system_persistent_cache = self.encl_cache + "system/"
        self.faults_persistent_cache = self.system_persistent_cache + "faults.json"

        # Read in mc value from configuration file
        self.mc1 = self.conf_reader._get_value_with_default(
            self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("primary_controller_ip"), self.DEFAULT_MC_IP)
        self.mc1_wsport = self.conf_reader._get_value_with_default(
            self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("primary_controller_port"), '')
        self.mc2 = self.conf_reader._get_value_with_default(
            self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("secondary_controller_ip"), self.DEFAULT_MC_IP)
        self.mc2_wsport = self.conf_reader._get_value_with_default(
            self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("secondary_controller_port"), '')

        self.active_ip = self.mc1
        self.active_wsport = self.mc1_wsport

        self.user = self.conf_reader._get_value_with_default(
            self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("user"), self.DEFAULT_USER)
        self.passwd = self.conf_reader._get_value_with_default(
            self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("password"), self.DEFAULT_PASSWD)

        self.mc_interface = self.conf_reader._get_value_with_default(
                                self.encl_conf, COMMON_CONFIGS.get(self.encl_conf).get("mgmt_interface"), "cliapi")

        self.pollfreq = int(self.conf_reader._get_value_with_default(
            self.CONF_REALSTORSENSORS, "polling_frequency", self.DEFAULT_POLL))

        self.site_id = self.conf_reader._get_value_with_default(
                                                self.SYSTEM_INFORMATION,
                                                COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.SITE_ID),
                                                '001')
        self.rack_id = self.conf_reader._get_value_with_default(
                                                self.SYSTEM_INFORMATION,
                                                COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.RACK_ID),
                                                '001')
        self.node_id = self.conf_reader._get_value_with_default(
                                                self.SYSTEM_INFORMATION,
                                                COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.NODE_ID),
                                                '001')
        # Need to keep cluster_id string here to generate decryption key
        self.cluster_id = self.conf_reader._get_value_with_default(
                                                self.SYSTEM_INFORMATION,
                                                COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.CLUSTER_ID),
                                                '001')
        # Decrypt MC Password
        decryption_key = encryptor.gen_key(self.cluster_id, ServiceTypes.STORAGE_ENCLOSURE.value)
        self.passwd = encryptor.decrypt(decryption_key, self.passwd.encode('ascii'), "RealStoreEncl")

        if self.mc_interface not in self.realstor_supported_interfaces:
            logger.error("Unspported Realstor interface configured,"
                " monitoring and alerts generation may hamper")
            return

        # login to mc to get session key, required for querying resources
        # periodically
        self.login()
    def run(self):
        logger.info(f"Monitoring Services : {self.services_to_monitor}")
        try:
            # Register all the services to the 'PropertiesChanged' signal and
            # raise an alert if a service is not active initially or if the
            # unit is not found for the service
            services_to_monitor_copy = copy.deepcopy(self.services_to_monitor)
            for service in services_to_monitor_copy:
                err = self.connect_to_prop_changed_signal(service)
                if err:
                    self.raise_alert(service, "N/A", "N/A", "N/A", "N/A",
                                     "N/A", "N/A", 0)
                    logger.error(
                        f"{service} is not active initially. \n Error {err}")
                else:
                    self.services_to_monitor.remove(service)

            logger.debug(f"failed_services : {self.failed_services}")
            logger.debug(f"services_to_monitor : {self.services_to_monitor}")

            # Retrieve the main loop which will be called in the run method
            self._loop = GLib.MainLoop()

            # Initialize the gobject threads and get its context
            GLib.threads_init()
            context = self._loop.get_context()

            time_to_check_lists = self.current_time() + self.polling_frequency

            # Main loop: every 'thread_sleep' seconds, check for property-change
            # events (using context iteration); after every 'polling_frequency'
            # interval, check for inactive processes.
            while self.is_running():
                # At interval of 'thread_sleep', check for events that occurred
                # for registered services and process them (call on_prop_changed())
                context.iteration(False)
                time.sleep(self.thread_sleep)

                # At interval of 'polling_frequency', process unregistered
                # services and services in a not-active (intermediate) state.
                if time_to_check_lists <= self.current_time():
                    time_to_check_lists = self.current_time() + \
                                            self.polling_frequency

                    # Try to bind the enabled services on the node to the
                    # signal whose Unit was earlier not found. On successfully
                    # registering for service state change signal, remove from
                    # local list as monitoring enabled through SystemD
                    # and to avoid re-registration.
                    services_to_monitor_copy = copy.deepcopy(
                        self.services_to_monitor)
                    for service in services_to_monitor_copy:
                        if not self.connect_to_prop_changed_signal(service):
                            self.services_to_monitor.remove(service)

                    # Check for services in intermediate state (not active)
                    self.check_notactive_services()


            logger.info("ServiceMonitor gracefully breaking out " +\
                                "of dbus Loop, not restarting.")
        except GLib.Error as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungrecefully breaking out of GLib.MainLoop() with error: %s" %
                err)
        except DBusException as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of dbus loop with error: %s" % err)
        except Exception as err:
            raise ThreadException(self.SENSOR_NAME,
                "Ungracefully breaking out of ServiceMonitor:run() "\
                "with error: %s" % err)
Example #4
    def _put_enclosure_action(self, ctrl_action, ctrl_type, resource,
            enclosure_request):
        severity = "informational"
        message = "request performed successfully."
        invalid_args = False
        ctrl_cmd = ""
        action_type = ""
        if resource.startswith(("controller_a", "controller_b")):
            ctrl_name = resource.split("controller_")[1].split(':')[0]
        elif resource == "*":
            ctrl_name = "both"
        else:
            ctrl_name = ""
            invalid_args = True

        if ctrl_action == "shutdown":
            ctrl_cmd = f"{ctrl_action} {ctrl_name}"
            action_type = ctrl_action
        else:
            ctrl_cmd = f"{ctrl_action} {ctrl_type} {ctrl_name}"
            action_type = f"{ctrl_action} {ctrl_type}"

        if (ctrl_action == "restart" and not resource.endswith(('sc', 'mc', "*"))) \
            or (ctrl_action == "shutdown" and resource.endswith(('sc', 'mc'))) or \
            not ctrl_cmd.endswith((' a', ' b', ' both')):
            # Validate that we are passing an appropriate/correct
            # controller in "ctrl_cmd"
            # Example : shutdown <a|b|both> or restart sc <a|b|both>
            invalid_args = True

        if invalid_args:
            # Invalid resource 'shutdown abc' for an
            # 'ENCL: enclosure:fru:controller:shutdown' actuator request
            err_msg = "Invalid resource '{}' for an '{}' actuator request".format(
                resource, enclosure_request)
            logger.error(err_msg)
            raise Exception(err_msg)

        encl_response = self._get_encl_response(
            self.rssencl.URI_CLIAPI_BASE + ctrl_cmd.replace(' ', '/'),
            self.rssencl.ws.HTTP_GET
        )
        if encl_response == {}:
            # The control request was not performed successfully.
            # Got HTTP Status != 200 or return_code != 0.
            severity = "warning"
            message = "request failed. Please try again...!"

        # Some info in "response_str" is not required further and is deleted in
        # the "make_response" method; it is kept here because "make_response"
        # uses it to decide some fields.
        response_str = {
            'shutdown' : {
                'message': 'Shutdown %s' % (message),
                'description': 'Shuts down the Storage Controller in a controller '
                    'module. This ensures that a proper failover sequence is used, '
                    'which includes stopping all I/O operations and writing any '
                    'data in write cache to disk.',
                'command': ctrl_cmd,
                'alert_type': 'control:shutdown',
                'severity': severity,
                'resource_type': 'enclosure:fru:controller'
                },
            'restart sc' : {
                'message': 'Restart / Start Storage Controller %s' % (message),
                'description' : 'Restarts the Storage Controller in a controller '
                    'module. When you restart a Storage Controller, it attempts '
                    'to shut down with a proper failover sequence, which includes '
                    'stopping all I/O operations and flushing the write cache to '
                    'disk, and then the Storage Controller restarts. Restarting a '
                    'Storage Controller restarts the corresponding Management '
                    'Controller.',
                'command': ctrl_cmd,
                'alert_type': 'control:restart',
                'severity': severity,
                'resource_type': 'enclosure:fru:controller:sc'
                },
            'restart mc' : {
                'message' : 'Restart / Start Management Controller %s' % (message),
                'description' : 'Restarts the Management Controller in a '
                    'controller module. When you restart a Management Controller,'
                    ' communication with it is lost until it successfully '
                    'restarts. If the restart fails, the partner Management '
                    'Controller remains active with full ownership of operations '
                    'and configuration information.',
                'command': ctrl_cmd,
                'alert_type': 'control:restart',
                'severity': severity,
                'resource_type': 'enclosure:fru:controller:mc'
                }
        }
        return response_str[action_type]
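
A small sketch of how the validated command becomes a CLI API path in the request above (the URI_CLIAPI_BASE value here is a hypothetical placeholder; the real one comes from self.rssencl): spaces in the command are replaced with '/' and the result is appended to the base URI.

URI_CLIAPI_BASE = "/api/"   # hypothetical placeholder

def build_ctrl_uri(ctrl_action, ctrl_type, ctrl_name):
    if ctrl_action == "shutdown":
        ctrl_cmd = f"{ctrl_action} {ctrl_name}"
    else:
        ctrl_cmd = f"{ctrl_action} {ctrl_type} {ctrl_name}"
    return URI_CLIAPI_BASE + ctrl_cmd.replace(' ', '/')

assert build_ctrl_uri("shutdown", "", "a") == "/api/shutdown/a"
assert build_ctrl_uri("restart", "sc", "both") == "/api/restart/sc/both"
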
Example #5
    def perform_request(self, jsonMsg):
        """Performs the RealStor enclosure request

        @return: The response string from performing the request
        """
        response = "N/A"
        try:
            enclosure_request = jsonMsg.get("actuator_request_type").get("storage_enclosure").get("enclosure_request")
            enclosure_request_data = [
                s.strip() for s in enclosure_request.split(":")]
            ctrl_action = ""
            ctrl_type = ""
            if enclosure_request_data[-1] == "shutdown":
                # "ENCL: enclosure:fru:controller:shutdown"
                (request_type, _, component, component_type,
                    ctrl_action) = enclosure_request_data
            elif enclosure_request_data[-1] == "restart":
                # "ENCL: enclosure:fru:controller:sc:restart"
                # "ENCL: enclosure:fru:controller:mc:restart"
                (request_type, _, component, component_type,
                    ctrl_type, ctrl_action) = enclosure_request_data
            else:
                # "ENCL: enclosure:fru:controller"
                (request_type, _, component, component_type) = \
                    enclosure_request_data

            resource = jsonMsg.get("actuator_request_type").get("storage_enclosure").get("resource")
            if ctrl_action in self.CTRL_ACTION_LST:
                response = self.make_response(
                    self._put_enclosure_action(ctrl_action, ctrl_type,
                    resource.strip(), enclosure_request),
                    component, component_type, resource,
                    ctrl_action = ctrl_action)
            elif component == "fru":
                response = self.make_response(self.request_fru_func[
                    request_type][component_type](resource), component,
                    component_type, resource)
            elif component == "sensor":
                response = self.make_response(
                            self._get_sensor_data(sensor_type=component_type, sensor_name=resource),
                            component,
                            component_type,
                            resource)
            elif component == "interface":
                enclosure_type = enclosure_request.split(":")[2]
                if enclosure_type == ResourceTypes.INTERFACE.value:
                    response = self._handle_ports_request(enclosure_request, resource)
                else:
                    logger.error("Some unsupported interface passed, interface:{}".format(enclosure_type))
            elif component == "system":
                if component_type == 'info':
                    response = self.make_response(
                            self._get_system_info(),
                            component,
                            component_type,
                            resource)
                else:
                    logger.error("Unsupported system request :{}".format(enclosure_request))

        except Exception as e:
            logger.exception("Error while getting details for JSON: {}".format(jsonMsg))
            response = {"Error": e}

        return response
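
For reference, a hypothetical request payload in the shape this method parses (field names taken from the code and comments above), showing how the enclosure_request string is split and unpacked:

json_msg = {
    "actuator_request_type": {
        "storage_enclosure": {
            "enclosure_request": "ENCL: enclosure:fru:controller:sc:restart",
            "resource": "controller_a"
        }
    }
}

enclosure_request = json_msg["actuator_request_type"]["storage_enclosure"]["enclosure_request"]
parts = [s.strip() for s in enclosure_request.split(":")]
# parts == ['ENCL', 'enclosure', 'fru', 'controller', 'sc', 'restart']
request_type, _, component, component_type, ctrl_type, ctrl_action = parts
assert component == "fru" and ctrl_action == "restart"
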
    def _read_config(self):
        """Configure the RabbitMQ exchange with defaults available"""
        try:
            self._virtual_host  = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.VIRT_HOST,
                                                                 'SSPL')

            # Read common RabbitMQ configuration
            self._primary_rabbitmq_host = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.PRIMARY_RABBITMQ_HOST,
                                                                 'localhost')

            # Read RabbitMQ configuration for sensor messages
            self._queue_name    = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.QUEUE_NAME,
                                                                 'sensor-queue')
            self._exchange_name = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.EXCHANGE_NAME,
                                                                 'sspl-out')
            self._routing_key   = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.ROUTING_KEY,
                                                                 'sensor-key')
            # Read RabbitMQ configuration for Ack messages
            self._ack_queue_name = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.ACK_QUEUE_NAME,
                                                                 'sensor-queue')
            self._ack_routing_key = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.ACK_ROUTING_KEY,
                                                                 'sensor-key')

            self._username = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.USER_NAME,
                                                                 'sspluser')
            self._password = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.PASSWORD,
                                                                 '')
            self._signature_user = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.SIGNATURE_USERNAME,
                                                                 'sspl-ll')
            self._signature_token = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.SIGNATURE_TOKEN,
                                                                 'FAKETOKEN1234')
            self._signature_expires = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.SIGNATURE_EXPIRES,
                                                                 "3600")
            self._iem_route_addr = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.IEM_ROUTE_ADDR,
                                                                 '')
            self._iem_route_exchange_name = self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                                 self.IEM_ROUTE_EXCHANGE_NAME,
                                                                 'sspl-in')

            cluster_id = self._conf_reader._get_value_with_default(self.SYSTEM_INFORMATION_KEY,
                                                                   COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.CLUSTER_ID_KEY),
                                                                   '')

            # Decrypt RabbitMQ Password
            decryption_key = encryptor.gen_key(cluster_id, ServiceTypes.RABBITMQ.value)
            self._password = encryptor.decrypt(decryption_key, self._password.encode('ascii'), "RabbitMQegressProcessor")

            if self._iem_route_addr != "":
                logger.info("         Routing IEMs to host: %s" % self._iem_route_addr)
                logger.info("         Using IEM exchange: %s" % self._iem_route_exchange_name)
        except Exception as ex:
            logger.error("RabbitMQegressProcessor, _read_config: %r" % ex)
Example #7
    def connect_to_prop_changed_signal(self, service):
        """
           Bind the service to a signal('PropertiesChanged').

           Fetch the service unit from systemd along with its state, substate,
           pid etc. Bind the service to the signal which will be triggered
           whenever the service changes its state/substate. Also raise
           an alert if the service is in a failed/inactive state.
        """
        try:
            unit, _, state, substate, pid = self.get_service_status(
                service=service)

            if service in self.service_status:
                prev_state = self.service_status[service]['state']
                prev_substate = self.service_status[service]['substate']
                prev_pid = self.service_status[service]['pid']
            else:
                prev_state = prev_substate = prev_pid = "N/A"

            self.update_status_local_cache(service, state, substate, pid)

            Iunit2 = Interface(
                unit, dbus_interface='org.freedesktop.systemd1.Manager')

            Iunit2.connect_to_signal(
                'PropertiesChanged',
                lambda a, b, c, p=unit: self.on_prop_changed(a, b, c, p),
                dbus_interface=PROPERTIES_IFACE)

            logger.debug(f"{service}({pid}) state is {state}:{substate}")

            if state in ["activating", "reloading", "deactivating"]:
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                                    [self.current_time(), state, substate]
            elif state != "active" and service not in self.failed_services:
                self.raise_alert(service, prev_state, state, prev_substate,
                                 substate, prev_pid, pid, 0)
                if service in self.not_active_services:
                    self.not_active_services.pop(service)
                self.failed_services.append(service)
                logger.error(
                    f"{service} is not active initially. state = {state}:{substate}"
                )
            elif state == "active":
                if service in self.failed_services:
                    self.raise_alert(service, prev_state, state, prev_substate,
                                     substate, prev_pid, pid, 2)
                    self.failed_services.remove(service)
                    logger.info(
                        f"{service} returned to good state. state = {state}:{substate}"
                    )
                if service in self.not_active_services:
                    self.not_active_services.pop(service)

            self.update_persistent_cache()

            return None
        except DBusException as err:
            return err
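
A compact sketch of the state triage above: transitional states are parked for a later re-check, any other non-active state raises a fault once, and 'active' resolves a previously raised fault (sets are used here in place of the project's tracking dict/list).

TRANSITIONAL_STATES = {"activating", "reloading", "deactivating"}

def triage_service_state(service, state, not_active, failed):
    """Return 'fault', 'fault_resolved' or None and update the tracking sets."""
    if state in TRANSITIONAL_STATES:
        not_active.add(service)
        return None
    if state != "active" and service not in failed:
        not_active.discard(service)
        failed.add(service)
        return "fault"
    if state == "active" and service in failed:
        not_active.discard(service)
        failed.remove(service)
        return "fault_resolved"
    return None

failed, not_active = set(), set()
assert triage_service_state("a.service", "failed", not_active, failed) == "fault"
assert triage_service_state("a.service", "active", not_active, failed) == "fault_resolved"
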
    def _generate_host_update(self):
        """Create & transmit a host update message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the host_update message
        successful = self._node_sensor.read_data("host_update", self._get_debug(), self._units)
        if not successful:
            logger.error("NodeDataMsgHandler, _generate_host_update was NOT successful.")

        self._host_memory_usage_threshold = str(self._host_memory_usage_threshold)
        try:
            if self._host_memory_usage_threshold.isdigit():
                self._host_memory_usage_threshold = int(self._host_memory_usage_threshold)
            else:
                self._host_memory_usage_threshold = float(self._host_memory_usage_threshold)
        except ValueError:
            logger.warning("Host Memory Alert, Invalid host_memory_usage_threshold value are entered in config.")
            # Assigning default value to _disk_usage_threshold
            self._host_memory_usage_threshold = self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD
        if self._node_sensor.total_memory["percent"] >= self._host_memory_usage_threshold:
            # Raise a fault alert (once) when host memory usage crosses the threshold
            if not self.host_fault:
                self.host_fault = True
                # Compose the fault event description
                fault_event = "Host memory usage increased to %s, beyond configured threshold of %s" \
                                %(self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold)

                logger.warning(fault_event)

                logged_in_users = []
                # Create the host update message and hand it over to the egress processor to transmit
                hostUpdateMsg = HostUpdateMsg(self._node_sensor.host_id,
                                        self._epoch_time,
                                        self._node_sensor.boot_time,
                                        self._node_sensor.up_time,
                                        self._node_sensor.uname, self._units,
                                        self.site_id, self.rack_id,
                                        self.node_id, self.cluster_id,
                                        self._node_sensor.total_memory,
                                        self._node_sensor.logged_in_users,
                                        self._node_sensor.process_count,
                                        self._node_sensor.running_process_count,
                                        self.FAULT,
                                        fault_event
                                        )
                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    hostUpdateMsg.set_uuid(self._uuid)
                jsonMsg = hostUpdateMsg.getJson()
                # Transmit it out over rabbitMQ channel
                self.host_sensor_data = jsonMsg
                self.os_sensor_type["memory_usage"] = self.host_sensor_data
                self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

        if (self._node_sensor.total_memory["percent"] < self._host_memory_usage_threshold) and (self.host_fault == True):
                fault_resolved_event = "Host memory usage decreased to %s, lesser than configured threshold of %s" \
                                        %(self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold)
                logger.warning(fault_resolved_event)
                logged_in_users = []
                # Create the host update message and hand it over to the egress processor to transmit
                hostUpdateMsg = HostUpdateMsg(self._node_sensor.host_id,
                                        self._epoch_time,
                                        self._node_sensor.boot_time,
                                        self._node_sensor.up_time,
                                        self._node_sensor.uname, self._units,
                                        self.site_id, self.rack_id,
                                        self.node_id, self.cluster_id,
                                        self._node_sensor.total_memory,
                                        self._node_sensor.logged_in_users,
                                        self._node_sensor.process_count,
                                        self._node_sensor.running_process_count,
                                        self.FAULT_RESOLVED,
                                        fault_resolved_event
                                        )

                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    hostUpdateMsg.set_uuid(self._uuid)
                jsonMsg = hostUpdateMsg.getJson()
                # Transmit it out over rabbitMQ channel
                self.host_sensor_data = jsonMsg
                self.os_sensor_type["memory_usage"] = self.host_sensor_data

                self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
                self.host_fault = False
    def _generate_cpu_data(self):
        """Create & transmit a cpu_data message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the cpu_data message
        successful = self._node_sensor.read_data("cpu_data", self._get_debug())
        if not successful:
            logger.error("NodeDataMsgHandler, _generate_cpu_data was NOT successful.")

        self._cpu_usage_threshold = str(self._cpu_usage_threshold)
        try:
            if self._cpu_usage_threshold.isdigit():
                self._cpu_usage_threshold = int(self._cpu_usage_threshold)
            else:
                self._cpu_usage_threshold = float(self._cpu_usage_threshold)
        except ValueError:
            logger.warning("CPU Usage Alert, Invalid host_memory_usage_threshold value are entered in config.")
            # Assigning default value to _cpu_usage_threshold
            self._cpu_usage_threshold = self.DEFAULT_CPU_USAGE_THRESHOLD

        if self._node_sensor.cpu_usage >= self._cpu_usage_threshold:

            if not self.cpu_fault :
                self.cpu_fault = True
                # Create the cpu usage data message and hand it over to the egress processor to transmit

                fault_event = "CPU usage increased to %s, beyond configured threshold of %s" \
                                %(self._node_sensor.cpu_usage, self._cpu_usage_threshold)
                logger.warning(fault_event)

                # Create the local mount data message and hand it over to the egress processor to transmit
                cpuDataMsg = CPUdataMsg(self._node_sensor.host_id,
                                    self._epoch_time,
                                    self._node_sensor.csps,
                                    self._node_sensor.idle_time,
                                    self._node_sensor.interrupt_time,
                                    self._node_sensor.iowait_time,
                                    self._node_sensor.nice_time,
                                    self._node_sensor.softirq_time,
                                    self._node_sensor.steal_time,
                                    self._node_sensor.system_time,
                                    self._node_sensor.user_time,
                                    self._node_sensor.cpu_core_data,
                                    self._node_sensor.cpu_usage,
                                    self.site_id,
                                    self.rack_id,
                                    self.node_id,
                                    self.cluster_id,
                                    self.FAULT,
                                    fault_event
                                )

                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    cpuDataMsg.set_uuid(self._uuid)
                jsonMsg = cpuDataMsg.getJson()
                self.cpu_sensor_data = jsonMsg
                self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data

                # Transmit it out over rabbitMQ channel
                self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

        if (self._node_sensor.cpu_usage < self._cpu_usage_threshold) and self.cpu_fault:
            # Create the cpu usage data message and hand it over to the egress processor to transmit
            fault_resolved_event = "CPU usage decreased to %s, lesser than configured threshold of %s" \
                %(self._node_sensor.cpu_usage, self._cpu_usage_threshold)
            logger.warning(fault_resolved_event)

            # Create the local mount data message and hand it over to the egress processor to transmit
            cpuDataMsg = CPUdataMsg(self._node_sensor.host_id,
                                self._epoch_time,
                                self._node_sensor.csps,
                                self._node_sensor.idle_time,
                                self._node_sensor.interrupt_time,
                                self._node_sensor.iowait_time,
                                self._node_sensor.nice_time,
                                self._node_sensor.softirq_time,
                                self._node_sensor.steal_time,
                                self._node_sensor.system_time,
                                self._node_sensor.user_time,
                                self._node_sensor.cpu_core_data,
                                self._node_sensor.cpu_usage,
                                self.site_id,
                                self.rack_id,
                                self.node_id,
                                self.cluster_id,
                                self.FAULT_RESOLVED,
                                fault_resolved_event
                            )

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                cpuDataMsg.set_uuid(self._uuid)
            jsonMsg = cpuDataMsg.getJson()
            self.cpu_sensor_data = jsonMsg
            self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data

            # Transmit it out over rabbitMQ channel
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
            self.cpu_fault = False
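
Both handlers above use the same fault / fault-resolved latching: a fault alert is sent once when usage crosses the threshold and a resolved alert once when it drops back below it. A minimal standalone sketch of that pattern:

class UsageWatcher:
    def __init__(self, threshold):
        self.threshold = threshold
        self.fault = False

    def check(self, usage):
        """Return 'fault', 'fault_resolved' or None for a usage sample."""
        if usage >= self.threshold and not self.fault:
            self.fault = True
            return "fault"
        if usage < self.threshold and self.fault:
            self.fault = False
            return "fault_resolved"
        return None

w = UsageWatcher(80)
assert [w.check(u) for u in (50, 85, 90, 70)] == [None, "fault", None, "fault_resolved"]
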
Example #10
    def rss_cliapi_poll_disks(self, disk):
        """Retreive realstor disk info using cli api /show/disks"""

        # make ws request
        url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKS)

        if (disk != self.RSS_DISK_GET_ALL):
            diskId = disk.partition("0.")[2]

            if (diskId.isdigit()):
                url = f"{url}/{disk}"
        url = f"{url}/detail"

        response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(
                f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable as ws request {url} failed"
            )
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                logger.error(
                    f"{self.rssencl.LDR_R1_ENCL}:: http request {url} to poll disks failed with \
                       err {response.status_code}")
            return

        try:
            jresponse = json.loads(response.content)
        except ValueError as badjson:
            logger.error(f"{url} returned malformed json:\n{badjson}")
            jresponse = None

        if jresponse:
            api_resp = self.rssencl.get_api_status(jresponse['status'])
            #logger.debug("%s api response:%d" % (url.format(),api_resp))

            if ((api_resp == -1)
                    and (response.status_code == self.rssencl.ws.HTTP_OK)):
                logger.warn("/show/disks api response unavailable, "
                            "marking success as http code is 200")
                api_resp = 0

            if api_resp == 0:
                drives = jresponse['drives']

                # reset latest drive cache to build new
                self.latest_disks = {}
                self.invalidate_latest_disks_info = False

                for drive in drives:
                    slot = drive.get("slot", -1)
                    sn = drive.get("serial-number", "NA")
                    health = drive.get("health", "NA")

                    if slot != -1:
                        self.latest_disks[slot] = {
                            "serial-number": sn,
                            "health": health
                        }

                        #dump drive data to persistent cache
                        dcache_path = f"{self.disks_prcache}disk_{slot}.json"

                        # If drive is replaced, previous drive info needs
                        # to be retained in disk_<slot>.json.prev file and
                        # then only dump new data to disk_<slot>.json
                        path_exists, ret_val = store.exists(dcache_path)
                        if path_exists and ret_val == "Success":
                            prevdrive = store.get(dcache_path)

                            if prevdrive is not None:
                                prevsn = prevdrive.get("serial-number", "NA")
                                prevhealth = prevdrive.get("health", "NA")

                                if prevsn != sn or prevhealth != health:
                                    # Rename path
                                    store.put(store.get(dcache_path),
                                              dcache_path + ".prev")
                                    store.delete(dcache_path)

                                    store.put(drive, dcache_path)
                        elif not path_exists and ret_val == "Success":
                            store.put(drive, dcache_path)
                        else:
                            # Invalidate latest disks info if persistence store error encountered
                            logger.warn(
                                f"store.exists {dcache_path} return value {ret_val}"
                            )
                            self.invalidate_latest_disks_info = True
                            break

                if self.invalidate_latest_disks_info is True:
                    # Reset latest disks info
                    self.latest_disks = {}

            #If no in-memory cache, build from persistent cache
            if not self.memcache_disks:
                self._rss_build_disk_cache_from_persistent_cache()

            # If there is still no in-memory cache, seed it from the latest poll
            if not self.memcache_disks:
                self.memcache_disks = self.latest_disks
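
A sketch of the per-drive replacement handling above, using a plain dict as a hypothetical stand-in for the project's 'store': when the cached serial number or health differs from the newly polled drive, the previous record is kept under a '.prev' key before the cache entry is overwritten.

cache = {}   # hypothetical stand-in for the persistent store: path -> drive dict

def cache_drive(path, drive):
    prev = cache.get(path)
    if prev is None:
        cache[path] = drive
    elif (prev["serial-number"] != drive["serial-number"]
          or prev["health"] != drive["health"]):
        cache[path + ".prev"] = prev     # retain the replaced drive's last data
        cache[path] = drive

cache_drive("disk_5.json", {"serial-number": "SN1", "health": "OK"})
cache_drive("disk_5.json", {"serial-number": "SN2", "health": "OK"})
assert cache["disk_5.json.prev"]["serial-number"] == "SN1"
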
    def _process_msg(self, body):
        """Parses the incoming message and hands off to the appropriate module"""

        self._log_debug("_process_msg, body: %s" % body)

        ingressMsg = {}
        try:
            if isinstance(body, dict) is False:
                ingressMsg = json.loads(body)
            else:
                ingressMsg = body

            # Authenticate message using username and signature fields
            username = ingressMsg.get("username")
            signature = ingressMsg.get("signature")
            message = ingressMsg.get("message")

            assert (username is not None)
            assert (signature is not None)
            assert (message is not None)

            msg_len = len(message) + 1

            if SSPL_SEC.sspl_verify_message(msg_len, str(message), username,
                                            signature) != 0:
                logger.error(
                    "Authentication failed on message: %s" % ingressMsg)
                return

            # We're acting as HAlon so ignore actuator_requests
            #  and sensor_requests messages
            if message.get("actuator_request_type") is not None or \
                    message.get("sensor_request_type") is not None:
                return

            # Get the message type
            msgType = message.get("actuator_response_type")

            # If it's an incoming actuator msg then validate against
            #  Actuator Response schema
            if msgType is not None:
                validate(ingressMsg, self._actuator_schema)

            if msgType is None:
                msgType = message.get("sensor_response_type")
                validate(ingressMsg, self._sensor_schema)

                # Ignore drive status messages when thread starts up during tests
                if message.get("sensor_response_type").get(
                        "disk_status_drivemanager") is not None:
                    return
            # If the message comes from other SSPL hosts, do not pass that
            # message to internal queue. This happens as SSPL instances are
            # listening to common queues in a RabbitMQ cluster.
            if 'host_id' in msgType and socket.getfqdn() != msgType['host_id']:
                return
            # Write to the msg queue so the lettuce tests can
            #  retrieve it and examine for accuracy during automated testing
            self._write_internal_msgQ("RabbitMQingressProcessorTests", message)

        except Exception as ex:
            logger.exception(
                "_process_msg unrecognized message: %r" % ingressMsg)
Example #12
    def _rss_check_disks_presence(self):
        """Match cached realstor disk info with latest retrieved disks info """

        self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

        if not self.memcache_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn(
                    "Last polled drives info in-memory cache "
                    "unavailable , unable to check drive presence change")
                return

        if not self.latest_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn(
                    "Latest polled drives info in-memory cache "
                    "unavailable, unable to check drive presence change")
            return

        # keys are disk slot numbers
        removed_disks = set(self.memcache_disks.keys()) - set(
            self.latest_disks.keys())
        inserted_disks = set(self.latest_disks.keys()) - set(
            self.memcache_disks.keys())

        # get populated slots in both caches
        populated = set(self.memcache_disks.keys()) & set(
            self.latest_disks.keys())

        # check for replaced disks
        for slot in populated:
            if self.memcache_disks[slot]['serial-number'] != self.latest_disks[
                    slot]['serial-number']:

                if slot not in removed_disks:
                    removed_disks.add(slot)

                if slot not in inserted_disks:
                    inserted_disks.add(slot)

        # If there is no difference between the cached and the latest disk
        # lists, no disk removal or insertion happened
        if not (removed_disks or inserted_disks):
            #logger.info("Disk presence state _NOT_ changed !!!")
            return

        self._event = Event()
        for slot in removed_disks:
            #get removed drive data from disk cache
            disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"

            path_exists, _ = store.exists(disk_datafile)
            if not path_exists:
                disk_datafile = f"{self.disks_prcache}disk_{slot}.json"

            disk_info = store.get(disk_datafile)

            #raise alert for missing drive
            self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, disk_info)
            # Wait till msg is sent to rabbitmq or added in consul for resending.
            # If timed out, do not update cache
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.delete(disk_datafile)
            self._event.clear()
        self._event = None

        for slot in inserted_disks:
            #get inserted drive data from disk cache
            disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json")

            #raise alert for added drive
            self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info)

            # Update health status for inserted disk in memfault cache,
            # to raise fault alert after insertion if inserted disk status is not OK.
            if disk_info["health"] != "OK":
                for id_fault, cached_fault in enumerate(
                        self.rssencl.memcache_faults):
                    #fetch disk slot from component_id present in memcache_faults.
                    try:
                        component_id = cached_fault["component-id"]
                        if component_id.startswith('Disk 0'):
                            disk_id = int(cached_fault["component-id"].split()
                                          [1].split('.')[1])
                            if disk_id == slot:
                                self.rssencl.memcache_faults[id_fault][
                                    'health'] = "OK"
                    except Exception as e:
                        logger.error(f"Error in updating health status for \
                        inserted disk in memfault cache {e}")

        # Update cached disk data after comparison
        self.memcache_disks = self.latest_disks
        self.rssencl.memcache_frus.update({"disks": self.memcache_disks})

        return
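
The presence comparison above reduces to set arithmetic on the cached and latest slot maps; a serial-number change in a populated slot is treated as both a removal and an insertion. A tiny sketch with hypothetical data:

cached = {0: {"serial-number": "A"}, 1: {"serial-number": "B"}}
latest = {1: {"serial-number": "C"}, 2: {"serial-number": "D"}}

removed = set(cached) - set(latest)
inserted = set(latest) - set(cached)
for slot in set(cached) & set(latest):
    if cached[slot]["serial-number"] != latest[slot]["serial-number"]:
        removed.add(slot)
        inserted.add(slot)

assert removed == {0, 1} and inserted == {1, 2}
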
    def _send_msg(self, iem_components, log_timestamp):
        """Creates JSON message from iem components and sends to message bus.
        """
        impact = "NA"
        recommendation = "NA"
        # IEM format is IEC:DESCRIPTION
        # IEC format is SEVERITY|SOURCEID|COMPONENTID|MODULEID|EVENTID
        # Field lengths ----1---|---1----|------3----|----3---|---4---
        # Example IEM -> "IEC: BO1001000001:Error in connecting to controller"
        # Actual IEC doesn't contain separator between fields. It is shown
        # here just for readability. Each field has fixed length.
        severity, source_id, component_id, module_id, event_id, description = \
                                                        [iem_components[i] for i in range(6)]

        # Check if severity level is valid
        if severity not in self.SEVERITY_LEVELS:
            logger.warn(f"Invalid Severity level: {severity}")
            return

        # Check for valid source id
        if source_id not in self.SOURCE_IDS:
            logger.warn(f"Invalid Source ID level: {source_id}")
            return

        # Check for valid event time
        event_time = self._get_epoch_time_from_timestamp(log_timestamp)
        if not event_time:
            logger.error(
                "Timestamp is not in required format, discarding the message")
            return

        # Check for other components
        args = {
            "_comp_id": component_id,
            "_module_id": module_id,
            "_event_id": event_id
        }
        if not self._are_components_in_range(**args):
            return

        # component-id for sspl=005
        if component_id == "005":
            event_code = component_id + module_id + event_id
            impact = Iem().EVENT_STRING[event_code][1]
            recommendation = Iem().EVENT_STRING[event_code][2]

        # Update severity and source_id
        alert_type = iem_severity_to_alert_mapping.get(severity)
        severity = iem_severity_types.get(severity, severity)
        source_id = iem_source_types.get(source_id, source_id)

        # Decode component_id, module_id and event_id
        component_id, module_id, event_id = self._decode_msg(
            f"{component_id}{module_id}{event_id}")

        info = {
            "site_id": self._site_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "cluster_id": self._cluster_id,
            "source_id": source_id,
            "component_id": component_id,
            "module_id": module_id,
            "event_id": event_id,
            "severity": severity,
            "description": description,
            "impact": impact,
            "recommendation": recommendation,
            "alert_type": alert_type,
            "event_time": event_time,
            "IEC": "".join(iem_components[:-1])
        }
        iem_data_msg = IEMDataMsg(info)
        json_msg = iem_data_msg.getJson()
        self._write_internal_msgQ(EgressProcessor.name(), json_msg)
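
Assuming the fixed field widths documented in the comment above (1|1|3|3|4), an IEC code can be split by plain slicing; a small sketch using the example code from that comment:

def split_iec(iec):
    """Split a 12-character IEC into severity, source, component, module, event."""
    return iec[0], iec[1], iec[2:5], iec[5:8], iec[8:12]

assert split_iec("BO1001000001") == ("B", "O", "100", "100", "0001")
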
    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{self.SITE_ID}",'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{self.RACK_ID}",'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{self.NODE_ID}",'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{self.CLUSTER_ID}",'CC01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}",
                                    "sysfs")

        self.polling_interval = int(Conf.get(SSPL_CONF, f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
                                        self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Call to sas phy directory which will return a dictionary
            # which has phy_name to negotiated link rate mapping
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over populated dictionary and restructure it
            # Ex: if phy-0:0 is 12.0/6.0/3.0, considered as UP.
            # {"phy-0:0": ("link_rate", <Up/Down>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'Gbit'.lower() in value.strip().lower():
                    phy_status = 'up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert()

        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. Hence shutting "
                "down the sensor".format(sas_port_utility))
            self.shutdown()
        except Exception as e:
            if isinstance(e, OSError) and e.errno == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from sas_phy directory. "
                    "Directory path doesn't exist. Hence shutting down the sensor")
            elif isinstance(e, OSError) and e.errno == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from sas_phy directory. "
                    "Not enough permission to read from the directory. "
                    "Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from sas_phy directory. "
                    "{0}. Hence shutting down the sensor".format(e))
            self.shutdown()

        return True
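
A standalone sketch of the phy classification loop above: any negotiated link rate containing 'Gbit' counts as an 'up' link, everything else as 'fault' (the sample values here are hypothetical):

def classify_phys(phy_to_rate):
    mapping, links_up = {}, 0
    for phy, rate in phy_to_rate.items():
        status = 'up' if 'gbit' in rate.strip().lower() else 'fault'
        if status == 'up':
            links_up += 1
        mapping[phy] = (rate, status)
    return mapping, links_up

mapping, links_up = classify_phys({"phy-0:0": "12.0 Gbit", "phy-0:1": "Unknown"})
assert links_up == 1 and mapping["phy-0:1"] == ("Unknown", "fault")
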
Example #15
    def get_system_status(self):
        """Retreive realstor system state info using cli api /show/system"""

        # Poll system gets invoked from multiple realstor sensors, which can
        # make requests more often than the configured polling frequency;
        # add a check to comply with the polling frequency
        elapsed = time.time() - self.poll_system_ts

        if elapsed < self.pollfreq:
            logger.warn("/show/system request came in {0} seconds,"
                "while configured polling frequency is {1} seconds,"
                "ignoring".format(elapsed, self.pollfreq))
            return

        system = None

        # make ws request
        url = self.build_url(self.URI_CLIAPI_SHOWSYSTEM)
        #logger.info("show system url: %s" % url)

        response = self.ws_request(url, self.ws.HTTP_GET)

        if not response:
            logger.warn("System status unavailable as ws request failed")
            return

        if response.status_code != self.ws.HTTP_OK:
            logger.info("{0}:: http request {1} polling system status failed"
                " with http err {2}".format(self.LDR_R1_ENCL, url, \
                response.status_code))
            return

        self.poll_system_ts = time.time()

        try:
            jresponse = json.loads(response.content)
        except ValueError as badjson:
            logger.error("%s returned malformed json:\n%s" % (url, badjson))
            jresponse = None

        if jresponse:
            api_resp = self.get_api_status(jresponse['status'])

            if ((api_resp == -1) and
                   (response.status_code == self.ws.HTTP_OK)):
                logger.warn("/show/system api response unavailable, "
                    "marking success as http code is 200")
                api_resp = 0

            if api_resp == 0:
                system = jresponse['system'][0]
                self.memcache_system = system

            if system:
                # Check if fault exists
                if self.FAULT_KEY not in system:
                    logger.debug("{0} Healthy, no faults seen".format(self.LDR_R1_ENCL))
                    self.latest_faults = {}
                    return

                # Extract system faults
                self.latest_faults = system[self.FAULT_KEY]

                #If no in-memory fault cache built yet!
                if not self.memcache_faults:
                    # build from persistent cache if available
                    logger.info(
                        "No cached faults, building from persistent cache {0}"\
                        .format(self.faults_persistent_cache))

                    self.memcache_faults = store.get(
                                               self.faults_persistent_cache)

                    # still if none, build from latest faults & persist
                    if not self.memcache_faults:
                        logger.info("No persistent faults cache, building "
                            "cache from latest faults")

                        self.memcache_faults = self.latest_faults

                        # On SSPL boot, run through the existing faults since
                        # there is no cache yet to verify new faults against
                        self.existing_faults = True

                        #logger.debug("existing_faults {0}".\
                        #    format(self.existing_faults))

                        store.put(self.memcache_faults,
                            self.faults_persistent_cache)
                else:
                    # Reset flag as existing faults processed by now
                    # and cached faults are built already
                    self.existing_faults = False
            else:
                logger.error("poll system failed with err %d" % api_resp)
    def _generate_if_data(self):
        """Create & transmit a network interface data message as defined
            by the sensor response json schema"""

        event_field = ""

        # Notify the node sensor to update its data required for the if_data message
        successful = self._node_sensor.read_data("if_data", self._get_debug())
        if not successful:
            logger.error("NodeDataMsgHandler, _generate_if_data was NOT successful.")
        interfaces = self._node_sensor.if_data

        nw_alerts = self._get_nwalert(interfaces)

        # Get all cable connections state and generate alert on
        # cables identified for fault detected and resolved state
        nw_cable_alerts = self._nw_cable_alert_exists(interfaces)
        for nw_cable_resource_id, state in nw_cable_alerts.items():
            severity = self.severity_reader.map_severity(state)

            # Check if any nw interface fault is there because of cable pull
            if nw_alerts and nw_alerts[nw_cable_resource_id] == state:
                if state == self.FAULT:
                    self.INTERFACE_FAULT_DETECTED = True

                    # if yes, then mark the flag detection True for the respective interface
                    self.interface_fault_state[nw_cable_resource_id] = self.INTERFACE_FAULT_DETECTED
                    event_field = f'Network interface: {nw_cable_resource_id}' + ' ' \
                                   'is also down because of cable fault'
                else:
                    event_field = f'Network interface: {nw_cable_resource_id}' + ' ' \
                                   'is also up after cable insertion'

            # Send the cable alert
            self._send_ifdata_json_msg("nw", nw_cable_resource_id, self.NW_CABLE_RESOURCE_TYPE, state, severity, event_field)

        # Check for Nw interface fault
        for nw_resource_id, nw_state in nw_alerts.items():
            # Check if nw interface fault is resolved. If resolved, check whether its
            # resolved by cable insertion by checking the self.interface_fault_state
            # dictionary.
            if (self.interface_fault_state and nw_state == self.FAULT_RESOLVED and not \
               self.interface_fault_state.get(nw_resource_id)):

                # Delete the entry for that interface from the dictionary
                # specifically maintained to track interface faults caused by
                # cable faults. This is important because otherwise, if a fault
                # occurs for the same nw interface after the cable is
                # re-inserted, the fault_resolved alert for that interface
                # will never be seen.
                del self.interface_fault_state[nw_resource_id]
                continue

            elif self.interface_fault_state.get(nw_resource_id):
                # If yes, then don't repeat the alert.
                continue

            if nw_state == self.FAULT:
                event_field = f'Network interface {nw_resource_id} is down'
            else:
                event_field = f'Network interface {nw_resource_id} is up'

            # Otherwise, or for any other interface, send the alert
            severity = self.severity_reader.map_severity(nw_state)
            self._send_ifdata_json_msg("nw", nw_resource_id, self.NW_RESOURCE_TYPE, nw_state, severity, event_field)
    def get_systemd_service_info(self, service_name):
        """Get info of specified service using dbus API."""
        try:
            unit = Service()._bus.get_object(
                const.SYSTEMD_BUS,
                Service()._manager.LoadUnit(service_name))
            properties_iface = Interface(unit, dbus_interface=PROPERTIES_IFACE)
        except DBusException as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to initialize {service_name} due to {err}"))
            return None
        path_array = properties_iface.Get(const.SERVICE_IFACE, 'ExecStart')
        try:
            command_line_path = str(path_array[0][0])
        except IndexError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to find {service_name} path due to {err}"))
            command_line_path = "NA"

        is_installed = (command_line_path != "NA" or
                        'invalid' in properties_iface.Get(const.UNIT_IFACE, 'UnitFileState'))
        uid = str(properties_iface.Get(const.UNIT_IFACE, 'Id'))
        if not is_installed:
            health_status = "NA"
            health_description = f"Software enabling {uid} is not installed"
            recommendation = "NA"
            specifics = [{
                "service_name": uid,
                "description": "NA",
                "installed": str(is_installed).lower(),
                "pid": "NA",
                "state": "NA",
                "substate": "NA",
                "status": "NA",
                "license": "NA",
                "version": "NA",
                "command_line_path": "NA"
            }]
        else:
            service_license = "NA"
            version = "NA"
            service_description = str(
                properties_iface.Get(const.UNIT_IFACE, 'Description'))
            state = str(properties_iface.Get(const.UNIT_IFACE, 'ActiveState'))
            substate = str(properties_iface.Get(const.UNIT_IFACE, 'SubState'))
            service_status = 'enabled' if 'disabled' not in properties_iface.Get(
                const.UNIT_IFACE, 'UnitFileState') else 'disabled'
            pid = "NA" if state == "inactive" else str(
                properties_iface.Get(const.SERVICE_IFACE, 'ExecMainPID'))
            try:
                version = Service().get_service_info_from_rpm(uid, "VERSION")
            except ServiceError as err:
                logger.error(
                    self.log.svc_log(
                        f"Unable to get service version due to {err}"))
            try:
                service_license = Service().get_service_info_from_rpm(
                    uid, "LICENSE")
            except ServiceError as err:
                logger.error(
                    self.log.svc_log(
                        f"Unable to get service license due to {err}"))

            specifics = [{
                "service_name": uid,
                "description": service_description,
                "installed": str(is_installed).lower(),
                "pid": pid,
                "state": state,
                "substate": substate,
                "status": service_status,
                "license": service_license,
                "version": version,
                "command_line_path": command_line_path
            }]
            if service_status == 'enabled' and state == 'active' \
                    and substate == 'running':
                health_status = 'OK'
                health_description = f"{uid} is in good health"
                recommendation = "NA"
            else:
                health_status = state
                health_description = f"{uid} is not in good health"
                recommendation = const.DEFAULT_RECOMMENDATION

        service_info = self.get_health_template(uid, is_fru=False)
        self.set_health_data(service_info, health_status, health_description,
                             recommendation, specifics)
        return service_info
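
The systemd attributes consumed above (Id, Description, ActiveState, SubState, UnitFileState, ExecStart, ExecMainPID) are all regular D-Bus properties. A minimal sketch of reading them with dbus-python on a systemd host, using 'sshd.service' purely as an example unit and leaving out the module's Service()/const wrappers:

import dbus

bus = dbus.SystemBus()
systemd = bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1')
manager = dbus.Interface(systemd, 'org.freedesktop.systemd1.Manager')

unit_path = manager.LoadUnit('sshd.service')
unit = bus.get_object('org.freedesktop.systemd1', unit_path)
props = dbus.Interface(unit, dbus_interface='org.freedesktop.DBus.Properties')

print(props.Get('org.freedesktop.systemd1.Unit', 'ActiveState'))
print(props.Get('org.freedesktop.systemd1.Unit', 'SubState'))
print(props.Get('org.freedesktop.systemd1.Unit', 'UnitFileState'))
print(props.Get('org.freedesktop.systemd1.Service', 'ExecMainPID'))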
    def _generate_disk_space_alert(self):
        """Create & transmit a disk_space_alert message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the disk_space_data message
        successful = self._node_sensor.read_data("disk_space_alert", self._get_debug(), self._units)
        if not successful:
            logger.error("NodeDataMsgHandler, _generate_disk_space_alert was NOT successful.")
            return

        # Coerce disk_usage_threshold to match the value type entered in the config file
        self._disk_usage_threshold = str(self._disk_usage_threshold)
        try:
            if self._disk_usage_threshold.isdigit():
                self._disk_usage_threshold = int(self._disk_usage_threshold)
            else:
                self._disk_usage_threshold = float(self._disk_usage_threshold)
        except ValueError:
            logger.warning("Disk Space Alert, Invalid disk_usage_threshold value are entered in config.")
            # Assigning default value to _disk_usage_threshold
            self._disk_usage_threshold = self.DEFAULT_DISK_USAGE_THRESHOLD

        if self._node_sensor.disk_used_percentage >= self._disk_usage_threshold:
            if not self.disk_fault:
                self.disk_fault = True
                # Create the disk space data message and hand it over to the egress processor to transmit
                fault_event = "Disk usage increased to %s, beyond configured threshold of %s" \
                                %(self._node_sensor.disk_used_percentage, self._disk_usage_threshold)
                logger.warning(fault_event)
                diskSpaceAlertMsg = DiskSpaceAlertMsg(self._node_sensor.host_id,
                                        self._epoch_time,
                                        self._node_sensor.total_space,
                                        self._node_sensor.free_space,
                                        self._node_sensor.disk_used_percentage,
                                        self._units,
                                        self.site_id, self.rack_id,
                                        self.node_id, self.cluster_id, self.FAULT,fault_event)

                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    diskSpaceAlertMsg.set_uuid(self._uuid)
                jsonMsg = diskSpaceAlertMsg.getJson()
                self.disk_sensor_data = jsonMsg
                self.os_sensor_type["disk_space"] = self.disk_sensor_data

                # Transmit it out over rabbitMQ channel
                self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

        if (self._node_sensor.disk_used_percentage <= self._disk_usage_threshold) and self.disk_fault:
            # Create the disk space data message and hand it over to the egress processor to transmit
            fault_resolved_event = "Disk usage decreased to %s, below configured threshold of %s" \
                                %(self._node_sensor.disk_used_percentage, self._disk_usage_threshold)
            logger.warning(fault_resolved_event)
            diskSpaceAlertMsg = DiskSpaceAlertMsg(self._node_sensor.host_id,
                                    self._epoch_time,
                                    self._node_sensor.total_space,
                                    self._node_sensor.free_space,
                                    self._node_sensor.disk_used_percentage,
                                    self._units,
                                    self.site_id,
                                    self.rack_id,
                                    self.node_id,
                                    self.cluster_id,
                                    self.FAULT_RESOLVED,
                                    fault_resolved_event
                                    )

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                diskSpaceAlertMsg.set_uuid(self._uuid)
            jsonMsg = diskSpaceAlertMsg.getJson()
            self.disk_sensor_data = jsonMsg
            self.os_sensor_type["disk_space"] = self.disk_sensor_data

            # Transmit it out over rabbitMQ channel
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
            self.disk_fault = False
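
The threshold handling above first stringifies the configured value, then re-parses it as int or float and falls back to a default on bad input. The same idea as a tiny standalone helper (parse_threshold and the default of 80 are illustrative, not from the source):

DEFAULT_DISK_USAGE_THRESHOLD = 80

def parse_threshold(raw, default=DEFAULT_DISK_USAGE_THRESHOLD):
    """Coerce a config value to int or float, falling back to a default."""
    raw = str(raw)
    try:
        return int(raw) if raw.isdigit() else float(raw)
    except ValueError:
        return default

print(parse_threshold("80"))      # 80   (int)
print(parse_threshold("80.5"))    # 80.5 (float)
print(parse_threshold("eighty"))  # 80   (falls back to the default)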
    def _transmit_msg_on_exchange(self):
        """Transmit json message onto RabbitMQ exchange"""
        self._log_debug("_transmit_msg_on_exchange, jsonMsg: %s" % self._jsonMsg)

        try:
            # Check for shut down message from sspl_ll_d and set a flag to shutdown
            #  once our message queue is empty
            if self._jsonMsg.get("message").get("actuator_response_type") is not None and \
                self._jsonMsg.get("message").get("actuator_response_type").get("thread_controller") is not None and \
                self._jsonMsg.get("message").get("actuator_response_type").get("thread_controller").get("thread_response") == \
                    "SSPL-LL is shutting down":
                    logger.info("RabbitMQegressProcessor, _transmit_msg_on_exchange, received" \
                                    "global shutdown message from sspl_ll_d")
                    self._request_shutdown = True

            msg_props = pika.BasicProperties()
            msg_props.content_type = "text/plain"

            # Publish json message to the correct channel
            # NOTE: We need to route ThreadController messages to ACK channel.
            # We can't modify schema as it will affect other modules too. As a
            # temporary solution we have added an extra check to see if actuator_response_type
            # is "thread_controller".
            # TODO: Find a proper way to solve this issue. Avoid changing
            # core egress processor code
            if self._jsonMsg.get("message").get("actuator_response_type") is not None and \
              (self._jsonMsg.get("message").get("actuator_response_type").get("ack") is not None or \
                self._jsonMsg.get("message").get("actuator_response_type").get("thread_controller") is not None):
                self._add_signature()
                jsonMsg = json.dumps(self._jsonMsg).encode('utf8')
                self._ack_connection.publish(exchange=self._exchange_name,
                                             routing_key=self._ack_routing_key,
                                             properties=msg_props,
                                             body=jsonMsg)

            # Routing requests for IEM msgs sent from the LoggingMsgHandler
            elif self._jsonMsg.get("message").get("IEM_routing") is not None:
                log_msg = self._jsonMsg.get("message").get("IEM_routing").get("log_msg")
                self._log_debug("Routing IEM: %s" % log_msg)
                if self._iem_route_addr != "":
                    self._iem_connection.publish(exchange=self._iem_route_exchange_name,
                                                 routing_key=self._routing_key,
                                                 properties=msg_props,
                                                 body=str(log_msg))
                else:
                    logger.warn("RabbitMQegressProcessor, Attempted to route IEM without a valid 'iem_route_addr' set.")
            else:
                self._add_signature()
                jsonMsg = json.dumps(self._jsonMsg).encode('utf8')
                try:
                    self._connection.publish(exchange=self._exchange_name,
                                            routing_key=self._routing_key,
                                            properties=msg_props,
                                            body=jsonMsg)
                except connection_exceptions:
                    logger.error("RabbitMQegressProcessor, _transmit_msg_on_exchange, rabbitmq connectivity lost, adding message to consul %s" % self._jsonMsg)
                    store_queue.put(jsonMsg)

            # No exceptions thrown so success
            self._log_debug("_transmit_msg_on_exchange, Successfully Sent: %s" % self._jsonMsg)
            # If event is added by sensors, set it
            if self._event:
                self._event.set()

        except Exception as ex:
            logger.error("RabbitMQegressProcessor, _transmit_msg_on_exchange: %r" % ex)
Exemple #20
0
    def _process_msg(self, ch, method, properties, body):
        """Parses the incoming message and hands off to the appropriate module"""

        ingressMsg = {}
        uuid = None
        try:
            if not isinstance(body, dict):
                ingressMsg = json.loads(body)
            else:
                ingressMsg = body

            # Authenticate message using username and signature fields
            username = ingressMsg.get("username")
            signature = ingressMsg.get("signature")
            message = ingressMsg.get("message")
            uuid = ingressMsg.get("uuid")
            msg_len = len(message) + 1

            if uuid is None:
                uuid = "N/A"

            if use_security_lib and \
               SSPL_SEC.sspl_verify_message(msg_len, str(message), username, signature) != 0:
                logger.warn("RabbitMQingressProcessor, Authentication failed on message: %s" % ingressMsg)
                return

            # Get the incoming message type
            if message.get("actuator_request_type") is not None:
                msgType = message.get("actuator_request_type")

                # Validate against the actuator schema
                validate(ingressMsg, self._actuator_schema)

            elif message.get("sensor_request_type") is not None:
                msgType = message.get("sensor_request_type")

                # Validate against the sensor schema
                validate(ingressMsg, self._sensor_schema)

            else:
                # We only handle incoming actuator and sensor requests, ignore
                # everything else.
                return

            # Check for debugging being activated in the message header
            self._check_debug(message)
            self._log_debug("_process_msg, ingressMsg: %s" % ingressMsg)

            # Hand off to appropriate actuator message handler
            if msgType.get("logging") is not None:
                self._write_internal_msgQ("LoggingMsgHandler", message)

            elif msgType.get("thread_controller") is not None:
                self._write_internal_msgQ("ThreadController", message)

            elif msgType.get("service_controller") is not None:
                self._write_internal_msgQ("ServiceMsgHandler", message)

            elif msgType.get("node_controller") is not None:
                self._write_internal_msgQ("NodeControllerMsgHandler", message)

            elif msgType.get("storage_enclosure") is not None:
                self._write_internal_msgQ("RealStorActuatorMsgHandler", message)

            # Hand off to appropriate sensor message handler
            elif msgType.get("node_data") is not None:
                self._write_internal_msgQ("NodeDataMsgHandler", message)

            elif msgType.get("enclosure_alert") is not None:
                self._write_internal_msgQ("RealStorEnclMsgHandler", message)

            elif msgType.get("storage_enclosure") is not None:
                self._write_internal_msgQ("RealStorActuatorMsgHandler", message)
            # ... handle other incoming messages that have been validated
            else:
                # Send ack about not finding a msg handler
                ack_msg = AckResponseMsg("Error Processing Message", "Message Handler Not Found", uuid).getJson()
                self._write_internal_msgQ(RabbitMQegressProcessor.name(), ack_msg)

            # Acknowledge message was received
            self._connection.ack(ch, delivery_tag=method.delivery_tag)

        except Exception as ex:
            logger.error("RabbitMQingressProcessor, _process_msg unrecognized message: %r" % ingressMsg)
            ack_msg = AckResponseMsg("Error Processing Msg", "Msg Handler Not Found", uuid).getJson()
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), ack_msg)
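
The validate() calls in _process_msg come from the jsonschema package: an incoming message is checked against the actuator or sensor schema before it is routed. A small sketch with a hypothetical, trimmed-down schema (the real schemas used by the processor are far stricter):

import json
from jsonschema import validate, ValidationError

actuator_schema = {
    "type": "object",
    "required": ["message"],
    "properties": {
        "message": {
            "type": "object",
            "required": ["actuator_request_type"],
        }
    },
}

body = json.dumps({"message": {"actuator_request_type": {"thread_controller": {}}}})
ingress_msg = json.loads(body)

try:
    validate(ingress_msg, actuator_schema)
    print("message accepted, routing to ThreadController")
except ValidationError as err:
    print("rejected: %s" % err.message)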