def create_cluster(self, name: str, user: str, secret: str, nodeid: str) -> dict:
    """
    Create cluster if not created.

    Args:
        name (str): Cluster name.
        user (str): Cluster user.
        secret (str): Cluster password.
        nodeid (str): Node name, nodeid of current node.

    Returns:
        dict: Return dictionary. {"status": "", "output": "", "error": ""}
    """
    try:
        self._check_non_empty(name=name, user=user, secret=secret, nodeid=nodeid)
        if not self._is_pcs_cluster_running():
            self._auth_node(nodeid, user, secret)
            self._execute.run_cmd(
                const.PCS_SETUP_CLUSTER.replace("<cluster_name>", name)
                .replace("<node>", nodeid))
            Log.info("Pacemaker cluster created, waiting to start node.")
            self._execute.run_cmd(const.PCS_CLUSTER_START_NODE)
            self._execute.run_cmd(const.PCS_CLUSTER_ENABLE)
            Log.info("Node started and enabled successfully.")
            time.sleep(const.BASE_WAIT_TIME * 2)
        if self._is_pcs_cluster_running():
            if self.wait_for_node_online(nodeid):
                # TODO: Divide class into vm, hw when stonith is needed.
                self._execute.run_cmd(const.PCS_STONITH_DISABLE)
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": "Cluster created successfully.",
                    "error": ""
                }
            else:
                raise ClusterManagerError("Node is not online.")
        else:
            raise ClusterManagerError("Cluster is not started.")
    except Exception as e:
        raise ClusterManagerError(f"Failed to create cluster. Error: {e}")
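# A minimal usage sketch for the method above. The manager class name,
# credentials, and node id below are illustrative assumptions, not from the
# source:
#
#   cluster_manager = PcsClusterManager()          # hypothetical concrete class
#   result = cluster_manager.create_cluster(
#       name="my_cluster", user="hacluster",
#       secret="secret", nodeid="node-1")
#   if result["status"] == const.STATUSES.SUCCEEDED.value:
#       Log.info(result["output"])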
def get_status(self, component: CLUSTER_ELEMENTS = CLUSTER_ELEMENTS.CLUSTER.value,
               depth: int = 1, version: str = SYSTEM_HEALTH_OUTPUT_V2, **kwargs):
    """
    Return health status for the requested components.

    Args:
        component (CLUSTER_ELEMENTS): The component whose health status is to be returned.
        depth (int): The depth of elements, starting from the input "component",
            for which the health status is to be returned.
        version (str): The health status JSON output version.
        **kwargs (dict): Variable number of arguments that are used as filters,
            e.g. "id" of the input "component".

    Returns:
        (dict): Returns dictionary. {"status": "Succeeded"/"Failed"/"Partial",
            "output": "", "error": ""}
        status: Succeeded, Failed, Partial
        output: Dictionary with element health status
        error: Error information if the request "Failed"
    """
    try:
        Log.debug(f"Get {component} health status version {version} with depth {depth} and filters {kwargs}")
        component_id = None
        if GET_SYS_HEALTH_ARGS.ID.value in kwargs and kwargs[GET_SYS_HEALTH_ARGS.ID.value] != "":
            component_id = kwargs[GET_SYS_HEALTH_ARGS.ID.value]
        # Get the requested component level in the health hierarchy.
        component_level = HealthHierarchy.get_component_level(component)
        # Set the depth to be returned; check for partial status.
        self._partial_status = False
        total_depth = HealthHierarchy.get_total_depth()
        if depth == 0:
            depth = total_depth
        else:
            # Decrement by 1 for the component level itself.
            depth += component_level - 1
            if depth > total_depth:
                depth = total_depth
                self._partial_status = True
        Log.debug(f"{component} level {component_level}, depth to return {depth}, total available depth {total_depth}")
        self._id_not_found = False
        # Get raw status starting from the cluster.
        self._status_dict = self.get_status_raw(CLUSTER_ELEMENTS.CLUSTER.value)
        # Prepare and return the output.
        output = StatusOutput(version)
        self._prepare_status(component, component_id=component_id,
                             start_level=component_level,
                             current_level=component_level,
                             depth=depth, parent=output)
        status = const.STATUSES.SUCCEEDED.value
        if self._partial_status:
            status = const.STATUSES.PARTIAL.value
        if self._id_not_found:
            output_json = json.dumps({"status": const.STATUSES.FAILED.value,
                                      "output": "", "error": "Invalid id"})
        else:
            output_json = json.dumps({"status": status,
                                      "output": json.loads(output.to_json()),
                                      "error": ""})
        Log.debug(f"Output json {output_json}")
        return output_json
    except Exception as e:
        Log.error(f"Failed reading status. Error: {e}")
        raise HaSystemHealthException("Failed reading status")
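# A worked example of the depth arithmetic above. The level values are
# illustrative assumptions; real levels come from HealthHierarchy:
#
#   total_depth = 4, component "node" at component_level = 2, depth = 1
#   -> depth = 1 + 2 - 1 = 2, so only the node level itself is returned.
#
#   Same component with depth = 5 -> 5 + 2 - 1 = 6 > total_depth, so depth
#   is clamped to 4 and the response status becomes "Partial".
#
#   depth = 0 always expands to total_depth, i.e. the full subtree.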
async def _store_action(self, alert, action):
    """
    Further parse the alert to store information such as:
    component: actual HW component which has been affected
    component_id: FRU ID
    entity: enclosure/node
    entity_id: resource id
    """
    try:
        sensor_response = alert.get(const.MESSAGE).get(const.SENSOR_RES_TYPE)
        info_dict = await self._set_db_key_info(sensor_response)
        if info_dict:
            await self._decision_db.store_event(
                info_dict[const.ENTITY], info_dict[const.ENTITY_ID],
                info_dict[const.COMPONENT], info_dict[const.COMPONENT_ID],
                info_dict[const.EVENT_TIME], action)
    except Exception as e:
        Log.error(f"Error occurred while storing action. {e}")
def process(self, args: list):
    """Process the command."""
    self.parse(args)
    command_executor = CmdFactory.get_executor(self.module_name, self.operation_name)
    exec_class = self.get_class(command_executor)
    # Instantiate the appropriate executor class.
    executor_class = exec_class()
    # Raises an exception if the user does not have proper permissions.
    executor_class.validate_permissions()
    if executor_class.validate():
        try:
            # Call the execute function of the executor.
            executor_class.execute()
        except Exception as err:
            Log.error(f"{traceback.format_exc()}, {err}")
            raise HACommandTerminated(f"CLI execution failed, Error: {err}")
def map_event(self, event_type: str) -> str:
    """Return the status by mapping it against the event type."""
    try:
        status = self.EVENT_TO_STATUS_MAPPING[event_type]
        return status
    except KeyError as e:
        Log.error(f"StatusMapper, map_event, No equivalent event type found: {e}")
        raise HaStatusMapperException(
            f"StatusMapper, map_event, No equivalent event type found: {e}")
    except Exception as e:
        Log.error(f"StatusMapper, map_event, Exception occurred while mapping event_type to status: {e}")
        raise HaStatusMapperException(
            f"StatusMapper, map_event, Exception while mapping event: {e}")
def process(self):
    """Process test command."""
    path_to_comp_config = const.SOURCE_CONFIG_PATH
    install_type = self.get_installation_type()
    nodes = self.get_nodelist(fetch_from=Cmd.HA_CONFSTORE)
    path_to_comp_config = path_to_comp_config + '/components/' + install_type
    rc = TestExecutor.validate_cluster(node_list=nodes,
                                       comp_files_dir=path_to_comp_config)
    Log.info(f"cluster validation rc = {rc}")
    if not rc:
        raise HaConfigException(
            "Cluster is not healthy. Check HA logs for further information.")
def _update_env(self, node_name: str, node_type: str, cluster_type: str) -> None:
    """Update env like VM, HW."""
    Log.info(f"Detected {node_type} env and cluster_type {cluster_type}.")
    if "VM" == node_type.upper():
        Conf.set(const.HA_GLOBAL_INDEX, "CLUSTER_MANAGER.env", node_type.upper())
    else:
        # TODO: check if any env available other than vm, hw
        Conf.set(const.HA_GLOBAL_INDEX, "CLUSTER_MANAGER.env", "HW")
    Conf.set(const.HA_GLOBAL_INDEX, "CLUSTER_MANAGER.cluster_type", cluster_type)
    Conf.set(const.HA_GLOBAL_INDEX, "CLUSTER_MANAGER.local_node", node_name)
    Log.info("CONFIG: Update ha configuration files")
    Conf.save(const.HA_GLOBAL_INDEX)
def __init__(self, producer, k_object, **kwargs):
    """
    Init method.

    Initialization of member objects and the Thread superclass.
    """
    super().__init__()
    self._publish_alert = True
    self._object = k_object
    self.name = f"Monitor-{k_object}-Thread"
    self._args = kwargs
    self._starting_up = True
    self._object_state = {}
    self._sigterm_received = threading.Event()
    self._stop_event_processing = False
    self._producer = producer
    self._confstore = ConfigManager.get_confstore()
    self._published_alerts = {}
    Log.info(f"Initialization done for {self._object} monitor")
def delete_resources() -> None:
    """
    Delete pacemaker resources.

    Exceptions: UpgradeError
    """
    try:
        root = _get_cib_xml()
        resources = [e.attrib["id"] for e in root.findall(".//lrm_resource")
                     if "id" in e.attrib]
        Log.info(f"Going to delete the following resources: {resources}")
        for r in resources:
            Log.info(f"Deleting {r}")
            SimpleCommand().run_cmd(f"pcs resource delete {r}")
    except Exception as err:
        raise UpgradeError("Resource deletion failed") from err
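# A self-contained sketch of the id extraction above, run against a
# hypothetical CIB fragment (the XML and resource names below are
# illustrative, not a real CIB):
import xml.etree.ElementTree as ET

cib = ET.fromstring(
    "<cib><status><lrm_resources>"
    "<lrm_resource id='motr-ios-1'/><lrm_resource id='haproxy'/>"
    "</lrm_resources></status></cib>")
# Same list comprehension as delete_resources(): collect every lrm_resource id.
ids = [e.attrib["id"] for e in cib.findall(".//lrm_resource") if "id" in e.attrib]
print(ids)  # ['motr-ios-1', 'haproxy']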
def cluster_exists(self) -> dict:
    """
    Check if cluster exists.

    Returns:
        dict: Return dictionary. {"status": "", "output": "", "error": ""}
        status: Succeeded
    """
    try:
        result: bool = bool(self._is_pcs_cluster_running())
    except Exception as e:
        Log.error(f"Cluster status check failed. Error: {e}")
        result = False
    return {
        "status": const.STATUSES.SUCCEEDED.value,
        "output": result,
        "error": ""
    }
def _keys_validate(self, phase_name: str):
    """Validate keys of each phase against argument file."""
    phase_name = phase_name.upper()
    try:
        phase_list = self._get_list_of_phases_to_validate(phase_name)
        yardstick_list = []
        yardstick_list_exp = []
        for phase in phase_list:
            phase_key_list = self._get_keys_for_phase(phase)
            yardstick_list.extend(phase_key_list)
        for key in yardstick_list:
            new_key = self._expand_keys(key, phase_name)
            yardstick_list_exp.append(new_key)
        for key in yardstick_list_exp:
            self._key_value_verify(key, phase_name)
        Log.debug("%s - keys validation complete" % phase_name.lower())
    except Exception:
        raise Exception("ERROR : Validating keys failed")
def reset(self):
    """Perform reset. Raises exception on error."""
    # Check service status.
    service_obj = Service('elasticsearch.service')
    service_state = service_obj.get_state()
    if service_state._state == 'active':
        Log.warn("Elasticsearch service in active state.\n"
                 "Stopping Elasticsearch service now...")
        service_obj.stop()
    # Clear log files.
    Elasticsearch.truncate_log_files(self.log_path)
    Log.info("Reset done.")
    return 0
def _is_status_present(self, component, component_id: str = None) -> str:
    """Return the status key for the component (and optional id), if present."""
    status_key = None
    if component_id is not None:
        for key in self._status_dict:
            if re.search(f"{component}/{component_id}/health", key):
                status_key = key
                break
    else:
        for key in self._status_dict:
            if re.search(f"{component}/.+/health", key):
                split_key = re.split("/", key)
                if component == split_key[-3]:
                    status_key = key
                    break
    Log.debug(f"Status key {status_key} present for component {component}, id {component_id}")
    return status_key
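# Illustrative key shapes the lookup above matches. The values are
# hypothetical; real keys come from get_status_raw and the health schema:
#
#   with component_id:  ".../node/abcd1234/health"  matches r"node/abcd1234/health"
#   without an id:      ".../node/<any-id>/health"  matches r"node/.+/health",
#       then the key is split on "/" and split_key[-3] must equal "node",
#       so a key like ".../disk/node0/health" is correctly rejected even
#       though the regex alone would match it.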
def get_resource_status(self, resource: AnyStr):
    """
    Get the status for a resource.

    :param resource: Name of resource
    :type: str
    :return: Action for the resource
    """
    Log.debug(f"Received Status Request for resource {resource}")
    resource_key = self._resource_file.get("resources", {}).get(resource, {})
    try:
        resource_data = self._loop.run_until_complete(
            self._consul_call.get(**resource_key))
    except Exception as e:
        # Return OK if fetching the resource status failed.
        Log.error(f"{traceback.format_exc()} {e}")
        return Action.OK
    if resource_data:
        return resource_data[0].action
    return Action.OK
def __init__(self):
    """Init method."""
    super(IEMFilter, self).__init__()
    IEMFilter.validate_filter(const.AlertEventConstants.IEM_FILTER_TYPE.value)
    # Get filter type and resource types list from the IEM rule file.
    self.filter_type = Conf.get(
        const.ALERT_FILTER_INDEX, const.AlertEventConstants.IEM_FILTER_TYPE.value)
    self.components_list = Conf.get(
        const.ALERT_FILTER_INDEX, const.AlertEventConstants.IEM_COMPONENTS.value)
    self.modules_dict = Conf.get(
        const.ALERT_FILTER_INDEX, const.AlertEventConstants.IEM_MODULES.value)
    Log.info("IEM Filter is initialized ...")
def _delete_message_type(self, component: str) -> None:
    """
    Delete the message type created earlier.

    Args:
        component (str): Component name.
    """
    message_type = EVENT_MANAGER_KEYS.MESSAGE_TYPE_VALUE.value.replace(
        "<component_id>", component)
    MessageBus.deregister(message_type)
    # Remove the message type key from confstore.
    message_type_key = EVENT_MANAGER_KEYS.MESSAGE_TYPE_KEY.value.replace(
        "<component_id>", component)
    if self._confstore.key_exists(message_type_key):
        self._confstore.delete(message_type_key)
    Log.info(f"Unsubscribed component {component} from message_type {message_type}")
def _get_metadata(self, admin: object):
    """Get the metadata information of message type."""
    try:
        message_type_metadata = admin.list_topics(
            timeout=self._admin_api_timeout).__dict__
        return message_type_metadata['topics']
    except Exception as e:
        # KafkaException and any other failure were handled identically,
        # so a single handler suffices.
        Log.error(f"MessageBusError: {errors.ERR_OP_FAILED}. "
                  f"list_topics() failed. {e} Check if Kafka service is "
                  f"running successfully")
        raise MessageBusError(errors.ERR_OP_FAILED, "list_topics() "
                              "failed. %s. Check if Kafka service is running successfully", e)
def _create_dir_and_set_kafka_ownership(directory: str):
    """
    Create the directory if it does not exist and change ownership to kafka:kafka.

    Args:
        directory (str): Directory to be created.

    Returns:
        int: Returns 0 on success.
    """
    try:
        os.makedirs(directory, exist_ok=True)
        os.system(f"chown -R kafka:kafka {directory}")
    except OSError as e:
        Log.error(f"Failed to create directory & change ownership: {e}")
        raise KafkaSetupError(rc=e.errno, message=e)
    return 0
def _get_uncompressed_size(size_limit: float):
    """
    Calculate the uncompressed size, assuming the tz utility compression
    ratio to be 80%.

    Using the formula:
        data_compression_ratio = 1 - (compressed_size / uncompressed_size)
        compressed_size / uncompressed_size = 1 - (4/5) = 1/5
        uncompressed_size = compressed_size * 5
    So the compressed size limit is multiplied by 5 to get the uncompressed
    size limit. For example, if the compressed size limit given is 200MB,
    uncompressed size limit = 200MB * 5 = 1000MB.
    """
    uncompressed_size = ''
    try:
        uncompressed_size = int(size_limit * 5)
    except ValueError as e:
        Log.error(f"Failed to get uncompressed size_limit. ERROR:{str(e)}")
    return uncompressed_size
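# Worked examples of the 5x rule above (input values are hypothetical):
#
#   _get_uncompressed_size(200)   -> 1000   # 200MB compressed -> 1000MB raw
#   _get_uncompressed_size(12.5)  -> 62     # int() truncates 62.5 to 62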
def evaluate_status(self, health_event: HealthEvent) -> HealthEvent:
    """
    Evaluate health event of children and return its health event.

    Args:
        health_event (HealthEvent): Health event of children.

    Returns:
        HealthEvent: Health event of current element.
    """
    cluster_id = health_event.cluster_id
    status = self.get_cluster_status(cluster_id)
    Log.info(f"Evaluated cluster {cluster_id} status as {status}")
    return self._get_new_event(
        event_type=status,
        resource_type=CLUSTER_ELEMENTS.CLUSTER.value,
        resource_id=cluster_id,
        subelement_event=health_event)
def get_fid(service: str, node_id: str, instance_id: int) -> str:
    """
    Get fid from the Hare mapping file for the given service.

    Args:
        service (str): Service name.
        node_id (str): Node name for fid instance.
        instance_id (int): Instance id for service.

    Returns:
        str: Return fid of given service.
    """
    try:
        return Hare.get_fid(service, node_id, instance_id)
    except Exception as e:
        Log.error(f"Failed to get fid for ({service}, {node_id}, "
                  f"{instance_id}). Error: {e}")
        return None
def filter_event(self, msg: str) -> bool:
    """
    Filter event.

    Args:
        msg (str): Msg
    """
    try:
        iem_required = False
        message = json.loads(msg).get(ALERT_ATTRIBUTES.MESSAGE)
        actuator_response_type = message.get(ALERT_ATTRIBUTES.ACTUATOR_RESPONSE_TYPE)
        if actuator_response_type is not None:
            return iem_required
        msg_type = IEMFilter.get_msg_type(message)
        if msg_type.lower() != MESSAGETYPE.IEM.value.lower():
            return iem_required
        sensor_response_type = message.get(ALERT_ATTRIBUTES.SENSOR_RESPONSE_TYPE)
        component_type = sensor_response_type[
            ALERT_ATTRIBUTES.SPECIFIC_INFO][ALERT_ATTRIBUTES.COMPONENT]
        module_type = sensor_response_type[
            ALERT_ATTRIBUTES.SPECIFIC_INFO][ALERT_ATTRIBUTES.MODULE]
        if self.filter_type == const.INCLUSION:
            if component_type in self.components_list and \
                    module_type in self.modules_dict.get(component_type):
                iem_required = True
        else:
            # EXCLUSION rules
            if component_type not in self.components_list or \
                    module_type not in self.modules_dict.get(component_type):
                iem_required = True
        event_id = message.get(ALERT_ATTRIBUTES.SENSOR_RESPONSE_TYPE).get(
            ALERT_ATTRIBUTES.ALERT_ID)
        Log.info(f"Successfully filtered event {event_id} ...")
        return iem_required
    except Exception as e:
        raise EventFilterException(
            f"Failed to filter IEM event. Message: {msg}, Error: {e}")
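# Illustration of the inclusion/exclusion semantics above. The rule values
# are hypothetical, not from the actual IEM rule file:
#
#   components_list = ["motr"], modules_dict = {"motr": ["ios"]}
#
#   INCLUSION: an IEM with component "motr" and module "ios" is forwarded
#              (iem_required = True); anything else is dropped.
#   EXCLUSION: an IEM with component "motr" and module "ios" is dropped;
#              anything else (e.g. component "s3") is forwarded.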
def _initalize_watcher(self, system_health: SystemHealth) -> dict:
    """
    Initialize watcher.

    Args:
        system_health (SystemHealth): Instance of systemhealth.

    Returns:
        dict: Mapping of watcher_type -> watcher_instance.
    """
    watchers = Conf.get(const.HA_GLOBAL_INDEX, f"EVENT_ANALYZER{_DELIM}watcher")
    watcher_list: dict = {}
    for watcher in watchers:
        Log.info(f"Initializing watcher {watcher}....")
        event_filter_class = Conf.get(
            const.HA_GLOBAL_INDEX,
            f"EVENT_ANALYZER{_DELIM}watcher{_DELIM}{watcher}{_DELIM}event_filter")
        event_filter_instance = EventAnalyzerService.get_class(event_filter_class)()
        event_parser_class = Conf.get(
            const.HA_GLOBAL_INDEX,
            f"EVENT_ANALYZER{_DELIM}watcher{_DELIM}{watcher}{_DELIM}event_parser")
        event_parser_instance = EventAnalyzerService.get_class(event_parser_class)()
        watcher_list[watcher] = Watcher(
            consumer_id=Conf.get(
                const.HA_GLOBAL_INDEX,
                f"EVENT_ANALYZER{_DELIM}watcher{_DELIM}{watcher}{_DELIM}consumer_id"),
            message_type=Conf.get(
                const.HA_GLOBAL_INDEX,
                f"EVENT_ANALYZER{_DELIM}watcher{_DELIM}{watcher}{_DELIM}message_type"),
            consumer_group=Conf.get(
                const.HA_GLOBAL_INDEX,
                f"EVENT_ANALYZER{_DELIM}watcher{_DELIM}{watcher}{_DELIM}consumer_group"),
            event_filter=event_filter_instance,
            event_parser=event_parser_instance,
            subscriber=system_health)
    return watcher_list
class CleanupCmd(SetupCmd):
    """Cleanup cmd initialization."""

    Log.init('OpenldapProvisioning', '/var/log/cortx/utils/openldap',
             level='DEBUG')

    def __init__(self, config: str):
        """Constructor."""
        try:
            super(CleanupCmd, self).__init__(config)
        except Exception as e:
            Log.debug("Initializing cleanup phase failed")
            raise OpenldapPROVError(f'exception: {e}')

    def process(self):
        """Main processing function."""
        try:
            self.delete_replication_config()
            self.delete_log_files()
            BaseConfig.cleanup(True)
            os.system('systemctl restart slapd')
        except Exception as e:
            raise OpenldapPROVError(f'exception: {e}\n')

    def _delete_file(self, filepath: str):
        """Truncate the file to zero length if it exists."""
        if os.path.exists(filepath):
            try:
                with open(filepath, "w") as file_shrink:
                    file_shrink.truncate()
            except Exception:
                Log.debug("Failed truncating log file : %s" % filepath)

    def delete_log_files(self):
        """Empty known log files."""
        Log.debug("Starting log file deletion")
        log_files = [
            "/var/log/cortx/utils/openldap/OpenldapProvisioning.log",
            "/var/log/slapd.log"
        ]
        for log_file in log_files:
            self._delete_file(log_file)
        Log.debug("Cleanup completed, log files emptied")

    def delete_replication_config(self):
        """Cleanup replication related config."""
        Log.debug("Starting replication cleanup")
        conn = ldap.initialize("ldapi://")
        conn.sasl_non_interactive_bind_s('EXTERNAL')
        dn = "cn=config"
        Replication.deleteattribute(conn, dn, "olcServerID")
        dn = "olcDatabase={2}mdb,cn=config"
        Replication.deleteattribute(conn, dn, "olcSyncrepl")
        Replication.deleteattribute(conn, dn, "olcMirrorMode")
def setUp(self):
    Log.init(service_name='resource_agent', log_path=const.RA_LOG_DIR, level="DEBUG")
    self.decision_monitor = MagicMock()
    self.filename = 'node_iem_motr'
    self.path = 'io'
    self.decision_monitor.get_resource_status.side_effect = \
        self._side_effect_group_status
    self.schema = {
        "nodes": {
            "27534128-7ecd-4606-bf42-ebc9765095ba": "cortxnode1.example.com",
            "f3c7d479-2249-40f4-9276-91ba59f50034": "cortxnode2.example.com",
            "local": "cortxnode1.example.com"
        }
    }
    self.status = None
    self.iem_agent = IEMResourceAgent(self.decision_monitor, self.schema)
def cleanup(self, *args, **kwargs):
    """Perform configuration cleanup. Raises exception on error."""
    Log.info("starting cleanup phase")
    server_properties_file = '/opt/kafka/config/server.properties'
    default_server_properties = {
        'broker.id': 0,
        'log.dirs': '/tmp/kafka-logs',
        'log.retention.check.interval.ms': 300000,
        'zookeeper.connect': 'localhost:2181',
        'transaction.state.log.min.isr': 1,
        'offsets.topic.replication.factor': 1,
        'transaction.state.log.replication.factor': 1,
    }
    delete_server_properties = [
        'listeners', 'log.delete.delay.ms', 'default.replication.factor',
        'log.flush.offset.checkpoint.interval.ms'
    ]
    Kafka._update_properties_file(server_properties_file,
                                  default_server_properties)
    Kafka._delete_properties_from_file(server_properties_file,
                                       delete_server_properties)

    zookeeper_properties_file = '/opt/kafka/config/zookeeper.properties'
    default_zookeeper_properties = {
        'clientPort': 2181,
        'dataDir': '/tmp/zookeeper'
    }
    delete_zookeeper_properties = [
        'dataLogDir', 'tickTime', 'initLimit', 'syncLimit',
        'autopurge.snapRetainCount', 'server.',
        'autopurge.purgeInterval', '4lw.commands.whitelist'
    ]
    Kafka._update_properties_file(zookeeper_properties_file,
                                  default_zookeeper_properties)
    Kafka._delete_properties_from_file(zookeeper_properties_file,
                                       delete_zookeeper_properties)
    Log.info("cleanup phase completed successfully")
    return 0
def evaluate(self, event: HealthEvent) -> list:
    """
    Check if a rule exists for the received HealthEvent.
    If yes, return the actions for that rule.

    Args:
        event (HealthEvent): Health event.

    Returns:
        list: Actions configured for the rule.
    """
    val = []
    key = self._prepare_key(event.resource_type, event.event_type)
    Log.debug(f"Evaluating rule for {key}")
    kv = self._get_val(key)
    if kv:
        _, val = self._get_k_v(kv)
    Log.info(f"Evaluated action {val} for key {key}")
    return val
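# A hypothetical rule layout the evaluation above reads. The key format is an
# assumption inferred from _prepare_key's arguments; actual keys live in
# confstore:
#
#   key:   "monitor/rule/node/failed"     # resource_type=node, event_type=failed
#   value: '["publish", "failover"]'      # actions returned as a list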
def unsubscribe(self, component: SUBSCRIPTION_LIST, events: list,
                action: str = None) -> None:
    """
    Deregister the event for the specific component and the component
    for the event using consul deletion.

    Args:
        component (str): Component name.
        events (list): Event list.
        action (str, optional): Action; defaults to the publish action.

    Raises:
        UnSubscribeException: Raise error if failed.
    """
    if isinstance(component, SUBSCRIPTION_LIST):
        component = component.value
    # TODO: Provide a way to unsubscribe all events.
    Log.info(f"Received unsubscribe for {component}")
    try:
        self._validate_component(component)
        self._validate_events(events)
        if not action:
            action = HEALTH_MON_ACTIONS.PUBLISH_ACT.value
        for event in events:
            self._delete_component_key(component, event.resource_type, event.states)
            self._delete_event_key(component, event.resource_type, event.states)
            for state in event.states:
                key = EVENT_MANAGER_KEYS.EVENT_KEY.value.replace(
                    "<resource>", event.resource_type).replace("<state>", state)
                if not self._confstore.key_exists(key):
                    self._monitor_rule.remove_rule(event.resource_type, state,
                                                   self._default_action)
        Log.info(f"Successfully unsubscribed component {component}")
    except InvalidComponent:
        raise
    except Exception as e:
        raise UnSubscribeException(
            f"Failed to unsubscribe {component}. Error: {e}")
def initialize(service_name, log_path=const.DEFAULT_LOG_PATH,
               level=const.DEFAULT_LOG_LEVEL, console_output=True):
    """
    Initialize logger.

    Initialize and use the cortx-utils logger to log messages to file and
    console. If console_output is True, log messages are also displayed on
    the console.
    """
    if not CortxProvisionerLog.logger:
        if level not in const.SUPPORTED_LOG_LEVELS:
            level = const.DEFAULT_LOG_LEVEL
        Log.init(service_name, log_path, level=level,
                 console_output=console_output,
                 console_output_level=const.DEFAULT_CONSOLE_OUTPUT_LEVEL)
        CortxProvisionerLog.logger = Log.logger
def stop(self) -> int:
    """
    Stop service. If stop fails it will cause stonith.

    Returns:
        int: Return as per service status; if unknown or failed, return
        success. A timeout of stop will cause stonith.
    """
    service = self._get_systemd_service()
    Log.debug(f"Stop: Stopping {service} service")
    self._execute.run_cmd(f"systemctl stop {service}", check_error=False)
    while True:
        status: str = self._get_service_status(service).strip()
        time.sleep(1)
        if status in ["failed", "unknown"]:
            break
    Log.info(f"Stop: Stopped {service} service")
    return const.OCF_SUCCESS
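# Note on the unbounded loop above: under OCF resource-agent semantics,
# pacemaker enforces the stop timeout externally, so a stop that never
# reaches "failed"/"unknown" is eventually fenced (stonith), as the
# docstring says. A bounded variant, if one were wanted, might look like
# this sketch (max_wait is a hypothetical parameter, not in the source):
#
#   for _ in range(max_wait):
#       if self._get_service_status(service).strip() in ["failed", "unknown"]:
#           break
#       time.sleep(1)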