def __init__(self):
    """Create the HANA monitor DAO and fetch the alarm-operator logger.

    The DAO is built from the HANA server, port, user and password
    returned by ``Mc``; the logger is looked up with
    ``Mc.LOGGER_MONITOR_OPERATOR_ALARM``.
    """
    # NOTE(review): the original comment says "implement the singleton
    # class", but no instance check is visible in this constructor --
    # confirm where (or whether) the singleton is enforced.
    hana_server = Mc.get_hana_server()
    hana_port = Mc.get_hana_port()
    hana_user = Mc.get_hana_user()
    hana_password = Mc.get_hana_password()
    self.__monitor_dao = HANAMonitorDAO(hana_server, hana_port, hana_user,
                                        hana_password)

    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_ALARM)
def __init__(self):
    """Initialise logger, configuration cache, OS operator and heartbeat state."""
    super().__init__()

    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_COORDINATOR)
    # starts empty; nothing is loaded in the constructor
    self.__configs = {}
    self.__os_operator = LinuxOperator()

    # --- heartbeat bookkeeping ---
    self.__heartbeat_flag = False
    check_interval = Mc.get_heartbeat_check_interval()
    self.__heartbeat_interval = check_interval
    timeout = Mc.get_heartbeat_timeout()
    self.__heartbeat_timeout = timeout
    restart_interval = Mc.get_heartbeat_operation_interval()
    self.__heartbeat_restart_agent_interval = restart_interval
    # starts empty; presumably keyed by server id -- confirm against the users
    self.__heartbeat_agent_restart_info = {}
def __init__(self):
    """Prepare the server list, the base path and the files to publish.

    ``servers`` comes from ``__get_servers`` using a freshly created
    ``HANAMonitorDAO``; ``path`` is the agent path returned by ``Mc`` with
    the trailing ``len('agent.py')`` characters cut off; ``files`` lists
    the relative paths that ``path`` is combined with.
    """
    self.__os_operator = LinuxOperator()

    dao = HANAMonitorDAO(Mc.get_hana_server(), Mc.get_hana_port(),
                         Mc.get_hana_user(), Mc.get_hana_password())
    self.servers = self.__get_servers(dao)

    # TODO: move the path and the file list below to config?
    agent_script = 'agent.py'
    self.path = Mc.get_agent_path()[:-len(agent_script)]
    self.files = [
        agent_script,
        'util.py',
        'errors.py',
        'config/configuration.ini',
        'config/logging.ini',
    ]
def __init__(self):
    """Register this object as the single instance and set up DAO and logger.

    :raises MonitorDBOpError: if an instance has already been registered
    """
    # singleton guard: refuse to build a second instance
    if HANAOperatorService.__instance is not None:
        # NOTE(review): the message names HANAServerDBOperatorService while
        # the class checked here is HANAOperatorService -- confirm which
        # name is intended before touching the text.
        raise MonitorDBOpError(
            "This class is a singleton, use HANAServerDBOperatorService.instance() instead"
        )
    HANAOperatorService.__instance = self

    hana_server = Mc.get_hana_server()
    hana_port = Mc.get_hana_port()
    hana_user = Mc.get_hana_user()
    hana_password = Mc.get_hana_password()
    self.__monitor_dao = HANAMonitorDAO(hana_server, hana_port, hana_user,
                                        hana_password)
    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_DB)
def run(self):
    """run the thread

    Creates one ``HANAMonitorDAO`` and then loops forever: each round calls
    ``__monitoring_configurations`` with it, logs (instead of propagating)
    any ``Exception``, and sleeps for ``CHECK_INTERVAL_CONFIG_INT`` seconds
    taken from the current configs (300 when the key is missing).
    """
    dao = HANAMonitorDAO(Mc.get_hana_server(), Mc.get_hana_port(),
                         Mc.get_hana_user(), Mc.get_hana_password())
    while True:
        try:
            self.__monitoring_configurations(dao)
        except Exception as ex:
            # keep the loop alive: report the failure and try again next round
            message = "Error occurred when monitoring configuration, Error: {0}".format(ex)
            Mu.log_warning_exc(self.__logger, message)

        # the interval is re-read every round
        interval = self.__configs.get("CHECK_INTERVAL_CONFIG_INT", 300)
        time.sleep(interval)
def __restart_agent(self, server, server_id, mount_point, agent_path,
                    mem_interval, cpu_interval, disk_interval,
                    instance_interval):
    """Open an SSH connection to ``server`` and restart the agent there.

    The connection is opened with the default SSH user and password from
    ``Mc``; all remaining arguments are handed unchanged to
    ``restart_agent`` of the OS operator.

    :param server: server to connect to (also used in the log messages)
    :param server_id: passed through to ``restart_agent``
    :param mount_point: passed through to ``restart_agent``
    :param agent_path: passed through to ``restart_agent``
    :param mem_interval: passed through to ``restart_agent``
    :param cpu_interval: passed through to ``restart_agent``
    :param disk_interval: passed through to ``restart_agent``
    :param instance_interval: passed through to ``restart_agent``
    """
    ssh_user = Mc.get_ssh_default_user()
    ssh_password = Mc.get_ssh_default_password()
    connection = Mu.open_ssh_connection(self.__logger, self.__os_operator,
                                        server, ssh_user, ssh_password)
    with connection as ssh:
        # NOTE(review): ssh is forwarded without a None check -- confirm
        # that restart_agent copes with a failed connection.
        Mu.log_debug(self.__logger, "Restarting {0}".format(server))
        self.__os_operator.restart_agent(
            ssh, server_id, mount_point, agent_path, mem_interval,
            cpu_interval, disk_interval, instance_interval)
        Mu.log_debug(self.__logger,
                     "Restarting of {0} is done".format(server))
def get_failed_servers(self, check_id, location_id):
    """get the servers that failed repeatedly for the given location

    The query only considers catalog rows with status 'ERROR' whose
    END_TIME is at or after today's first operation hour, whose CHECK_ID is
    not greater than ``check_id`` and whose LOCATION_ID equals
    ``location_id``. A (server, check) group needs at least 3 such rows,
    must not be followed by a later non-error check of the same server, and
    the server is returned only while its summed failures are >= 3 and < 6.

    :param check_id: upper bound (inclusive) for CHECK_ID
    :param location_id: location to filter on
    :return: whatever ``__query_select`` returns for the built query
    """
    # History (2018/09/05): the earlier variant of this query matched only
    # the exact check_id; now the failing alert email is only sent at the
    # first time after the start of the working hours of the current day.
    working_hour = Mc.get_operation_hours(self.__logger)[0]
    # two-digit hour for the timestamp literal in the query
    if working_hour < 10:
        working_time = "0{0}".format(working_hour)
    else:
        working_time = "{0}".format(working_hour)

    template = (
        "SELECT SERVER_FULL_NAME FROM ( SELECT B.SERVER_FULL_NAME, A.SERVER_ID, A.CHECK_ID, COUNT(1) "
        "AS FAILED_NUM FROM HANA_OS_MONITOR.M_MONITOR_CATALOG A "
        "INNER JOIN HANA_OS_MONITOR.SERVER_INFO B ON A.SERVER_ID = B.SERVER_ID "
        "WHERE A.END_TIME >= TO_TIMESTAMP(TO_NVARCHAR(CURRENT_TIMESTAMP, 'YYYY-MM-DD') "
        "|| ' {0}:00:00', 'YYYY-MM-DD HH24:MI:SS') "
        "AND A.STATUS = 'ERROR' AND A.CHECK_ID <= '{1}' AND A.LOCATION_ID = {2} "
        "GROUP BY B.SERVER_FULL_NAME, A.SERVER_ID, A.CHECK_ID HAVING COUNT(1) >= 3 ) C "
        "WHERE NOT EXISTS (SELECT 1 FROM HANA_OS_MONITOR.M_MONITOR_CATALOG D "
        "WHERE D.CHECK_ID > C.CHECK_ID AND D.STATUS <> 'ERROR' AND D.SERVER_ID = C.SERVER_ID)"
        "GROUP BY SERVER_FULL_NAME "
        "HAVING SUM(FAILED_NUM) >= 3 AND SUM(FAILED_NUM) <6"
    )
    # NOTE(review): the values are interpolated directly into the SQL text;
    # confirm check_id / location_id never come from untrusted input.
    query = template.format(working_time, check_id, location_id)
    return self.__query_select(query)
def publish(self):
    """Upload every file in ``self.files`` to each server in ``self.servers``.

    A server whose name contains the local host name is skipped. For all
    other servers an SSH connection is opened with the default SSH user and
    password and each file is uploaded to the same path it has locally.
    """
    ssh_user = Mc.get_ssh_default_user()
    ssh_password = Mc.get_ssh_default_password()
    local_host = os.uname()[1]

    for server in self.servers:
        # NOTE(review): this is a substring match, so a host name that is a
        # prefix of another server's name would also be skipped -- confirm
        # the naming scheme makes that impossible.
        if local_host in server:
            Mu.log_debug(None,
                         "Skipping local server on {0}".format(server))
            continue

        connection = Mu.open_ssh_connection(None, self.__os_operator,
                                            server, ssh_user, ssh_password)
        with connection as ssh:
            Mu.log_debug(None, "Publishing agent on {0}".format(server))
            for relative_path in self.files:
                # Currently, path for source and target is the same
                source = self.path + relative_path
                target = self.path + relative_path
                self.__os_operator.upload_file(ssh, source, target)
            Mu.log_debug(
                None, "Publishing agent on {0} is done".format(server))
def __init__(self):
    """Set up the logger, the action dispatch table and the check interval."""
    super().__init__()
    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_APP)

    # Dispatch table: action type value -> handler object.
    # currently, only support shutdown and log backup clean
    self.switcher = {
        ActionType.SHUTDOWN.value:
            AppOperator.__HANACloser(self.__logger),
        ActionType.CLEAN_LOG_BACKUP.value:
            AppOperator.__HANALogCleaner(self.__logger),
    }

    check_interval = Mc.get_app_operation_check_interval()
    self.__app_operation_interval = check_interval
def test_cleaning(self):
    """A cleaning message must open one SSH connection and clean the log backup once."""
    app_operator, os_operator_mock, consumer_mock = self.__get_mock_operator()

    # the consumer's poll() hands back a single cleaning message
    polled = {"test_topic": self.__get_mock_msg_list([self.cleaning_msg])}
    consumer_mock.poll.return_value = polled

    app_operator._AppOperator__operate(consumer_mock)

    # exactly one connection, for the expected server / user / default password
    expected_password = Mc.get_ssh_default_password()
    Mu.open_ssh_connection.assert_called_once_with(
        ANY, ANY, self.server_name, self.user_name, expected_password)
    os_operator_mock.clean_log_backup.assert_called_once()
def __init__(self):
    """Initialise DB access, logger, heartbeat settings, thresholds and producer."""
    super().__init__()
    self.__db_operator = AlarmOperator.__HANAOperatorService()
    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_ALARM)

    # --- heartbeat settings, read once from Mc ---
    self.__heartbeat_interval = Mc.get_heartbeat_check_interval()
    self.__heartbeat_timeout = Mc.get_heartbeat_timeout()
    self.__heartbeat_email_interval = Mc.get_heartbeat_email_interval()

    # --- alarm settings, initialised to neutral values here
    # (presumably overwritten once configuration arrives -- confirm) ---
    self.cpu_threshold = 0
    self.mem_threshold = 0
    self.disk_threshold = 0
    self.email_sender = ""
    self.operation_time = ""
    self.max_failure_times = 3
    self.mem_emergency_threshold = 0
    # the interval for sending email / performing emergency shutdown
    self.check_interval = 0
    self.servers = []

    # --- messaging ---
    self.__producer = Ku.get_producer()
    self.__topic = Mc.TOPIC_APP_OPERATION
    self.__heartbeat_email_info = {}
def test_initialize_of_coordinator(self):
    """One config message must trigger an SSH connection and an agent restart per server."""
    coordinator, os_operator_mock, consumer_mock = self.__get_mock_objects()

    # replace the heartbeat processing with a mock that returns None
    coordinator._MonitorCoordinator__process_heartbeat = MagicMock(
        return_value=None)

    # start the coordinator with a single configuration message
    messages = self.__get_mock_msg_list([self.config_msg])
    coordinator._MonitorCoordinator__coordinating_monitors(messages)

    # it should have tried to connect to both servers
    expected_connections = [
        call(ANY, ANY, server, Mc.get_ssh_default_user(),
             Mc.get_ssh_default_password())
        for server in (self.server1, self.server2)
    ]
    Mu.open_ssh_connection.assert_has_calls(expected_connections)

    # it should have tried to start the agent on both servers
    expected_restarts = [
        call(ANY, server_id, "/usr/sap", Mc.get_agent_path(),
             self.config_msg[self.mem_interval],
             self.config_msg[self.cpu_interval],
             self.config_msg[self.disk_interval],
             self.config_msg[self.instance_interval])
        for server_id in (1, 2)
    ]
    os_operator_mock.restart_agent.assert_has_calls(expected_restarts)
def operate(self, parameter):
    """Shut down HANA on the server named in ``parameter`` via SSH.

    The connection is opened for the given user with the default SSH
    password. If no connection is obtained, a warning is logged and nothing
    else happens.

    :param parameter: mapping read with the keys ``Mc.FIELD_SERVER_FULL_NAME``
                      and ``Mc.FIELD_USER_NAME``
    """
    server = parameter[Mc.FIELD_SERVER_FULL_NAME]
    user = parameter[Mc.FIELD_USER_NAME]

    connection = Mu.open_ssh_connection(self._logger, self._os_operator,
                                        server, user,
                                        Mc.get_ssh_default_password())
    with connection as ssh:
        if ssh is not None:
            message = "Trying shutdown HANA on {0} for user {1}".format(
                server, user)
            Mu.log_debug(self._logger, message)
            self._os_operator.shutdown_hana(ssh)
        else:
            # TODO: notify alarm operator because of the non-standard password ??
            message = "Failed to log in {0} with user {1}".format(
                server, user)
            Mu.log_warning(self._logger, message)
def __restart_agent_via_server_id(self, server_id):
    """Restart the agent of ``server_id`` unless it was restarted too recently.

    The time of the last restart attempt is kept per server id. When at
    least ``__heartbeat_restart_agent_interval`` seconds have passed since
    then (or no attempt is recorded), every configured server with that id
    is restarted; otherwise only an info message is logged.

    :param server_id: id of the server whose agent should be restarted
    """
    last_attempt = self.__heartbeat_agent_restart_info.get(
        server_id, datetime.min)
    now = datetime.now()
    elapsed_seconds = (now - last_attempt).total_seconds()

    if elapsed_seconds < self.__heartbeat_restart_agent_interval:
        time_format = "%Y-%m-%d %H:%M:%S"
        Mu.log_info(self.__logger, (
            "heartbeat failed for {0}, but did not try to restart agent due to the "
            "configured operation interval time ({1}). (pre: {2}, cur: {3})"
        ).format(server_id, self.__heartbeat_restart_agent_interval,
                 last_attempt.strftime(time_format),
                 now.strftime(time_format)))
        return

    configured_servers = self.__configs.get(Mc.DB_CONFIGURATION_SERVER, [])
    matching_servers = [
        entry for entry in configured_servers
        if entry[Mc.FIELD_SERVER_ID] == server_id
    ]
    for entry in matching_servers:
        # update restart time before restarting
        # because restart agent takes more than 2 minutes if server is no responding
        self.__heartbeat_agent_restart_info[server_id] = datetime.now()

        Mu.log_info(
            self.__logger,
            "Restarting agent on {0}.".format(
                entry[Mc.FIELD_SERVER_FULL_NAME]))
        self.__restart_agent(
            entry[Mc.FIELD_SERVER_FULL_NAME],
            entry[Mc.FIELD_SERVER_ID],
            entry[Mc.FIELD_MOUNT_POINT],
            Mc.get_agent_path(),
            self.__configs.get("CHECK_INTERVAL_MEM_INT", 60),
            self.__configs.get("CHECK_INTERVAL_CPU_INT", 300),
            self.__configs.get("CHECK_INTERVAL_DISK_INT", 3600),
            self.__configs.get("CHECK_INTERVAL_INSTANCE_INT", 300))
        Mu.log_info(
            self.__logger,
            "Restarting agent on {0} is finished.".format(
                entry[Mc.FIELD_SERVER_FULL_NAME]))
def __coordinating_monitors(self, consumer):
    """
    Coordinating (start/stop/restart) all the agents

    Iterates over ``consumer``. For every message the configs are updated
    from ``msg.value``; when ``__update_configs`` reports True, the agent is
    restarted on every configured server with the current check intervals
    (defaults: memory 60, CPU 300, disk 3600, instance 300). The heartbeat
    thread is started when ``__check_configuration`` is true and
    ``__heartbeat_flag`` is not yet set. Any exception raised while handling
    a message is logged and the loop continues with the next message.

    :param consumer: kafka consumer
    """
    Mu.log_debug(self.__logger,
                 "Coordinator is listening on topic for configurations.")
    for msg in consumer:
        try:
            Mu.log_debug(self.__logger, "New configs are coming...")
            if self.__update_configs(msg.value):
                # start/restart all agents, current design is restart all agents if any config is changed
                servers = self.__configs.get(Mc.DB_CONFIGURATION_SERVER, [])
                for server in servers:
                    self.__restart_agent(
                        server[Mc.FIELD_SERVER_FULL_NAME],
                        server[Mc.FIELD_SERVER_ID],
                        server[Mc.FIELD_MOUNT_POINT], Mc.get_agent_path(),
                        self.__configs.get("CHECK_INTERVAL_MEM_INT", 60),
                        self.__configs.get("CHECK_INTERVAL_CPU_INT", 300),
                        self.__configs.get("CHECK_INTERVAL_DISK_INT", 3600),
                        self.__configs.get("CHECK_INTERVAL_INSTANCE_INT", 300))
                # NOTE(review): confirm that this heartbeat check belongs
                # inside the "configs changed" branch rather than directly
                # under the try.
                if self.__check_configuration() and not self.__heartbeat_flag:
                    # set the flag first so later messages do not start a second thread
                    self.__heartbeat_flag = True
                    # start heart beat thread
                    heartbeat_thread = threading.Thread(
                        target=self.__process_heartbeat)
                    heartbeat_thread.start()
        except Exception as ex:
            # a failure for one message must not stop the consumer loop
            Mu.log_warning_exc(
                self.__logger,
                "Error occurred when coordinating the monitors, Err: {0}".
                format(ex))
def initialize(self):
    """Run the init SQL script (path taken from ``Mc``) through the DB operator."""
    init_script = Mc.get_init_sql_file()
    self._db_operator.execute_from_script(init_script)