def __process_heartbeat(self):
    """Check the agent heartbeats forever, once per heartbeat interval.

    A failing check is logged as a warning and the loop carries on.
    """
    # The consumer is created without a subscription and then assigned to the
    # heartbeat topic and moved to its end: earlier heartbeats are of no
    # interest, and seek_to_end failed with "no partition assigned" when
    # subscribing, see https://github.com/dpkp/kafka-python/issues/601
    heartbeat_consumer = Ku.get_consumer(Mc.MONITOR_GROUP_ID_COORDINATOR)
    Ku.assign_and_seek_to_end(heartbeat_consumer,
                              Mc.TOPIC_AGENT_HEARTBEAT,
                              Mc.TOPIC_AGENT_HEARTBEAT)

    # every configured server starts with "now" as its memory heartbeat
    last_heartbeats = {}
    for server in self.__configs.get(Mc.DB_CONFIGURATION_SERVER, []):
        last_heartbeats[server[Mc.FIELD_SERVER_ID]] = {
            InfoType.MEMORY.value: datetime.now()
        }

    while True:
        try:
            Mu.process_heartbeat(self.__logger,
                                 last_heartbeats,
                                 heartbeat_consumer,
                                 self.__heartbeat_timeout,
                                 self.__restart_agent_via_server_id)
        except Exception as ex:
            message = "Error occurred when checking heartbeat, error: {0}".format(ex)
            Mu.log_warning_exc(self.__logger, message)
        time.sleep(self.__heartbeat_interval)
def get_mem_consumers(self, server_name):
    """Return the memory consumers of every <sid>adm style user.

    The remote command lists, per matching user from /etc/passwd, at most 100
    processes sorted by memory percentage.

    :param server_name: only used in the log messages
    :return: list of dicts (user name, command, process id, memory);
             empty when the command gave no result or parsing failed
    """
    ps_lines = HANAServerOSOperatorService.__exec(
        "cut -d: -f1 /etc/passwd | grep ^[A-Za-z0-9][A-Za-z0-9][A-Za-z0-9][a]dm$ "
        "| awk '{for (i=1;i<=NF;i++) {system(\"ps -Ao user,comm,pid,pmem | "
        "grep \"$i\" | sort -k 4 -nr | head -100\")}}'")

    if ps_lines is None:
        Mu.log_warning(
            self.__logger,
            "Can not get memory consumers of ({0}).".format(server_name))
        return []

    try:
        consumers = []
        for ps_line in ps_lines:
            # glue ' <defunct>' to the command so that '<defunct>' is not
            # taken for the process id
            columns = ps_line.replace(' <defunct>', '<defunct>').split()
            if len(columns) > 3:
                consumers.append({
                    Mc.FIELD_USER_NAME: columns[0],
                    Mc.FIELD_PROCESS_COMMAND: columns[1],
                    Mc.FIELD_PROCESS_ID: columns[2],
                    Mc.FIELD_MEM: columns[3]
                })
        return consumers
    except Exception as ex:
        Mu.log_warning(
            self.__logger,
            "Parsing output failed in 'get_mem_consumers' with error:{0}, "
            "server:{1}, the output:{2}".format(ex, server_name, ps_lines))
        return []
def collect_disk_info(self, server_name, mount_point):
    """Collect the total and the unused size of the disk mounted at mount_point.

    The values are the df columns NF-4 and NF-2 of the line that ends with
    the mount point.

    :param server_name: only used in the log messages
    :param mount_point: mount point to look up in the ``df -l`` output
    :return: tuple (total_size, unused_size) as floats, or (-1, -1) when the
             command produced no output or the output could not be parsed
    """
    # raw string: '\s' has to reach grep unchanged, and in a normal literal
    # it is an invalid escape sequence
    os_output = HANAServerOSOperatorService.__exec(
        r"df -l| grep .*'\s'{0}$ | {1}".format(
            mount_point, "awk '{print $(NF-4) \" \" $(NF-2)}'"))

    # an empty result (no line matched) is "no info", not a parsing failure
    if not os_output:
        Mu.log_warning(
            self.__logger, "Can not get disk info for server:{0}, "
            "mount_point:{1}.".format(server_name, mount_point))
        return -1, -1

    try:
        results = os_output[0].split()
        return float(results[0]), float(results[1])
    except Exception as ex:
        Mu.log_warning(
            self.__logger,
            "Parsing output failed in 'collect_disk_info' with error: {0}, "
            "server: {1}, the output: {2}".format(ex, server_name, os_output))
        return -1, -1
def collect_mem_info(self, server_name):
    """Read the overall memory figures of the system from ``free``.

    :param server_name: only used in the log messages
    :return: tuple (mem_total, mem_free) as ints, or (-1, -1) when the
             command gave nothing back or its output could not be parsed
    """
    free_output = HANAServerOSOperatorService.__exec(
        "free | tail -3 | xargs | awk '{print $2 \" \" $11}'")

    if not free_output:
        Mu.log_warning(
            self.__logger,
            "Can not get memory info for server:{0}.".format(server_name))
        return -1, -1

    try:
        fields = free_output[0].split()
        return int(fields[0]), int(fields[1])
    except Exception as ex:
        Mu.log_warning(
            self.__logger,
            "Parsing output failed in 'collect_mem_info' with error: {0}, "
            "server: {1}, the output: {2}".format(ex, server_name, free_output))
        return -1, -1
def __ssh_exec_command(self, command, ssh, stdin_param=None, backend=False):
    """Run a command through the given SSH connection.

    :param command: shell command to execute
    :param ssh: SSH client; when None the command is skipped
    :param stdin_param: optional text written (followed by a newline) to the
                        command's stdin
    :param backend: when True the command is wrapped in
                    ``nohup bash -lc "..." >/dev/null 2>&1 &``
    :return: the stdout lines of the command, or None when the command was
             skipped or failed (the failure is logged as a warning)
    """
    if ssh is None:
        Mu.log_warning(
            self.__logger,
            "Skipped Command:{0} for empty ssh connection".format(command))
        return None

    try:
        # a pty is requested only when input is fed to the command
        # (with get_pty=True the output contains '\r\n')
        feeds_input = stdin_param is not None

        if backend:
            command = 'nohup bash -lc "{0}" >/dev/null 2>&1 &'.format(command)

        # the command gets a timeout of 10 minutes
        cmd_input, cmd_output, cmd_err = ssh.exec_command(
            command, timeout=600, get_pty=feeds_input)

        if feeds_input:
            cmd_input.write(stdin_param)
            cmd_input.write("\n")
            cmd_input.flush()

        return cmd_output.readlines()
    except Exception as ex:
        if hasattr(ssh, "remote_server_name_"):
            where = " on {0}".format(ssh.remote_server_name_)
        else:
            where = ""
        Mu.log_warning(
            self.__logger,
            "Command:{0} execution failed{1}, {2}".format(command, where, ex))
        return None
def exit_clean_up(logger, file):
    """Remove the pid file; a failure is logged as a warning, never raised.

    :param logger: logger handed to the Mu logging helpers
    :param file: path of the pid file to delete
    """
    removing_note = "Removing the pid file {0}.".format(file)
    Mu.log_info(logger, removing_note)

    try:
        os.unlink(file)
    except Exception as ex:
        failure_note = "Failed to remove pid file, error: {0}.".format(ex)
        Mu.log_warning(logger, failure_note)
def __get_users_info(self, server_id, check_id, consumers, info_type, free, total):
    """Build the alarm payload for the given consumers of a server.

    :param server_id: id of the server the consumers belong to
    :param check_id: id of the check, copied into the payload
    :param consumers: consumers handed to the db operator lookup
    :param info_type: copied into the payload as message type
    :param free: free amount, copied into the payload
    :param total: total amount, copied into the payload
    :return: the payload dict, or an empty list when the lookup returned
             nothing usable (a warning is logged in that case)
    """
    found_users = self.__db_operator.get_users_info(server_id, consumers)

    # The first row has to carry the server full name (per the original
    # note it is returned even when no user matched).
    if not (found_users and Mc.FIELD_SERVER_FULL_NAME in found_users[0]):
        Mu.log_warning(
            self.__logger,
            "Cannot get any users info for the server '{0}' type {1} of {2}"
            .format(server_id, info_type, consumers))
        return []

    return {
        Mc.MSG_TYPE: info_type,
        Mc.FIELD_SERVER_ID: server_id,
        Mc.FIELD_SERVER_FULL_NAME: found_users[0][Mc.FIELD_SERVER_FULL_NAME],
        Mc.FIELD_CHECK_ID: check_id,
        Mc.INFO_TOTAL: total,
        Mc.INFO_FREE: free,
        Mc.INFO_USAGE: found_users
    }
def close_ssh_connection(self, ssh):
    """Close the SSH connection; nothing happens for None.

    A failure while closing is logged as an error and not raised.
    """
    if ssh is None:
        return

    try:
        ssh.close()
    except Exception as ex:
        if hasattr(ssh, "remote_server_name_"):
            where = " on server:{0}".format(ssh.remote_server_name_)
        else:
            where = ""
        Mu.log_error(
            self.__logger,
            "Failed to close SSH connection with error:{0}{1}.".format(ex, where))
def upload_file(self, ssh, source, target):
    """Copy a local file to the remote side through SFTP.

    :param ssh: SSH client used to open the SFTP session
    :param source: local path of the file
    :param target: remote path to write to

    A failing transfer is logged as an error; the SFTP session is closed in
    every case.
    """
    sftp_session = ssh.open_sftp()
    try:
        sftp_session.put(source, target)
    except Exception as ex:
        failure_note = "Upload file failed with error: {0}".format(ex)
        Mu.log_error(self.__logger, failure_note)
    finally:
        sftp_session.close()
def run(self):
    """Thread body: analyze the monitoring info topic in an endless loop.

    Each round creates a fresh consumer and hands it to the analysis;
    whenever the analysis returns, a warning is logged and the next round
    starts.
    """
    while True:
        self.__analyze(
            Ku.get_consumer(Mc.MONITOR_GROUP_ID_ANALYZER,
                            Mc.TOPIC_MONITORING_INFO))
        reconnect_note = "Topic is empty or connection is lost. Trying to reconnect..."
        Mu.log_warning(self.__logger, reconnect_note)
def run(self):
    """Thread body: process the filtered info topic in an endless loop.

    Each round creates a fresh consumer and hands it to the operation;
    whenever the operation returns, a warning is logged and the next round
    starts.
    """
    while True:
        self.__operate(
            Ku.get_consumer(Mc.MONITOR_GROUP_ID_DB_OPERATOR,
                            Mc.TOPIC_FILTERED_INFO))
        reconnect_note = "Topic is empty or connection is lost. Trying to reconnect..."
        Mu.log_warning(self.__logger, reconnect_note)
def run(self):
    """Thread body: coordinate the monitors in an endless loop.

    Each round creates a fresh consumer of the configuration topic and hands
    it to the coordination; whenever the coordination returns, a warning is
    logged and the next round starts.
    """
    while True:
        self.__coordinating_monitors(
            Ku.get_consumer(Mc.MONITOR_GROUP_ID_COORDINATOR,
                            Mc.TOPIC_CONFIGURATION))
        reconnect_note = "Topic is empty or connection is lost. Trying to reconnect..."
        Mu.log_warning(self.__logger, reconnect_note)
def shutdown_hana(self, ssh):
    """Issue ``HDB stop`` in the background through the SSH connection.

    Nothing is executed in test mode.
    """
    if Mu.is_test_mod():
        Mu.log_debug(self.__logger,
                     "It's in test mode, skip shutting down hana.")
        return

    stop_command = 'nohup bash -lc "HDB stop" >/dev/null 2>&1 &'
    stop_output = self.__ssh_exec_command(stop_command, ssh)
    Mu.log_debug(self.__logger,
                 "shutting down hana, output:{0}".format(stop_output))
def get_host_name(self):
    """Return the node name reported by ``os.uname()``.

    :return: the host name, or an empty string when it cannot be determined
             (the failure is logged as a warning)
    """
    try:
        return os.uname()[1]
    except Exception as ex:
        failure_note = "Getting host name for server failed with error:{0}, .".format(ex)
        Mu.log_warning(self.__logger, failure_note)
        return ""
def get_email_admin(self, server_id):
    """Return the first column of every row the DAO yields for the server.

    :param server_id: id of the server whose administrators are wanted
    :return: list of the first items of the DAO rows; empty when the DAO
             output cannot be parsed (a warning is logged)
    """
    admin_rows = self.__monitor_dao.get_email_admin(server_id)
    try:
        return [row[0] for row in admin_rows]
    except Exception as ex:
        Mu.log_warning(
            self.__logger,
            "Parsing DB output failed in 'get_email_admin' "
            "with error: {0}, the output: {1}".format(ex, admin_rows))
        return []
def clean_log_backup(self, ssh, sid):
    """Delete log backup files older than 10 days for the given SID.

    Nothing is executed in test mode.

    :param ssh: SSH connection the find command runs through
    :param sid: system id, used to build the backup folder path
    """
    if Mu.is_test_mod():
        skip_note = "It's in test mode, skip cleaning log backup for {0}.".format(sid)
        Mu.log_debug(self.__logger, skip_note)
        return

    find_command = (
        'find /usr/sap/{0}/HDB[0-9][0-9]/backup -name "log_backup_*.*" -mtime +10 -type f -delete'
        .format(sid))
    self.__ssh_exec_command(find_command, ssh)
    Mu.log_debug(self.__logger, "cleaned log backup for {0}.".format(sid))
def get_disk_consumers(self, server_name, mount_point):
    """Get the disk usage of the SID folders directly below mount_point.

    :param server_name: only used in the log messages
    :param mount_point: folder whose direct sub folders are measured
    :return: list of dicts (usage in KB, folder name relative to the mount
             point, owning user or '' when unknown); empty when the usage
             command gave no result or any output could not be parsed
    """
    # Only folders named like a SID are measured (since 2019/12/17). The
    # earlier "du --exclude=..." over the whole mount point was dropped
    # because of hanging NFS folders and because this is more accurate.
    os_output = HANAServerOSOperatorService.__exec(
        "find {0} -maxdepth 1 -type d | egrep '^{0}/[A-Z][A-Z0-9][A-Z0-9]$' | "
        "xargs du -Lx --max-depth=0 2>>/dev/null".format(mount_point))
    os_output_owners = []

    if os_output is None:
        Mu.log_warning(
            self.__logger, "Can not get disk consumers for "
            "({0}:{1}).".format(server_name, mount_point))
        return []

    try:
        # owner and path of every entry in mount_point
        os_output_owners = HANAServerOSOperatorService.__exec("".join([
            "ls -ld {0}/* | awk ".format(mount_point),
            "'{print $3\"\t\"$NF}'"
        ]))

        # folder -> owner, built once; the first line for a folder wins, as
        # the former per-folder scan did. A failed owner query (None) raises
        # here and is reported by the handler below.
        owner_by_folder = {}
        for owner_line in os_output_owners:
            owner_fields = owner_line.split()
            if len(owner_fields) == 2:
                owner_by_folder.setdefault(owner_fields[1], owner_fields[0])

        # the trailing "/" keeps the filter from matching look-alike prefixes
        prefix = mount_point + "/"

        disk_usage_info = []
        for usage_line in os_output:
            usage_fields = usage_line.split()
            if len(usage_fields) != 2 or prefix not in usage_fields[1]:
                continue
            size_kb, folder = usage_fields
            if folder.startswith(prefix):
                folder_name = folder[len(prefix):]
            else:
                folder_name = folder
            disk_usage_info.append({
                Mc.FIELD_DISK_USAGE_KB: int(size_kb),
                Mc.FIELD_FOLDER: folder_name,
                Mc.FIELD_USER_NAME: owner_by_folder.get(folder, '')
            })
    except Exception as ex:
        disk_usage_info = []
        Mu.log_warning(
            self.__logger,
            "Parsing SSH output failed in 'get_disk_consumers' with error: {0}, "
            "server: {1}, the output: {2}, "
            "owners: {3}".format(ex, server_name, os_output,
                                 os_output_owners))
    return disk_usage_info
def run(self):
    """Thread body: operate on the configuration topic in an endless loop.

    Each round creates a consumer, assigns the configuration topic to it and
    hands it to the operation; whenever the operation returns, a warning is
    logged and the next round starts.
    """
    while True:
        # per the original note: this group should differ from the others
        alarm_consumer = Ku.get_consumer(Mc.MONITOR_GROUP_ID_ALARM)

        # the configuration topic is assigned before anything is consumed
        assignments = Ku.get_assignments(alarm_consumer, Mc.TOPIC_CONFIGURATION)
        alarm_consumer.assign(assignments)

        self.__operate(alarm_consumer)
        reconnect_note = "Topic is empty or connection is lost. Trying to reconnect..."
        Mu.log_warning(self.__logger, reconnect_note)
def __get_servers(self, operator):
    """Return the second column of every row of the server full names query.

    :param operator: object providing ``get_server_full_names()``
    :return: list of the second items of the returned rows; empty when the
             output cannot be parsed (a warning is logged)
    """
    server_rows = operator.get_server_full_names()
    try:
        return [row[1] for row in server_rows]
    except Exception as ex:
        Mu.log_warning(
            None,
            "Parsing DB output failed in 'get_server_full_names' with error: {0}, the output: {1}"
            .format(ex, server_rows))
        return []
def __restart_agent(self, server, server_id, mount_point, agent_path,
                    mem_interval, cpu_interval, disk_interval,
                    instance_interval):
    """Restart the agent of a server through an SSH connection.

    The connection is opened with the default SSH user and password; all
    other parameters are handed unchanged to the OS operator's
    ``restart_agent``.
    """
    ssh_context = Mu.open_ssh_connection(self.__logger,
                                         self.__os_operator,
                                         server,
                                         Mc.get_ssh_default_user(),
                                         Mc.get_ssh_default_password())
    with ssh_context as ssh:
        Mu.log_debug(self.__logger, "Restarting {0}".format(server))
        self.__os_operator.restart_agent(ssh, server_id, mount_point,
                                         agent_path, mem_interval,
                                         cpu_interval, disk_interval,
                                         instance_interval)
        Mu.log_debug(self.__logger,
                     "Restarting of {0} is done".format(server))
def __operate(self, consumer):
    """Poll the consumer once and perform every known action it delivered.

    Each message value is a mapping of action type to action detail:
    {action: {Mc.FIELD_SERVER_FULL_NAME: server_name,
              Mc.FIELD_SID: sid, Mc.FIELD_USER_NAME: user_name}}

    Actions that are not in ``self.switcher`` are ignored. A failing action
    is logged and does not stop the remaining ones.

    :param consumer: consumer to poll
    """
    app_opp_msg_pack = consumer.poll(update_offsets=True)
    if not app_opp_msg_pack:
        return

    # only the messages are needed, not the partitions they came from
    for messages in app_opp_msg_pack.values():
        for msg in messages:
            for action, info in msg.value.items():
                try:
                    action_type = int(action)
                except (TypeError, ValueError):
                    # one malformed key must not abort the whole batch
                    Mu.log_warning(
                        self.__logger,
                        "Skipped invalid action type: {0}".format(action))
                    continue

                if action_type not in self.switcher:
                    continue

                Mu.log_info(
                    self.__logger,
                    "Trying to perform action: {0}...".format(action))
                try:
                    Mu.log_debug(self.__logger,
                                 "Action detail: {0}".format(info))
                    self.switcher[action_type].operate(info)
                except Exception as ex:
                    Mu.log_warning_exc(
                        self.__logger,
                        "Perform action failed with {0}, action detail is {1}"
                        .format(ex, info))
                else:
                    # reported only when the action really went through
                    Mu.log_info(self.__logger,
                                "Action: {0} is done.".format(action))
def run(self):
    """Thread body: monitor the configurations in an endless loop.

    One DAO is created up front and reused. A failing round is logged as a
    warning; after every round the thread sleeps for the configured
    "CHECK_INTERVAL_CONFIG_INT" seconds (300 when not configured).
    """
    dao = HANAMonitorDAO(Mc.get_hana_server(),
                         Mc.get_hana_port(),
                         Mc.get_hana_user(),
                         Mc.get_hana_password())
    while True:
        try:
            self.__monitoring_configurations(dao)
        except Exception as ex:
            failure_note = "Error occurred when monitoring configuration, Error: {0}".format(ex)
            Mu.log_warning_exc(self.__logger, failure_note)

        pause_seconds = self.__configs.get("CHECK_INTERVAL_CONFIG_INT", 300)
        time.sleep(pause_seconds)
def __send_shutdown_message(self, server_name, sid, user_name):
    """Send a SHUTDOWN action message for the given SID to the producer topic.

    :param server_name: server the SID runs on
    :param sid: system id to shut down
    :param user_name: user name put into the action message
    """
    Mu.log_debug(
        self.__logger,
        "Sending shutdown message of {0} on {1} ...".format(sid, server_name))

    shutdown_message = AlarmOperator.__generate_action_message(
        server_name, sid, user_name, ActionType.SHUTDOWN.value)
    self.__producer.send(self.__topic, shutdown_message)

    Mu.log_debug(
        self.__logger,
        "Shutdown message of {0} on {1} is sent".format(sid, server_name))
def get_cpu_consumers(self, server_name):
    """Return the cpu consumers of every <sid>adm style user.

    The remote command takes lines 8 to 12 of the batch ``top`` output of
    each matching user from /etc/passwd.

    :param server_name: only used in the log messages
    :return: list of dicts (user name, command, process id, cpu) without
             duplicates; empty when the command gave no result or parsing
             failed
    """
    top_lines = HANAServerOSOperatorService.__exec(
        "cut -d: -f1 /etc/passwd | grep ^[A-Za-z0-9][A-Za-z0-9][A-Za-z0-9][a]dm$ "
        "| awk '{for (i=1;i<=NF;i++) {system(\"top -u \"$i\" -bn1 | sed -n '8,12'p\")}}'"
    )

    if top_lines is None:
        Mu.log_warning(
            self.__logger,
            "Can not get cpu consumers for ({0}).".format(server_name))
        return []

    try:
        # Users sharing one user id (a wrong system setting) make the same
        # process show up several times. The cpu value may differ between
        # those lines, so it is not part of the identity; the first line
        # for a process is kept.
        already_seen = set()
        unique_consumers = []
        for top_line in top_lines:
            columns = top_line.split()
            if len(columns) <= 11:
                continue
            identity = (columns[1], columns[11], columns[0])
            if identity in already_seen:
                continue
            already_seen.add(identity)
            unique_consumers.append({
                Mc.FIELD_USER_NAME: columns[1],
                Mc.FIELD_PROCESS_COMMAND: columns[11],
                Mc.FIELD_PROCESS_ID: columns[0],
                Mc.FIELD_CPU: columns[8]
            })
        return unique_consumers
    except Exception as ex:
        Mu.log_warning(
            self.__logger,
            "Parsing output failed in 'get_cpu_consumers' with error:{0}, "
            "server:{1}, the output:{2}".format(ex, server_name, top_lines))
        return []
def get_all_hana_instance_info(self, server_id, path=None):
    """Parse the instance info of all hana instances.

    A line containing "HDB_ALONE" starts a new instance record; the lines
    up to the next such line fill in instance number, hosts, revision and
    edition. Lines before the first "HDB_ALONE" line are ignored.

    :param server_id: only used in the log messages
    :param path: handed to the private collector of the raw output
    :return: list of instance dicts; empty when there is no output or the
             output could not be parsed
    """
    output_lines = self.__get_all_hana_instance_info(path)

    if output_lines is None:
        Mu.log_warning(
            self.__logger,
            "Can not get hana instance info for server:{0}.".format(server_id))
        return []

    try:
        instances = []
        current = None  # record being filled, None before the first header
        for line in output_lines:
            if "HDB_ALONE" in line:
                # a header closes the previous record and opens a new one
                if current is not None:
                    instances.append(current)
                current = {Mc.FIELD_SID: line.split(" ")[0]}
                continue

            if current is None:
                continue

            stripped = line.strip()
            if re.match("HDB[0-9][0-9]", stripped):
                current[Mc.FIELD_INSTANCE_NO] = stripped[3:].strip()
            elif re.match("hosts?", stripped):
                hosts = line.split(":")[1].strip()
                host_count = len(hosts.split(",")) if len(hosts) > 0 else 0
                current[Mc.FIELD_HOST] = "{0} [{1}]".format(host_count, hosts)
            elif "version:" in line:
                current[Mc.FIELD_REVISION] = line.split(" ")[1].strip()
            elif "edition:" in line:
                current[Mc.FIELD_EDITION] = line.split(":")[1].strip()

        if current is not None:
            instances.append(current)
        return instances
    except Exception as ex:
        Mu.log_warning(
            self.__logger,
            "Parsing output failed in 'get_all_hana_instance_info' with error:"
            " {0}, server: {1}, the output: {2}".format(
                ex, server_id, output_lines))
        return []
def __init__(self):
    """Create the monitor DAO from the configured HANA settings and the logger.

    NOTE(review): the original comment mentioned a singleton, but nothing
    in this constructor enforces one.
    """
    hana_dao = HANAMonitorDAO(Mc.get_hana_server(),
                              Mc.get_hana_port(),
                              Mc.get_hana_user(),
                              Mc.get_hana_password())
    self.__monitor_dao = hana_dao
    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_ALARM)
def __send_cleaning_message(self, server_name, sid, user_name):
    """Send a CLEAN_LOG_BACKUP action message for the given SID to the producer topic.

    :param server_name: server the SID runs on
    :param sid: system id whose log backups are to be cleaned
    :param user_name: user name put into the action message
    """
    Mu.log_debug(
        self.__logger,
        "Sending log backup cleaning message of {0} on {1} for {2} ...".
        format(sid, server_name, user_name))

    # send the cleaning message
    cleaning_message = AlarmOperator.__generate_action_message(
        server_name, sid, user_name, ActionType.CLEAN_LOG_BACKUP.value)
    self.__producer.send(self.__topic, cleaning_message)

    Mu.log_debug(
        self.__logger,
        "Log backup cleaning message of {0} on {1} for {2} is sent".format(
            sid, server_name, user_name))
def __operate_disk(self, info):
    """Prepare the top 5 disk consumers when free disk is below the threshold.

    :param info: message dict with server id, check id, free and total disk
                 size and, under Mc.MSG_INFO, the usage per folder in the
                 form {"folder": {"user1": 3245}, "folder2": {"user2": 222}}
    :return: the payload of ``__get_users_info`` with the usage entries
             replaced by per-folder info, or None when the input is
             invalid, the free disk is not below the threshold or no user
             info could be found
    """
    server_id = info[Mc.FIELD_SERVER_ID]
    check_id = info[Mc.FIELD_CHECK_ID]
    disk_free = info[Mc.FIELD_DISK_FREE]
    disk_total = info[Mc.FIELD_DISK_TOTAL]

    # the threshold can only be computed with a total size; a missing total
    # is rejected by the validation below instead of raising here
    if disk_total is None:
        free_disk_threshold = None
    else:
        free_disk_threshold = (
            (100 - self.disk_threshold) * disk_total) / 100.0

    Mu.log_debug(
        self.__logger,
        "Server:{0}, check_id:{1}, free disk:{2}, threshold:{3}".format(
            server_id, check_id, disk_free, free_disk_threshold))

    if disk_free is None or disk_free < 0 or disk_total is None \
            or disk_total <= 0 or self.disk_threshold <= 0:
        return None

    # nothing to prepare while the free disk size is not below the threshold
    if disk_free >= free_disk_threshold:
        return None

    # biggest usage first, only the top 5 are kept
    disk_consumers = sorted(info[Mc.MSG_INFO].items(),
                            key=lambda v: next(iter(v[1].values())),
                            reverse=True)[:5]
    users = [next(iter(folder[1].keys())) for folder in disk_consumers]
    top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                            InfoType.DISK, disk_free,
                                            disk_total)

    # __get_users_info hands back an empty list when the lookup failed;
    # that result is not a dict and must not be used as one
    if not top_5_consumers:
        return None

    # combine the usage per folder with the info of the owning user
    folders_info = []
    for folder_name, usage_by_user in disk_consumers:
        folder_info = {
            Mc.FIELD_FOLDER: folder_name,
            Mc.FIELD_USER_NAME: next(iter(usage_by_user.keys())),
            Mc.FIELD_USAGE: next(iter(usage_by_user.values()))
        }
        for user_info in top_5_consumers.get(Mc.INFO_USAGE, []):
            if folder_info[Mc.FIELD_USER_NAME] == user_info[
                    Mc.FIELD_USER_NAME]:
                folder_info.update(user_info)
        folders_info.append(folder_info)

    top_5_consumers[Mc.INFO_USAGE] = folders_info
    return top_5_consumers
def open_ssh_connection(self, server_name, user_name, user_password):
    """Open an SSH connection with user name and password.

    System host keys are loaded and unknown host keys are added
    automatically (AutoAddPolicy).

    :param server_name: host to connect to
    :param user_name: login user
    :param user_password: password of the login user
    :return: the connected client, or None when connecting failed (the
             client is closed and the error is logged)
    """
    client = paramiko.SSHClient()
    try:
        client.load_system_host_keys()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # The server name is attached as a custom attribute, because there
        # is no official way to get the remote server name later.
        client.remote_server_name_ = server_name
        client.connect(server_name,
                       username=user_name,
                       password=user_password)
        return client
    except Exception as ex:
        client.close()
        Mu.log_error(
            self.__logger,
            "SSH connection error:{0}! (Server:{1}, User:{2})".format(
                ex, server_name, user_name))
        return None
def __init__(self):
    """Register this object as the one instance of the class.

    :raises MonitorOSOpError: when an instance has been registered already
    """
    if HANAServerOSOperatorService.__instance is not None:
        raise MonitorOSOpError(
            "This class is a singleton, use HANAServerOSOperatorService.instance() instead"
        )

    HANAServerOSOperatorService.__instance = self
    self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_SERVER_OS_OPERATOR)