def check_log_analytics_endpts():
    """Ping each Log Analytics endpoint needed by the configured workspace.

    The endpoint list is chosen by whether the OMS endpoint contains '.us'
    (usgov hosts) or not; a '*' in an entry is replaced by the workspace ID.

    Returns:
        ERR_INFO_MISSING if the OMS endpoint or workspace ID lookup yields
        None (the missing item is recorded in error_info), ERR_ENDPT if any
        endpoint fails check_endpt (each failing host is appended to
        error_info), otherwise NO_ERROR.
    """
    oms_endpoint = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpoint is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    wsid = geninfo_lookup('WORKSPACE_ID')
    if wsid is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    if '.us' in oms_endpoint:
        templates = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        templates = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    result = NO_ERROR
    for template in templates:
        # fill in the workspace-specific part of wildcard hosts
        host = template.replace('*', wsid) if '*' in template else template
        if check_endpt(host):
            continue
        # keep going so every unreachable host gets reported
        error_info.append((host, ))
        result = ERR_ENDPT
    return result
def check_log_analytics_endpts():
    """Check SSL connectivity to every Log Analytics endpoint of the workspace.

    Each endpoint is first tried with the plain SSL_CMD. If that fails and
    both CERT_PATH and KEY_PATH exist, it is retried with the certificate
    and key appended to the command. If the files are missing, a one-time
    note is printed and the endpoint counts as failed.

    Returns:
        ERR_INFO_MISSING if the OMS endpoint or workspace ID lookup yields
        None, ERR_ENDPT if any endpoint failed (host and the last command
        tried are appended to error_info), otherwise NO_ERROR.
    """
    oms_endpoint = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpoint is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    wsid = geninfo_lookup('WORKSPACE_ID')
    if wsid is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # '.us' in the OMS endpoint selects the usgov host list
    if '.us' in oms_endpoint:
        hosts = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        hosts = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    result = NO_ERROR
    note_shown = False
    for host in hosts:
        if '*' in host:
            host = host.replace('*', wsid)

        # plain handshake first; nothing more to do when it succeeds
        if check_endpt_ssl(SSL_CMD, host):
            continue

        if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
            failed_cmd = "{0} -cert {1} -key {2}".format(
                SSL_CMD, CERT_PATH, KEY_PATH)
            if check_endpt_ssl(failed_cmd, host):
                continue
        else:
            # only tell the user once that the cert/key files are absent
            if not note_shown:
                print(
                    "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
                )
                note_shown = True
            failed_cmd = SSL_CMD

        error_info.append((host, failed_cmd.format(host)))
        result = ERR_ENDPT
    return result
def check_agent_service_endpt():
    """Check SSL connectivity to the DSC (agent service) endpoint.

    The host is the third '/'-separated component of the DSC_ENDPOINT value.
    It is tried with the plain SSL_CMD first and, if CERT_PATH and KEY_PATH
    both exist, again with the certificate and key appended.

    Returns:
        ERR_INFO_MISSING if DSC_ENDPOINT cannot be looked up, NO_ERROR if
        either attempt succeeds, otherwise ERR_ENDPT (host and the last
        command tried are appended to error_info).
    """
    dsc_url = geninfo_lookup('DSC_ENDPOINT')
    if dsc_url is None:
        error_info.append(('DSC (agent service) endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    host = dsc_url.split('/')[2]

    # attempt without certs
    if check_endpt_ssl(SSL_CMD, host):
        return NO_ERROR

    attempted_cmd = SSL_CMD
    have_certs = os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH)
    if not have_certs:
        # lets user know cert and key aren't there
        print(
            "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
        )
    else:
        attempted_cmd = "{0} -cert {1} -key {2}".format(
            SSL_CMD, CERT_PATH, KEY_PATH)
        if check_endpt_ssl(attempted_cmd, host):
            return NO_ERROR

    error_info.append((host, attempted_cmd.format(host)))
    return ERR_ENDPT
def get_package_version(pkg):
    """Return the version of `pkg` using the detected package manager.

    Delegates to get_rpm_pkg_version or get_dpkg_pkg_version depending on
    the 'PKG_MANAGER' value; returns None for any other value.
    """
    manager = geninfo_lookup('PKG_MANAGER')
    if manager == 'rpm':
        return get_rpm_pkg_version(pkg)
    if manager == 'dpkg':
        return get_dpkg_pkg_version(pkg)
    # neither dpkg nor rpm
    return None
def check_agent_service_endpt():
    """Ping the DSC (agent service) endpoint.

    The host is the third '/'-separated component of the DSC_ENDPOINT value.

    Returns:
        ERR_INFO_MISSING if DSC_ENDPOINT cannot be looked up, ERR_ENDPT if
        check_endpt fails for the host (recorded in error_info), otherwise
        NO_ERROR.
    """
    dsc_url = geninfo_lookup('DSC_ENDPOINT')
    if dsc_url is None:
        error_info.append(('DSC (agent service) endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    host = dsc_url.split('/')[2]
    if not check_endpt(host):
        error_info.append((host, "couldn't ping endpoint"))
        return ERR_ENDPT
    return NO_ERROR
def check_sys(service):
    """Check `service` through whichever service controller was detected.

    The 'SERVICE_CONTROLLER' value is matched by suffix and the check is
    delegated to the systemctl, invoke-rc.d or service specific helper.

    Returns:
        The delegated helper's result, or ERR_SERVICE_CONTROLLER when no
        controller is recorded or it is not one of the three known kinds.
    """
    controller = geninfo_lookup('SERVICE_CONTROLLER')

    # a missing entry means no service controller; don't crash on None
    if controller is None:
        return ERR_SERVICE_CONTROLLER

    # systemctl
    if controller.endswith('systemctl'):
        return check_sys_systemctl(service, controller)
    # invoke-rc.d
    if controller.endswith('invoke-rc.d'):
        return check_sys_invoke_rc(service, controller)
    # service
    if controller.endswith('service'):
        return check_sys_service(service, controller)
    # no (known) service controller
    return ERR_SERVICE_CONTROLLER
def get_pkg_ver(pkg):
    """Return the installed version of `pkg`, or None if it can't be found.

    Uses `rpm -qi` or `dpkg -s` depending on the 'PKG_MANAGER' value and
    parses the "Key: value" lines of the output.

    Returns:
        For rpm, "<Version>-<Release>"; for dpkg, the "Version" field.
        None when the package manager is neither rpm nor dpkg, when the
        query command exits non-zero, or when the reported package name
        differs from `pkg`.
    """
    version = None
    try:
        pkg_manager = geninfo_lookup('PKG_MANAGER')

        # check using rpm
        if pkg_manager == 'rpm':
            pkg_info = subprocess.check_output(['rpm', '-qi', pkg],
                                               universal_newlines=True,
                                               stderr=subprocess.STDOUT)
            for line in pkg_info.split('\n'):
                # split on the first ': ' only, so values that themselves
                # contain ': ' stay intact
                key, sep, value = line.partition(': ')
                if not sep:
                    # not a "Key: value" line
                    continue
                if key.startswith('Name') and value != pkg:
                    # wrong package
                    return None
                if key.startswith('Version'):
                    version = value
                    continue
                if key.startswith('Release'):
                    # only extend a version that was actually seen
                    if version is not None:
                        version = version + '-' + value
                    break

        # check using dpkg
        elif pkg_manager == 'dpkg':
            pkg_info = subprocess.check_output(['dpkg', '-s', pkg],
                                               universal_newlines=True,
                                               stderr=subprocess.STDOUT)
            for line in pkg_info.split('\n'):
                key, sep, value = line.partition(': ')
                if not sep:
                    continue
                if key == 'Package' and value != pkg:
                    # wrong package
                    return None
                if key == 'Version':
                    version = value
                    break

        return version

    # no pkg
    except subprocess.CalledProcessError:
        return None
def check_oms(interactive):
    """Validate the installed OMS version and offer an update if outdated.

    Args:
        interactive: when truthy, the user is asked whether to update if
            the installed version is older than the newest published one.

    Returns:
        ERR_OMS_INSTALL if no version is installed; ERR_OLD_OMS_VER if it
        is older than 1.11; ERR_GETTING_OMS_VER if the newest version could
        not be determined although no connection error was reported; the
        check_urlopen_errs result for connection errors other than
        ERR_ENDPT; USER_EXIT if the user quits at the update prompt;
        otherwise the result of update_omsadmin().
    """
    cpu_bits = geninfo_lookup('CPU_BITS')

    installed = get_oms_version()
    if installed is None:
        return ERR_OMS_INSTALL

    # anything below 1.11 is too old
    if not comp_versions_ge(installed, '1.11'):
        error_info.append((installed, cpu_bits))
        return ERR_OLD_OMS_VER

    latest, fetch_err = get_curr_oms_version(OMSAGENT_URL)

    if latest is not None:
        # got the newest version: offer an update when we are behind
        outdated = interactive and (not comp_versions_ge(installed, latest))
        if outdated:
            answer = ask_update_old_version(installed, latest, cpu_bits)
            if answer == USER_EXIT:
                return USER_EXIT
    elif fetch_err is None:
        # could connect, just formatting issue
        return ERR_GETTING_OMS_VER
    else:
        urlopen_result = check_urlopen_errs(fetch_err)
        if urlopen_result != ERR_ENDPT:
            # issue with general internet connectivity / ssl package
            error_info.append((OMSAGENT_URL, fetch_err))
            return urlopen_result
        # only this URL is unreachable: warn and carry on
        print(
            "WARNING: can't connect to {0}: {1}\n Skipping this check..."
            .format(OMSAGENT_URL, fetch_err))
        print(
            "--------------------------------------------------------------------------------"
        )

    return update_omsadmin()
def check_syslogdest(sys_bind, sys_pt):
    """Validate the syslog destination file against the OMS configuration.

    Every non-empty line of SYSLOGDEST_PATH is inspected:
      * an "# OMS Syslog collection for workspace <id>" comment must name
        the configured workspace ID;
      * a line whose first token matches the priority selector list has
        its second token handed to check_port.

    Args:
        sys_bind: bind value from the syslog configuration, forwarded to
            check_port.
        sys_pt: protocol type from the syslog configuration, forwarded to
            check_port.

    Returns:
        ERR_INFO_MISSING if the workspace ID is unknown, ERR_SYSLOG_WKSPC
        on a workspace mismatch, the first non-NO_ERROR check_port result,
        otherwise NO_ERROR.
    """
    # get workspace id
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # set up regex lines (raw strings so the escapes reach `re` untouched)
    comment_line = r"# OMS Syslog collection for workspace (\S+)"
    spec_line = r"(\w+).=alert;(\w+).=crit;(\w+).=debug;(\w+).=emerg;(\w+).=err;"\
                r"(\w+).=info;(\w+).=notice;(\w+).=warning"

    with open(SYSLOGDEST_PATH, 'r') as syslogdest_file:
        for line in syslogdest_file:
            line = line.rstrip('\n')

            # skip empty lines
            if line == '':
                continue

            # check if workspace for syslog collection lines up
            match_comment = re.match(comment_line, line)
            if match_comment is not None:
                syslog_wkspc = match_comment.groups()[0]
                if workspace_id != syslog_wkspc:
                    error_info.append(
                        (syslog_wkspc, workspace_id, SYSLOGCONF_PATH))
                    return ERR_SYSLOG_WKSPC
                continue

            # check if port is correct; this used to be unreachable because
            # every path above it continued or returned
            parsed_line = line.split()
            if len(parsed_line) < 2:
                # whitespace-only line or selector without a destination
                continue
            match_spec = re.match(spec_line, parsed_line[0])
            if match_spec is not None:
                # NOTE(review): the original passed the undefined name
                # `sys_port` here; `sys_pt` is the only matching parameter.
                # Confirm this is the argument order check_port expects.
                checked_port = check_port(parsed_line[1], sys_pt, sys_bind)
                if checked_port != NO_ERROR:
                    return checked_port

    # all ports set up correctly
    return NO_ERROR
def check_log_rotation():
    """Parse the workspace's logrotate config and validate its size settings.

    Side effect: the module-level LR_CONFIG_PATH is rewritten with the
    workspace ID formatted into it.

    The config is read as "<path> {" ... "}" sections; each section's lines
    (left-stripped) are collected into a set keyed by the path, and the
    resulting dict is handed to check_size_config.

    Returns:
        ERR_INFO_MISSING if the workspace ID is unknown, ERR_FILE_MISSING
        if the logrotate config file does not exist, any non-NO_ERROR result
        of check_size_config, otherwise NO_ERROR.
    """
    global LR_CONFIG_PATH

    # update logrotate config path with wsid
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    LR_CONFIG_PATH = LR_CONFIG_PATH.format(workspace_id)

    # check logrotate config file exists
    if not os.path.isfile(LR_CONFIG_PATH):
        error_info.append(('logrotate config file', LR_CONFIG_PATH))
        return ERR_FILE_MISSING

    # go through logrotate config file
    logrotate_configs = dict()
    in_file = None
    with open(LR_CONFIG_PATH, 'r') as f:
        for lr_line in f:
            lr_line = lr_line.rstrip('\n')

            # start of log rotation config
            lr_start = re.match(r"^(\S+) \{$", lr_line)
            if lr_start is not None:
                in_file = lr_start.groups()[0]
                logrotate_configs[in_file] = set()
            # end of log rotation config; must be tested before the
            # "inside a section" case, otherwise the section never closes
            # and '}' itself is stored as a setting
            elif lr_line == '}':
                in_file = None
            # log rotation config info
            elif in_file is not None:
                logrotate_configs[in_file].add(lr_line.lstrip())

    # check size rotation working
    checked_size_config = check_size_config(logrotate_configs)
    if checked_size_config != NO_ERROR:
        return checked_size_config

    return NO_ERROR
def check_agent_service_endpt():
    """Check that the DSC (agent service) endpoint connects and verifies.

    check_endpt_ssl yields a (connected, verified) pair. The host is tried
    with the plain SSL_CMD first and, if CERT_PATH and KEY_PATH both exist,
    again with the certificate and key appended.

    Returns:
        ERR_INFO_MISSING if DSC_ENDPOINT cannot be looked up; NO_ERROR if
        either attempt both connects and verifies; WARN_ENDPT if an attempt
        connects without verifying; ERR_ENDPT if nothing connects. For the
        last two, the host and command are appended to error_info.
    """
    dsc_url = geninfo_lookup('DSC_ENDPOINT')
    if dsc_url is None:
        error_info.append(('DSC (agent service) endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    host = dsc_url.split('/')[2]

    # attempt without certs
    plain_connected, plain_verified = check_endpt_ssl(SSL_CMD, host)
    if plain_connected and plain_verified:
        return NO_ERROR

    if not (os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH)):
        # lets user know cert and key aren't there
        print(
            "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
        )
    else:
        cert_cmd = "{0} -cert {1} -key {2}".format(
            SSL_CMD, CERT_PATH, KEY_PATH)
        cert_connected, cert_verified = check_endpt_ssl(cert_cmd, host)
        if cert_connected:
            if cert_verified:
                return NO_ERROR
            # connected with certs, but didn't verify
            error_info.append((host, cert_cmd.format(host)))
            return WARN_ENDPT

    # certs gave nothing usable: fall back to what the plain attempt showed
    error_info.append((host, SSL_CMD.format(host)))
    if plain_connected and not plain_verified:
        return WARN_ENDPT
    return ERR_ENDPT
def check_conf_files():
    """Verify the syslog config and destination files, then cross-check them.

    Side effect: the module-level SYSLOGDEST_PATH is replaced by the
    'SYSLOG_DEST' value.

    Returns:
        ERR_FILE_MISSING / ERR_FILE_EMPTY if either file is absent or has
        size 0; ERR_SYSLOG if 'SYSLOG_DEST' is unknown; ERR_INFO_MISSING if
        parse_syslogconf yields nothing or lacks 'bind' / 'protocol_type';
        otherwise the result of check_syslogdest.
    """
    global SYSLOGDEST_PATH

    def file_problem(path):
        # error code for a missing or empty file, None when the file is fine
        if not os.path.isfile(path):
            error_info.append(('file', path))
            return ERR_FILE_MISSING
        if os.stat(path).st_size == 0:
            error_info.append((path, ))
            return ERR_FILE_EMPTY
        return None

    # syslog.conf must exist and be non-empty
    problem = file_problem(SYSLOGCONF_PATH)
    if problem is not None:
        return problem

    # point the destination path at the detected location
    detected_dest = geninfo_lookup('SYSLOG_DEST')
    if detected_dest is None:
        return ERR_SYSLOG
    SYSLOGDEST_PATH = detected_dest

    # syslog destination must exist and be non-empty
    problem = file_problem(SYSLOGDEST_PATH)
    if problem is not None:
        return problem

    # parse syslog.conf
    conf = parse_syslogconf()
    if not conf:
        error_info.append(("syslog configuration info", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    # both settings are required for the destination check
    try:
        bind = conf['bind']
        protocol_type = conf['protocol_type']
    except KeyError:
        error_info.append(("syslog configuration info", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    return check_syslogdest(bind, protocol_type)
def check_high_cpu_memory(interactive, prev_success=NO_ERROR):
    """Run the high CPU / memory usage part of the troubleshooter.

    First confirms the agent is installed, connected and running, handing
    off to the installation, connection or heartbeat checks when one of
    those fails. Then checks log rotation, OMI CPU usage and slab memory in
    that order, stopping at the first result that is_error() flags.

    Args:
        interactive: forwarded to the section that is handed off to.
        prev_success: accepted for interface compatibility; every return
            path yields a value computed in this function.

    Returns:
        The result of the section handed off to, ERR_INFO_MISSING when the
        workspace ID is unknown, print_errors(<code>) for the first failing
        check, or the slab memory check's own result when nothing failed.
    """
    print("CHECKING FOR HIGH CPU / MEMORY USAGE...")

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # connection
    la_result = check_log_analytics_endpts()
    if la_result != NO_ERROR:
        print_errors(la_result)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # running
    wsid = geninfo_lookup('WORKSPACE_ID')
    if wsid is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    running_result = check_omsagent_running(wsid)
    if running_result != NO_ERROR:
        print_errors(running_result)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # TODO: decide whether a disk space check (recent modifications to the
    # largest files) should run here; it is currently disabled.

    # log rotation
    print("Checking if log rotation is working correctly...")
    logrot_result = check_log_rotation()
    if is_error(logrot_result):
        return print_errors(logrot_result)
    print_errors(logrot_result)

    # CPU capacity
    print("Checking if OMI is at 100% CPU (may take some time)...")
    cpu_result = check_omi_cpu()
    if is_error(cpu_result):
        return print_errors(cpu_result)
    print_errors(cpu_result)

    # slab memory / dentry cache issue
    print("Checking slab memory / dentry cache usage...")
    slab_result = check_slab_memory()
    if is_error(slab_result):
        return print_errors(slab_result)
    # NOTE(review): unlike the two checks above, a non-error result is
    # returned here without going through print_errors — confirm intended.
    return slab_result
def check_heartbeat(interactive, prev_success=NO_ERROR):
    """Run the heartbeat / general health part of the troubleshooter.

    Confirms the agent is installed and the workspace ID is known (handing
    off to the installation or connection checks otherwise), then checks
    multihoming, whether omsagent is running (trying to start it once when
    it won't run) and the heartbeat errors in omsagent.log.

    Args:
        interactive: forwarded to the section that is handed off to.
        prev_success: initial value of the running result; it is
            overwritten by each check that completes.

    Returns:
        The result of the section handed off to, print_errors(<code>) for
        the first check that is_error() flags, or print_errors of the last
        check's result when nothing failed.
    """
    print("CHECKING HEARTBEAT / HEALTH...")

    success = prev_success

    # TODO: run `sh /opt/microsoft/omsagent/bin/omsadmin.sh -l` to check if onboarded and running

    # check if installed correctly
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print(
            "Running the installation part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_CONF_PATH))
        print_errors(ERR_INFO_MISSING)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check if running multi-homing
    print("Checking if omsagent is trying to run multihoming...")
    checked_multihoming = check_multihoming(workspace_id)
    if is_error(checked_multihoming):
        return print_errors(checked_multihoming)
    success = print_errors(checked_multihoming)

    # TODO: check if other agents are sending heartbeats

    # check if omsagent is running
    print("Checking if omsagent is running...")
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running == ERR_OMS_WONT_RUN:
        # try starting omsagent
        # TODO: find better way of doing this, check to see if agent is stopped / grab results
        checked_omsagent_running = start_omsagent(workspace_id)
    if is_error(checked_omsagent_running):
        return print_errors(checked_omsagent_running)
    success = print_errors(checked_omsagent_running)

    # check if omsagent.log finds any heartbeat errors
    print("Checking for errors in omsagent.log...")
    checked_log_hb = check_log_heartbeat(workspace_id)
    if is_error(checked_log_hb):
        # other issue
        if checked_log_hb != ERR_HEARTBEAT:
            return print_errors(checked_log_hb)
        # connection issue
        print_errors(checked_log_hb)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        # pass `interactive` through, as every other hand-off to
        # check_connection does; it was missing from this call
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)
    success = print_errors(checked_log_hb)

    return success
def check_custom_logs(interactive, prev_success=NO_ERROR):
    """Run the custom log part of the troubleshooter.

    In interactive mode the user is first asked whether custom logs are in
    use; a "no" answer skips the section. Otherwise the agent is confirmed
    to be installed, connected and running (handing off to the
    installation, connection or heartbeat checks when one of those fails)
    and the custom log configuration is checked.

    Args:
        interactive: whether the user may be prompted; also forwarded to
            the section that is handed off to and to check_customlog_conf.
        prev_success: returned unchanged when the user says custom logs
            are not in use.

    Returns:
        prev_success when skipped, the result of the section handed off to,
        ERR_INFO_MISSING when the workspace ID is unknown, or
        print_errors(<code>) for the custom log configuration check.
    """
    if interactive:
        print(" To check if you are using custom logs, please go to https://ms.portal.azure.com\n"
              " and navigate to your workspace. Once there, please navigate to the 'Advanced\n"
              " settings' blade, and then go to 'Data' > 'Custom Logs'. There you should be\n"
              " to see any custom logs you may have.\n")
        using_cl = get_input("Are you currently using custom logs? (y/n)",
                             (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                             "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
        # not using custom logs; compare case-insensitively because the
        # validator above accepts answers in any case
        if using_cl.lower() in ['n', 'no']:
            print("Continuing on with the rest of the troubleshooter...")
            print(
                "================================================================================"
            )
            return prev_success
        # using custom logs
        print("Continuing on with troubleshooter...")
        print(
            "--------------------------------------------------------------------------------"
        )

    print("CHECKING FOR CUSTOM LOG ISSUES...")

    success = prev_success

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # check installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print(
            "Running the installation part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if checked_la_endpts != NO_ERROR:
        print_errors(checked_la_endpts)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running != NO_ERROR:
        print_errors(checked_omsagent_running)
        print(
            "Running the general health part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # check customlog.conf
    print("Checking for custom log configuration files...")
    checked_clconf = check_customlog_conf(interactive)
    if is_error(checked_clconf):
        return print_errors(checked_clconf)
    success = print_errors(checked_clconf)

    return success
def check_syslog(interactive, prev_success=NO_ERROR):
    """Run the syslog part of the troubleshooter.

    Confirms the agent is installed, connected and running (handing off to
    the installation, connection or heartbeat checks when one of those
    fails), then checks the service controller, the syslog services and
    the syslog configuration files in that order.

    Args:
        interactive: forwarded to the section that is handed off to.
        prev_success: accepted for interface compatibility; every return
            path yields a value computed in this function.

    Returns:
        The result of the section handed off to, ERR_INFO_MISSING when the
        workspace ID is unknown, the raw service controller error code,
        or print_errors(<code>) for the check that ended the run.
    """
    print("CHECKING FOR SYSLOG ISSUES...")

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print(
            "Running the installation part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # connection
    la_result = check_log_analytics_endpts()
    if la_result != NO_ERROR:
        print_errors(la_result)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # running
    wsid = geninfo_lookup('WORKSPACE_ID')
    if wsid is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    running_result = check_omsagent_running(wsid)
    if running_result != NO_ERROR:
        print_errors(running_result)
        print(
            "Running the general health part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # service controller
    print("Checking if machine has a valid service controller...")
    sc_result = check_service_controller()
    if is_error(sc_result):
        # NOTE(review): this is the only error branch that returns the raw
        # code without going through print_errors — confirm intended.
        return sc_result
    print_errors(sc_result)

    # rsyslog / syslogng running
    print("Checking if machine has rsyslog or syslog-ng running...")
    services_result = check_services()
    if is_error(services_result):
        return print_errors(services_result)
    print_errors(services_result)

    # syslog.conf and syslog destination file
    print("Checking for syslog configuration files...")
    conf_result = check_conf_files()
    if not is_error(conf_result):
        return print_errors(conf_result)
    if conf_result not in [ERR_OMS_INSTALL, ERR_FILE_MISSING]:
        return print_errors(conf_result)

    # missing files point at a broken installation
    print_errors(conf_result)
    print(
        "Running the installation part of the troubleshooter in order to find the issue..."
    )
    print(
        "================================================================================"
    )
    return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)
def check_log_analytics_endpts():
    """Check that every Log Analytics endpoint connects and verifies.

    check_endpt_ssl yields a (connected, verified) pair. Each endpoint is
    classified the same way check_agent_service_endpt classifies the DSC
    endpoint:
      * plain SSL_CMD connects and verifies -> fine;
      * otherwise, if CERT_PATH and KEY_PATH exist, retry with them:
        connects and verifies -> fine; connects only -> warning;
      * if the cert/key files are missing, a note is printed once;
      * when certs gave nothing usable: the plain attempt connected
        (without verifying) -> warning; nothing connected -> error.

    Previously the outer test was `not (connected or verified)`, so an
    endpoint that connected without verifying was silently accepted and
    the plain-attempt warning branch could never run.

    Returns:
        ERR_INFO_MISSING if the OMS endpoint or workspace ID is unknown;
        ERR_ENDPT if any endpoint could not connect (only the connection
        failures are added to error_info); WARN_ENDPT if there were only
        verification failures (those are added to error_info); otherwise
        NO_ERROR.
    """
    success = NO_ERROR
    no_certs_printed = False
    connected_err = []
    verified_err = []

    # get OMS endpoint; '.us' in it selects the usgov host list
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpt is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get log analytics endpoints
    if '.us' in oms_endpt:
        log_analytics_endpts = ["usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
                                "usge-agentservice-prod-1.usgovtrafficmanager.net",
                                "*.ods.opinsights.azure.us",
                                "*.oms.opinsights.azure.us"]
    else:
        log_analytics_endpts = ["*.ods.opinsights.azure.com",
                                "*.oms.opinsights.azure.com",
                                "ods.systemcenteradvisor.com"]

    for endpt in log_analytics_endpts:
        # replace '*' with workspace ID
        if '*' in endpt:
            endpt = endpt.replace('*', workspace_id)

        # check endpoint without certs
        (la_connected, la_verified) = check_endpt_ssl(SSL_CMD, endpt)
        if la_connected and la_verified:
            continue

        # command reported if nothing connects: the last one attempted
        failed_command = SSL_CMD

        # try with certs (if they exist)
        if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
            ssl_command = "{0} -cert {1} -key {2}".format(
                SSL_CMD, CERT_PATH, KEY_PATH)
            (la_cert_connected,
             la_cert_verified) = check_endpt_ssl(ssl_command, endpt)
            # with certs connected and verified
            if la_cert_connected and la_cert_verified:
                continue
            # with certs connected, but didn't verify
            if la_cert_connected:
                # a warning is only worth keeping while no connection
                # error has been found
                if success != ERR_ENDPT:
                    verified_err.append((endpt, ssl_command.format(endpt)))
                    success = WARN_ENDPT
                continue
            failed_command = ssl_command
        elif not no_certs_printed:
            # lets user know (once) cert and key aren't there
            print(
                "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
            )
            no_certs_printed = True

        # certs didn't help: fall back to what the plain attempt showed
        if la_connected:
            # connected without certs, but didn't verify
            if success != ERR_ENDPT:
                verified_err.append((endpt, SSL_CMD.format(endpt)))
                success = WARN_ENDPT
        else:
            # neither with nor without certs connected
            connected_err.append((endpt, failed_command.format(endpt)))
            success = ERR_ENDPT

    # if any connection issues found
    if success == ERR_ENDPT:
        error_info.extend(connected_err)
    # if no connection issues found but some verification issues found
    elif success == WARN_ENDPT:
        error_info.extend(verified_err)

    return success