def check_connection(interactive, err_codes=True, prev_success=NO_ERROR):
    """Run the connection section of the troubleshooter.

    Args:
        interactive: when truthy, the user may be prompted for onboarding
            error codes and the end-to-end query check is run as well.
        err_codes: when truthy (and interactive), ask about onboarding
            error codes before running any check.
        prev_success: status to start from; returned unchanged only if no
            later check overwrites it.

    Returns:
        USER_EXIT if the user exits at the error-code prompt; the result of
        check_installation() if no OMS version is found; otherwise the value
        print_errors() returned for the first failing check, or for the last
        check that was run.
    """
    print("CHECKING CONNECTION...")

    status = prev_success

    # give the user a chance to look up an onboarding error code first
    if interactive and err_codes and ask_onboarding_error_codes() == USER_EXIT:
        return USER_EXIT

    # without an installed agent there is nothing to connect: hand over to
    # the installation section instead
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # connectivity checks, run in order; each entry is (banner, check function)
    endpoint_checks = (
        ("Checking if machine is connected to the internet...",
         check_internet_connect),
        ("Checking if agent service endpoint is connected...",
         check_agent_service_endpt),
        ("Checking if log analytics endpoints are connected...",
         check_log_analytics_endpts),
    )
    for banner, probe in endpoint_checks:
        print(banner)
        outcome = probe()
        if is_error(outcome):
            # stop at the first hard failure
            return print_errors(outcome)
        status = print_errors(outcome)

    # the query check is only run in interactive mode
    if interactive:
        print("Checking if queries are successful...")
        e2e_outcome = check_e2e()
        if is_error(e2e_outcome):
            return print_errors(e2e_outcome)
        status = print_errors(e2e_outcome)

    return status
def geninfo_lookup(key):
    """Look up `key` in the general_info cache, refreshing the cache on a miss.

    On a cache miss update_geninfo_all() is called once and the lookup is
    retried.

    Args:
        key: name of the general-info entry to fetch.

    Returns:
        The stored value, or None when
          * the refresh reports something other than NO_ERROR (the error is
            printed via print_errors first),
          * the key is still absent after a successful refresh, or
          * the stored value is the empty string.
    """
    try:
        val = general_info[key]
    except KeyError:
        updated_geninfo = update_geninfo_all()
        if updated_geninfo != NO_ERROR:
            print_errors(updated_geninfo)
            return None
        # The refresh may not have produced this particular key; treat that
        # like any other "no value" case instead of raising KeyError.
        val = general_info.get(key)

    if val == '':
        return None
    return val
def check_syslog(interactive, prev_success=NO_ERROR):
    """Run the syslog section of the troubleshooter.

    First verifies that the agent is installed, connected and running; if one
    of those fails, control is handed to the matching troubleshooter section
    (installation, connection or heartbeat) with prev_success=ERR_FOUND.
    Then checks the service controller, the rsyslog / syslog-ng services and
    the syslog configuration files.

    Args:
        interactive: forwarded to the other troubleshooter sections when one
            of them has to be run.
        prev_success: status to start from.

    Returns:
        The result of the delegated section when a prerequisite fails;
        ERR_INFO_MISSING when the workspace ID cannot be looked up; otherwise
        the value print_errors() returned for the first failing check, or for
        the last check that was run.
    """
    print("CHECKING FOR SYSLOG ISSUES...")

    success = prev_success

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # check installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if checked_la_endpts != NO_ERROR:
        print_errors(checked_la_endpts)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        # record what is missing and where it was expected
        # NOTE(review): this code is returned without going through
        # print_errors -- presumably the caller reports it; confirm.
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running != NO_ERROR:
        print_errors(checked_omsagent_running)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # check for service controller
    print("Checking if machine has a valid service controller...")
    checked_sc = check_service_controller()
    if is_error(checked_sc):
        # report the failure through print_errors like every other check in
        # this section (previously the raw code was returned unprinted)
        return print_errors(checked_sc)
    success = print_errors(checked_sc)

    # check rsyslog / syslogng running
    print("Checking if machine has rsyslog or syslog-ng running...")
    checked_services = check_services()
    if is_error(checked_services):
        return print_errors(checked_services)
    success = print_errors(checked_services)

    # check for syslog.conf and syslog destination file
    print("Checking for syslog configuration files...")
    checked_conf_files = check_conf_files()
    if is_error(checked_conf_files):
        if checked_conf_files in [ERR_OMS_INSTALL, ERR_FILE_MISSING]:
            # these two codes are handled by re-running the installation
            # section
            print_errors(checked_conf_files)
            print("Running the installation part of the troubleshooter in order to find the issue...")
            print("================================================================================")
            return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)
        return print_errors(checked_conf_files)
    success = print_errors(checked_conf_files)

    return success
def check_installation(interactive, err_codes=True, prev_success=NO_ERROR):
    """Run the installation section of the troubleshooter.

    Args:
        interactive: when truthy, the user may be prompted for installation
            error codes; also forwarded to check_oms().
        err_codes: when truthy (and interactive), ask about installation
            error codes before running any check.
        prev_success: status to start from.

    Returns:
        USER_EXIT if the user exits at the error-code prompt; the value
        print_errors() returned for the first failing check; ERR_FOUND if the
        certificate or key check is an error; otherwise the status left by
        the last check that updated it.
    """
    print("CHECKING INSTALLATION...")

    # running status across all checks
    status = prev_success

    if interactive and err_codes and ask_install_error_codes() == USER_EXIT:
        return USER_EXIT

    # argument-less prerequisite checks, run in order: (banner, check function)
    prerequisite_checks = (
        ("Checking if running a supported OS version...", check_os),
        ("Checking if enough disk space is available...", check_space),
        ("Checking if machine has a supported package manager...",
         update_pkg_manager),
        ("Checking if packages installed correctly...", check_packages),
    )
    for banner, probe in prerequisite_checks:
        print(banner)
        outcome = probe()
        if is_error(outcome):
            return print_errors(outcome)
        status = print_errors(outcome)

    # OMS version
    print("Checking if running a supported version of OMS...")
    oms_outcome = check_oms(interactive)
    if is_error(oms_outcome):
        return print_errors(oms_outcome)
    status = print_errors(oms_outcome)

    # installed files: only possible when the datafiles directory exists
    if not os.path.isdir(DFS_PATH):
        print("WARNING (INTERNAL): Datafiles have not been successfully copied over.")
        print("Skipping all files installed correctly check...")
    else:
        print("Checking if all files installed correctly (may take some time)...")
        files_outcome = check_filesystem(DFS_PATH)
        if is_error(files_outcome):
            return print_errors(files_outcome)
        status = print_errors(files_outcome)

    # certificate and key: both are always checked and reported before
    # deciding whether to bail out
    print("Checking certificate and RSA key are correct...")
    cert_outcome = check_cert()
    if cert_outcome != NO_ERROR:
        status = print_errors(cert_outcome)
    key_outcome = check_key()
    if key_outcome != NO_ERROR:
        status = print_errors(key_outcome)
    if is_error(cert_outcome) or is_error(key_outcome):
        return ERR_FOUND

    return status
def check_custom_logs(interactive, prev_success=NO_ERROR):
    """Run the custom-logs section of the troubleshooter.

    In interactive mode the user is first asked whether custom logs are in
    use; answering no skips the section. Otherwise the agent's installation,
    connection and running state are verified (delegating to the matching
    troubleshooter section on failure) before the custom log configuration
    is checked.

    Args:
        interactive: when truthy, prompt the user; also forwarded to the
            other troubleshooter sections and to check_customlog_conf().
        prev_success: status to start from; returned as-is when the user
            says custom logs are not in use.

    Returns:
        prev_success if the section is skipped; the result of the delegated
        section when a prerequisite fails; ERR_INFO_MISSING when the
        workspace ID cannot be looked up; otherwise the value print_errors()
        returned for the custom log configuration check.
    """
    if interactive:
        print(" To check if you are using custom logs, please go to https://ms.portal.azure.com\n"
              " and navigate to your workspace. Once there, please navigate to the 'Advanced\n"
              " settings' blade, and then go to 'Data' > 'Custom Logs'. There you should be\n"
              " able to see any custom logs you may have.\n")
        using_cl = get_input("Are you currently using custom logs? (y/n)",
                             (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                             "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
        # The validator accepts any letter case, so compare case-insensitively
        # too; otherwise "N" / "No" would be treated as a yes.
        if using_cl.lower() in ['n', 'no']:
            # not using custom logs
            print("Continuing on with the rest of the troubleshooter...")
            print("================================================================================")
            return prev_success
        # using custom logs
        print("Continuing on with troubleshooter...")
        print("--------------------------------------------------------------------------------")

    print("CHECKING FOR CUSTOM LOG ISSUES...")

    success = prev_success

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # check installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if checked_la_endpts != NO_ERROR:
        print_errors(checked_la_endpts)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        # record what is missing and where it was expected
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running != NO_ERROR:
        print_errors(checked_omsagent_running)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # check customlog.conf
    print("Checking for custom log configuration files...")
    checked_clconf = check_customlog_conf(interactive)
    if is_error(checked_clconf):
        return print_errors(checked_clconf)
    success = print_errors(checked_clconf)

    return success
def check_heartbeat(interactive, prev_success=NO_ERROR):
    """Run the heartbeat / general health section of the troubleshooter.

    Verifies the agent is installed and a workspace ID is available
    (delegating to the installation or connection section otherwise), then
    checks multihoming, whether omsagent is running (trying to start it when
    check_omsagent_running reports ERR_OMS_WONT_RUN), and heartbeat errors
    in omsagent.log.

    Args:
        interactive: forwarded to the other troubleshooter sections when one
            of them has to be run.
        prev_success: status to start from.

    Returns:
        The result of the delegated section when delegation happens;
        otherwise the value print_errors() returned for the first failing
        check, or for the last check that was run.
    """
    print("CHECKING HEARTBEAT / HEALTH...")

    success = prev_success

    # TODO: run `sh /opt/microsoft/omsagent/bin/omsadmin.sh -l` to check if onboarded and running

    # check if installed correctly
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        # record what is missing and where it was expected, then let the
        # connection section dig further
        error_info.append(('Workspace ID', OMSADMIN_CONF_PATH))
        print_errors(ERR_INFO_MISSING)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check if running multi-homing
    print("Checking if omsagent is trying to run multihoming...")
    checked_multihoming = check_multihoming(workspace_id)
    if is_error(checked_multihoming):
        return print_errors(checked_multihoming)
    success = print_errors(checked_multihoming)

    # TODO: check if other agents are sending heartbeats

    # check if omsagent is running
    print("Checking if omsagent is running...")
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running == ERR_OMS_WONT_RUN:
        # try starting omsagent; the start result replaces the check result
        # TODO: find better way of doing this, check to see if agent is stopped / grab results
        checked_omsagent_running = start_omsagent(workspace_id)
    if is_error(checked_omsagent_running):
        return print_errors(checked_omsagent_running)
    success = print_errors(checked_omsagent_running)

    # check if omsagent.log finds any heartbeat errors
    print("Checking for errors in omsagent.log...")
    checked_log_hb = check_log_heartbeat(workspace_id)
    if is_error(checked_log_hb):
        if checked_log_hb == ERR_HEARTBEAT:
            # connection issue: hand over to the connection section.
            # `interactive` must be passed explicitly -- it is a required
            # positional parameter of check_connection, and omitting it
            # raised TypeError on this path.
            print_errors(checked_log_hb)
            print("Running the connection part of the troubleshooter in order to find the issue...")
            print("================================================================================")
            return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)
        # other issue
        return print_errors(checked_log_hb)
    success = print_errors(checked_log_hb)

    return success
def check_high_cpu_memory(interactive, prev_success=NO_ERROR):
    """Run the high CPU / memory usage section of the troubleshooter.

    First verifies that the agent is installed, connected and running; if one
    of those fails, control is handed to the matching troubleshooter section
    (installation, connection or heartbeat) with prev_success=ERR_FOUND.
    Then checks log rotation, OMI CPU usage and slab memory / dentry cache
    usage.

    Args:
        interactive: forwarded to the other troubleshooter sections when one
            of them has to be run.
        prev_success: status to start from.

    Returns:
        The result of the delegated section when a prerequisite fails;
        ERR_INFO_MISSING when the workspace ID cannot be looked up; otherwise
        the value print_errors() returned for the first failing check, or for
        the last check that was run.
    """
    print("CHECKING FOR HIGH CPU / MEMORY USAGE...")

    success = prev_success

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # check installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if checked_la_endpts != NO_ERROR:
        print_errors(checked_la_endpts)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        # record what is missing and where it was expected
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running != NO_ERROR:
        print_errors(checked_omsagent_running)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # TODO: decide if should keep this in or not
    # check disk space
    # print("Checking recent modifications to largest files...")
    # checked_disk_space = check_disk_space()
    # if (checked_disk_space != NO_ERROR):
    #     return print_errors(checked_disk_space)

    # check log rotation
    print("Checking if log rotation is working correctly...")
    checked_logrot = check_log_rotation()
    if is_error(checked_logrot):
        return print_errors(checked_logrot)
    success = print_errors(checked_logrot)

    # check CPU capacity
    print("Checking if OMI is at 100% CPU (may take some time)...")
    checked_highcpu = check_omi_cpu()
    if is_error(checked_highcpu):
        return print_errors(checked_highcpu)
    success = print_errors(checked_highcpu)

    # check slab memory / dentry cache issue
    print("Checking slab memory / dentry cache usage...")
    checked_slabmem = check_slab_memory()
    if is_error(checked_slabmem):
        return print_errors(checked_slabmem)
    # route the non-error outcome through print_errors as the checks above
    # do, so it is reported and the returned status has the same form
    # (previously the raw code was stored unprinted)
    success = print_errors(checked_slabmem)

    return success