Example #1
0
def check_connection(interactive, err_codes=True, prev_success=NO_ERROR):
    """Run the connection section of the troubleshooter.

    The checks run in a fixed order and the function stops at the first
    result that is_error() flags.

    Args:
        interactive: when truthy, the onboarding error code prompt may be
            shown and the end-to-end query check is run as well.
        err_codes: when truthy (and interactive is truthy), call
            ask_onboarding_error_codes() before running any checks.
        prev_success: starting status; it is replaced by the print_errors()
            result of every check that passes.

    Returns:
        USER_EXIT if the error code prompt returns USER_EXIT; the result of
        check_installation() if get_oms_version() returns None; the
        print_errors() result of the first failing check; otherwise the
        print_errors() result of the last check that ran.
    """
    print("CHECKING CONNECTION...")

    success = prev_success

    # give the user a chance to leave from the onboarding error code prompt
    if interactive and err_codes and ask_onboarding_error_codes() == USER_EXIT:
        return USER_EXIT

    # without an installed agent, hand over to the installation section
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # (message to announce, check to run), in execution order
    connectivity_checks = [
        ("Checking if machine is connected to the internet...",
         check_internet_connect),
        ("Checking if agent service endpoint is connected...",
         check_agent_service_endpt),
        ("Checking if log analytics endpoints are connected...",
         check_log_analytics_endpts),
    ]
    # the query check only runs in interactive mode
    if interactive:
        connectivity_checks.append(
            ("Checking if queries are successful...", check_e2e))

    for announcement, run_check in connectivity_checks:
        print(announcement)
        outcome = run_check()
        if is_error(outcome):
            return print_errors(outcome)
        success = print_errors(outcome)

    return success
Example #2
0
def geninfo_lookup(key):
    """Return the general_info entry for `key`, or None if it is unavailable.

    On a miss, update_geninfo_all() is called once (presumably it repopulates
    general_info - confirm against its definition) and the lookup is retried.

    Args:
        key: the general_info key to look up.

    Returns:
        The stored value, or None when:
          * update_geninfo_all() does not return NO_ERROR (the error is
            printed via print_errors first),
          * the key is still missing after a successful update, or
          * the stored value is an empty string.
    """
    try:
        val = general_info[key]
    except KeyError:
        updated_geninfo = update_geninfo_all()
        if updated_geninfo != NO_ERROR:
            print_errors(updated_geninfo)
            return None
        # The update may not have produced this key. Report it as missing
        # instead of letting a second KeyError escape to the caller.
        try:
            val = general_info[key]
        except KeyError:
            return None
    if val == '':
        return None
    return val
Example #3
0
def check_syslog(interactive, prev_success=NO_ERROR):
    """Run the syslog section of the troubleshooter.

    First confirms the agent is installed, connected and running, handing
    over to the matching troubleshooter section if one of those fails. Then
    checks the service controller, the syslog services and the syslog
    configuration files, stopping at the first result is_error() flags.

    Args:
        interactive: forwarded to any other troubleshooter section that is
            started from here.
        prev_success: starting status; it is replaced by the print_errors()
            result of every check that passes.

    Returns:
        The result of the section that was handed over to, ERR_INFO_MISSING
        when no workspace ID is found, the status of the first failing
        check, or the print_errors() result of the last check.
    """
    print("CHECKING FOR SYSLOG ISSUES...")

    success = prev_success

    # prerequisites: installed / connected / running
    print("Checking if omsagent installed and running...")

    # installed?
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # connected?
    la_endpts_status = check_log_analytics_endpts()
    if la_endpts_status != NO_ERROR:
        print_errors(la_endpts_status)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # running?
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    running_status = check_omsagent_running(workspace_id)
    if running_status != NO_ERROR:
        print_errors(running_status)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # service controller
    print("Checking if machine has a valid service controller...")
    sc_status = check_service_controller()
    if is_error(sc_status):
        # NOTE(review): unlike the other checks here, this error is returned
        # without going through print_errors() - confirm that is intended.
        return sc_status
    success = print_errors(sc_status)

    # rsyslog / syslog-ng
    print("Checking if machine has rsyslog or syslog-ng running...")
    services_status = check_services()
    if is_error(services_status):
        return print_errors(services_status)
    success = print_errors(services_status)

    # syslog.conf and syslog destination file
    print("Checking for syslog configuration files...")
    conf_files_status = check_conf_files()
    if not is_error(conf_files_status):
        success = print_errors(conf_files_status)
        return success

    # only install-related failures are handed to the installation section
    if conf_files_status not in [ERR_OMS_INSTALL, ERR_FILE_MISSING]:
        return print_errors(conf_files_status)

    print_errors(conf_files_status)
    print("Running the installation part of the troubleshooter in order to find the issue...")
    print("================================================================================")
    return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)
Example #4
0
def check_installation(interactive, err_codes=True, prev_success=NO_ERROR):
    """Run the installation section of the troubleshooter.

    Runs the OS, disk space, package manager, package and OMS version
    checks in order, then the installed-files check (only when DFS_PATH is a
    directory) and finally the certificate and RSA key checks. Every check
    up to and including the files check stops the section as soon as
    is_error() flags its result.

    Args:
        interactive: when truthy, the install error code prompt may be
            shown; also forwarded to check_oms().
        err_codes: when truthy (and interactive is truthy), call
            ask_install_error_codes() before running any checks.
        prev_success: starting status; it is replaced by the print_errors()
            result of every later check that reports something.

    Returns:
        USER_EXIT if the error code prompt returns USER_EXIT; the
        print_errors() result of the first failing check; ERR_FOUND if the
        certificate or key check is an error; otherwise the latest status.
    """
    print("CHECKING INSTALLATION...")
    # running status of the section so far
    success = prev_success

    # give the user a chance to leave from the install error code prompt
    if interactive and err_codes and ask_install_error_codes() == USER_EXIT:
        return USER_EXIT

    # (message to announce, check to run), in execution order
    install_checks = [
        ("Checking if running a supported OS version...", check_os),
        ("Checking if enough disk space is available...", check_space),
        ("Checking if machine has a supported package manager...",
         update_pkg_manager),
        ("Checking if packages installed correctly...", check_packages),
        ("Checking if running a supported version of OMS...",
         lambda: check_oms(interactive)),
    ]
    for announcement, run_check in install_checks:
        print(announcement)
        outcome = run_check()
        if is_error(outcome):
            return print_errors(outcome)
        success = print_errors(outcome)

    # installed files can only be verified when DFS_PATH is a directory
    if not os.path.isdir(DFS_PATH):
        print("WARNING (INTERNAL): Datafiles have not been successfully copied over.")
        print("Skipping all files installed correctly check...")
    else:
        print("Checking if all files installed correctly (may take some time)...")
        files_status = check_filesystem(DFS_PATH)
        if is_error(files_status):
            return print_errors(files_status)
        success = print_errors(files_status)

    # certificate and key: both are always checked so both get reported
    print("Checking certificate and RSA key are correct...")
    cert_status = check_cert()
    if cert_status != NO_ERROR:
        success = print_errors(cert_status)
    key_status = check_key()
    if key_status != NO_ERROR:
        success = print_errors(key_status)

    # an error in either one fails the section
    cert_or_key_failed = is_error(cert_status) or is_error(key_status)
    return ERR_FOUND if cert_or_key_failed else success
def check_custom_logs(interactive, prev_success=NO_ERROR):
    """Run the custom logs section of the troubleshooter.

    In interactive mode the user is first asked whether custom logs are in
    use; a "no" answer skips the section. Otherwise the function confirms
    the agent is installed, connected and running (handing over to the
    matching troubleshooter section if not) and then checks the custom log
    configuration.

    Args:
        interactive: when truthy, ask the user whether custom logs are in
            use; also forwarded to check_customlog_conf() and to any section
            that is handed over to.
        prev_success: starting status; returned unchanged when the user
            answers that custom logs are not in use.

    Returns:
        prev_success if the section is skipped; the result of the section
        that was handed over to; ERR_INFO_MISSING when no workspace ID is
        found; otherwise the print_errors() result of the configuration
        check.
    """
    if interactive:
        print(" To check if you are using custom logs, please go to https://ms.portal.azure.com\n"
              " and navigate to your workspace. Once there, please navigate to the 'Advanced\n"
              " settings' blade, and then go to 'Data' > 'Custom Logs'. There you should be\n"
              " able to see any custom logs you may have.\n")
        using_cl = get_input("Are you currently using custom logs? (y/n)",
                             (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                             "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
        # The validator accepts any casing, so compare case-insensitively
        # too; otherwise "N" / "No" would be treated as a "yes". str() keeps
        # a non-string return value from raising here.
        if str(using_cl).lower() in ('n', 'no'):
            # not using custom logs
            print("Continuing on with the rest of the troubleshooter...")
            print("================================================================================")
            return prev_success
        # using custom logs
        print("Continuing on with troubleshooter...")
        print("--------------------------------------------------------------------------------")

    print("CHECKING FOR CUSTOM LOG ISSUES...")

    success = prev_success

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")
    # check installation
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if checked_la_endpts != NO_ERROR:
        print_errors(checked_la_endpts)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running != NO_ERROR:
        print_errors(checked_omsagent_running)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # check customlog.conf
    print("Checking for custom log configuration files...")
    checked_clconf = check_customlog_conf(interactive)
    if is_error(checked_clconf):
        return print_errors(checked_clconf)
    success = print_errors(checked_clconf)

    return success
Example #6
0
def check_heartbeat(interactive, prev_success=NO_ERROR):
    """Run the heartbeat / general health section of the troubleshooter.

    Confirms the agent is installed and a workspace ID is known (handing
    over to the installation or connection section if not), then checks
    multihoming, whether omsagent is running (trying to start it once if it
    is not), and the heartbeat errors reported by check_log_heartbeat().

    Args:
        interactive: forwarded to any other troubleshooter section that is
            started from here.
        prev_success: starting status; it is replaced by the print_errors()
            result of every check that passes.

    Returns:
        The result of the section that was handed over to, the
        print_errors() result of the first failing check, or the
        print_errors() result of the last check.
    """
    print("CHECKING HEARTBEAT / HEALTH...")

    success = prev_success

    # TODO: run `sh /opt/microsoft/omsagent/bin/omsadmin.sh -l` to check if onboarded and running

    # check if installed correctly
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        # NOTE(review): the other sections in this file record OMSADMIN_PATH
        # for a missing workspace ID - confirm OMSADMIN_CONF_PATH is intended.
        error_info.append(('Workspace ID', OMSADMIN_CONF_PATH))
        print_errors(ERR_INFO_MISSING)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check if running multi-homing
    print("Checking if omsagent is trying to run multihoming...")
    checked_multihoming = check_multihoming(workspace_id)
    if is_error(checked_multihoming):
        return print_errors(checked_multihoming)
    success = print_errors(checked_multihoming)

    # TODO: check if other agents are sending heartbeats

    # check if omsagent is running
    print("Checking if omsagent is running...")
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running == ERR_OMS_WONT_RUN:
        # try starting omsagent; its result replaces the original status
        # TODO: find better way of doing this, check to see if agent is stopped / grab results
        checked_omsagent_running = start_omsagent(workspace_id)
    if is_error(checked_omsagent_running):
        return print_errors(checked_omsagent_running)
    success = print_errors(checked_omsagent_running)

    # check if omsagent.log finds any heartbeat errors
    print("Checking for errors in omsagent.log...")
    checked_log_hb = check_log_heartbeat(workspace_id)
    if is_error(checked_log_hb):
        # connection issue: hand over to the connection section
        if checked_log_hb == ERR_HEARTBEAT:
            print_errors(checked_log_hb)
            print("Running the connection part of the troubleshooter in order to find the issue...")
            print("================================================================================")
            # `interactive` is a required positional parameter of
            # check_connection; leaving it out raised a TypeError here.
            return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)
        # other issue
        return print_errors(checked_log_hb)
    success = print_errors(checked_log_hb)

    return success
Example #7
0
def check_high_cpu_memory(interactive, prev_success=NO_ERROR):
    """Run the high CPU / memory usage section of the troubleshooter.

    Confirms the agent is installed, connected and running (handing over to
    the matching troubleshooter section if not), then checks log rotation,
    OMI CPU usage and slab memory, stopping at the first result is_error()
    flags.

    Args:
        interactive: forwarded to any other troubleshooter section that is
            started from here.
        prev_success: starting status; it is replaced by the status of every
            check that passes.

    Returns:
        The result of the section that was handed over to, ERR_INFO_MISSING
        when no workspace ID is found, the print_errors() result of the
        first failing check, or the raw result of check_slab_memory().
    """
    print("CHECKING FOR HIGH CPU / MEMORY USAGE...")

    success = prev_success

    # prerequisites: installed / connected / running
    print("Checking if omsagent installed and running...")

    # installed?
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print(
            "Running the installation part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_installation(interactive,
                                  err_codes=False,
                                  prev_success=ERR_FOUND)

    # connected?
    la_endpts_status = check_log_analytics_endpts()
    if la_endpts_status != NO_ERROR:
        print_errors(la_endpts_status)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive,
                                err_codes=False,
                                prev_success=ERR_FOUND)

    # running?
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    running_status = check_omsagent_running(workspace_id)
    if running_status != NO_ERROR:
        print_errors(running_status)
        print(
            "Running the general health part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # TODO: decide if should keep this in or not
    # check disk space
    # print("Checking recent modifications to largest files...")
    # checked_disk_space = check_disk_space()
    # if (checked_disk_space != NO_ERROR):
    #     return print_errors(checked_disk_space)

    # (message to announce, check to run), in execution order
    usage_checks = [
        ("Checking if log rotation is working correctly...",
         check_log_rotation),
        ("Checking if OMI is at 100% CPU (may take some time)...",
         check_omi_cpu),
    ]
    for announcement, run_check in usage_checks:
        print(announcement)
        outcome = run_check()
        if is_error(outcome):
            return print_errors(outcome)
        success = print_errors(outcome)

    # slab memory / dentry cache issue
    print("Checking slab memory / dentry cache usage...")
    slabmem_status = check_slab_memory()
    if is_error(slabmem_status):
        return print_errors(slabmem_status)
    # NOTE(review): unlike the checks above, a non-error result is kept as-is
    # and never passed to print_errors() - confirm that is intended.
    success = slabmem_status

    return success