def check_thresholds(percent):
    """Gets the current state of the mesos cluster and compares it
    to a given percentage. If either the ram or CPU utilization is over that
    percentage, the sensu event will be sent with a status code of 2."""
    stats = get_mesos_stats()
    over_threshold = False
    output = ""
    current_mem = stats['master/mem_percent']
    current_cpu = stats['master/cpus_percent']
    percent = int(percent)
    cpu_print_tuple = (percent, current_cpu)
    mem_print_tuple = (percent, current_mem)
    if current_mem >= percent:
        output += "CRITICAL: Memory usage is over %d%%! Currently at %f%%!\n" % mem_print_tuple
        over_threshold = True
    else:
        output += "OK: Memory usage is under %d%%. (Currently at %f%%)\n" % mem_print_tuple
    if current_cpu >= percent:
        output += "CRITICAL: CPU usage is over %d%%! Currently at %f%%!\n" % cpu_print_tuple
        over_threshold = True
    else:
        output += "OK: CPU usage is under %d%%. (Currently at %f%%)\n" % cpu_print_tuple
    if over_threshold is True:
        status = 2
    else:
        status = 0
    send_event(status, output)
    return output
def check_thresholds(percent):
    """Gets the current state of the mesos cluster and compares it
    to a given percentage. If either the ram or CPU utilization is over that
    percentage, the sensu event will be sent with a status code of 2."""
    stats = get_mesos_stats()
    over_threshold = False
    output = ""
    current_mem = stats['master/mem_percent']
    current_cpu = stats['master/cpus_percent']
    percent = int(percent)
    cpu_print_tuple = (percent, current_cpu)
    mem_print_tuple = (percent, current_mem)
    if current_mem >= percent:
        output += "CRITICAL: Memory usage is over %d%%! Currently at %f%%!\n" % mem_print_tuple
        over_threshold = True
    else:
        output += "OK: Memory usage is under %d%%. (Currently at %f%%)\n" % mem_print_tuple
    if current_cpu >= percent:
        output += "CRITICAL: CPU usage is over %d%%! Currently at %f%%!\n" % cpu_print_tuple
        over_threshold = True
    else:
        output += "OK: CPU usage is under %d%%. (Currently at %f%%)\n" % cpu_print_tuple
    if over_threshold is True:
        status = 2
    else:
        status = 0
    send_event(status, output)
    return output
Example #3
0
def get_mesos_status(mesos_state, verbosity):
    """Gathers information about the mesos cluster.
       :return: tuple of a string containing the status and a bool representing if it is ok or not
    """

    cluster_results = run_healthchecks_with_param(
        mesos_state, [assert_quorum_size, assert_no_duplicate_frameworks])

    metrics = get_mesos_stats()
    metrics_results = run_healthchecks_with_param(metrics, [
        assert_cpu_health,
        assert_memory_health,
        assert_disk_health,
        assert_tasks_running,
        assert_slave_health,
    ])

    if verbosity == 2:
        metrics_results.extend(
            run_healthchecks_with_param(mesos_state,
                                        [assert_extra_attribute_data]))
    elif verbosity >= 3:
        metrics_results.extend(
            run_healthchecks_with_param(mesos_state,
                                        [assert_extra_slave_data]))

    return cluster_results + metrics_results
Example #4
0
def get_mesos_status():
    """Gathers information about the mesos cluster.
       :return: tuple of a string containing the status and a bool representing if it is ok or not
    """

    state = get_mesos_state_from_leader()
    cluster_results = run_healthchecks_with_param(state, [assert_quorum_size, assert_no_duplicate_frameworks])

    metrics = get_mesos_stats()
    metrics_results = run_healthchecks_with_param(metrics, [
        assert_cpu_health,
        assert_memory_health,
        assert_slave_health,
        assert_tasks_running])

    return cluster_results + metrics_results
Example #5
0
def get_mesos_status():
    """Gathers information about the mesos cluster.
       :return: tuple of a string containing the status and a bool representing if it is ok or not
    """

    state = get_mesos_state_from_leader()
    cluster_results = run_healthchecks_with_param(
        state, [assert_quorum_size, assert_no_duplicate_frameworks])

    metrics = get_mesos_stats()
    metrics_results = run_healthchecks_with_param(metrics, [
        assert_cpu_health, assert_memory_health, assert_slave_health,
        assert_tasks_running
    ])

    return cluster_results + metrics_results
Example #6
0
def get_mesos_status(mesos_state, verbosity):
    """Gathers information about the mesos cluster.
       :return: tuple of a string containing the status and a bool representing if it is ok or not
    """

    cluster_results = run_healthchecks_with_param(mesos_state, [assert_quorum_size, assert_no_duplicate_frameworks])

    metrics = get_mesos_stats()
    metrics_results = run_healthchecks_with_param(metrics, [
        assert_cpu_health,
        assert_memory_health,
        assert_tasks_running,
        assert_slave_health,
    ])

    if verbosity >= 2:
        metrics_results.extend(run_healthchecks_with_param(mesos_state, [assert_extra_slave_data]))

    return cluster_results + metrics_results
Example #7
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
        print "Master paasta_tools version: {0}".format(__version__)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #8
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print_with_indent('Per Slave Utilization', 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
        all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info_dict['total'],
                free=resource_info_dict['free'],
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(get_table_rows_for_resource_info_dict(
                attribute_value,
                healthcheck_utilization_pairs,
                args.humanize
            ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)