Example #1
0
def main():
    """Check the health of Mesos, Marathon and Chronos on this host.

    Prints a summary plus per-check results for each component and exits
    with status 0 when everything is healthy, 2 otherwise (also 2 when
    the Mesos leader cannot be contacted at all).
    """
    marathon_config = None
    chronos_config = None
    # Pre-bind the "not configured" results so the names are always defined.
    # The original only set them in the except branches, so a load_*_config()
    # call that returned a falsy config WITHOUT raising left them unbound and
    # crashed with NameError at the status_for_results() calls below.
    marathon_results = [('marathon is not configured to run here', True)]
    chronos_results = [('chronos is not configured to run here', True)]
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early.  %s on the exception replaces the
        # deprecated e.message (removed in Python 3).
        print(PaastaColors.red("CRITICAL:  %s" % e))
        sys.exit(2)
    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_config = None

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_config = None

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    # Exit 0 only when every component reported healthy.
    sys.exit(0 if all([mesos_ok, marathon_ok, chronos_ok]) else 2)
def main():
    """Check the health of Mesos, Marathon and Chronos on this host.

    Prints a summary plus per-check results for each component and exits
    with status 0 when everything is healthy, 2 otherwise (also 2 when
    the Mesos leader cannot be contacted at all).
    """
    marathon_config = None
    chronos_config = None
    # Pre-bind the "not configured" results so the names are always defined.
    # The original only set them in the except branches, so a load_*_config()
    # call that returned a falsy config WITHOUT raising left them unbound and
    # crashed with NameError at the status_for_results() calls below.
    marathon_results = [('marathon is not configured to run here', True)]
    chronos_results = [('chronos is not configured to run here', True)]
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early.  %s on the exception replaces the
        # deprecated e.message (removed in Python 3).
        print(PaastaColors.red("CRITICAL:  %s" % e))
        sys.exit(2)
    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_config = None

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_config = None

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    # Exit 0 only when every component reported healthy.
    sys.exit(0 if all([mesos_ok, marathon_ok, chronos_ok]) else 2)
def test_get_mesos_state_from_leader_raises_on_non_elected_leader():
    """A master state lacking 'elected_time' must raise MasterNotAvailableException."""
    # Masters that were never elected omit 'elected_time' from their state.
    state_without_election = {
        "cluster": "test",
        "activated_slaves": 3,
        "deactivated_slaves": 0,
        "completed_frameworks": [],
        "failed_tasks": 1,
    }
    mesos.cli.master.CURRENT.state = state_without_election
    with raises(mesos_tools.MasterNotAvailableException):
        assert mesos_tools.get_mesos_state_from_leader() == state_without_election
def test_get_mesos_state_from_leader_works_on_elected_leader():
    """The state of an elected leader (one with 'elected_time') is returned as-is."""
    elected_state = {
        "cluster": "test",
        "activated_slaves": 3,
        "deactivated_slaves": 0,
        "completed_frameworks": [],
        "failed_tasks": 1,
        # Elected leaders record when they were elected.
        "elected_time": 1439503288.00787,
    }
    mesos.cli.master.CURRENT.state = elected_state
    assert mesos_tools.get_mesos_state_from_leader() == elected_state
Example #5
0
def test_get_mesos_state_from_leader_raises_on_non_elected_leader():
    """A master state lacking 'elected_time' must raise MasterNotAvailableException."""
    # Masters that were never elected omit 'elected_time' from their state.
    state_without_election = {
        "cluster": "test",
        "activated_slaves": 3,
        "deactivated_slaves": 0,
        "completed_frameworks": [],
        "failed_tasks": 1,
    }
    mesos.cli.master.CURRENT.state = state_without_election
    with raises(mesos_tools.MasterNotAvailableException):
        assert mesos_tools.get_mesos_state_from_leader() == state_without_election
Example #6
0
def test_get_mesos_state_from_leader_works_on_elected_leader():
    """The state of an elected leader (one with 'elected_time') is returned as-is."""
    elected_state = {
        "cluster": "test",
        "activated_slaves": 3,
        "deactivated_slaves": 0,
        "completed_frameworks": [],
        "failed_tasks": 1,
        # Elected leaders record when they were elected.
        "elected_time": 1439503288.00787,
    }
    mesos.cli.master.CURRENT.state = elected_state
    assert mesos_tools.get_mesos_state_from_leader() == elected_state
Example #7
0
def autoscale_local_cluster():
    """Scale every configured cluster resource toward the target utilization.

    For each autoscaling resource in the system paasta config, measure its
    utilization, compute the error against the 80% target and pass that
    error to the resource's scaler.  Autoscaling errors are logged and do
    not abort the remaining resources.
    """
    TARGET_UTILIZATION = 0.8

    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    mesos_state = get_mesos_state_from_leader()
    for name, res in autoscaling_resources.items():
        metrics_provider = get_cluster_metrics_provider(res['type'])
        try:
            utilization = metrics_provider(res['id'], mesos_state, res['pool'])
            log.debug("Utilization for %s: %f%%" % (name, utilization * 100))
            scaler = get_scaler(res['type'])
            scaler(res, utilization - TARGET_UTILIZATION)
        except ClusterAutoscalingError as e:
            log.error('%s: %s' % (name, e))
Example #8
0
def get_mesos_status():
    """Gathers information about the mesos cluster.

    Runs the cluster-state healthchecks against the leader's state and the
    metrics healthchecks against the master stats.

    :return: the concatenated results of both healthcheck runs.
        NOTE(review): the original docstring said "tuple of a string ... and
        a bool", but the code returns the combined sequence produced by
        run_healthchecks_with_param — presumably a list of per-check
        (message, ok) results; verify against that helper.
    """

    state = get_mesos_state_from_leader()
    cluster_results = run_healthchecks_with_param(state, [assert_quorum_size, assert_no_duplicate_frameworks])

    metrics = get_mesos_stats()
    metrics_results = run_healthchecks_with_param(metrics, [
        assert_cpu_health,
        assert_memory_health,
        assert_slave_health,
        assert_tasks_running])

    return cluster_results + metrics_results
Example #9
0
def get_mesos_status():
    """Run all mesos cluster healthchecks.

    Returns the results of the cluster-state checks followed by the
    results of the metrics checks.
    """

    cluster_results = run_healthchecks_with_param(
        get_mesos_state_from_leader(),
        [assert_quorum_size, assert_no_duplicate_frameworks],
    )
    metrics_results = run_healthchecks_with_param(
        get_mesos_stats(),
        [
            assert_cpu_health,
            assert_memory_health,
            assert_slave_health,
            assert_tasks_running,
        ],
    )
    return cluster_results + metrics_results
Example #10
0
def autoscale_local_cluster():
    """Scale each configured cluster resource toward the target utilization.

    For every autoscaling resource in the system paasta config, measure its
    current utilization, compute the error against the 80% target and hand
    that error to the resource's scaler.  Errors are reported per resource
    and do not abort the remaining resources.
    """
    TARGET_UTILIZATION = 0.8

    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        resource_metrics_provider = get_cluster_metrics_provider(
            resource['type'])
        try:
            utilization = resource_metrics_provider(resource['id'],
                                                    mesos_state,
                                                    resource['pool'])
            # print() call syntax: the Python-2-only `print` statement used
            # here originally is a SyntaxError on Python 3; with a single
            # argument the output is identical on both versions.
            print("Utilization for %s: %f%%" % (identifier, utilization * 100))
            error = utilization - TARGET_UTILIZATION
            resource_scaler = get_scaler(resource['type'])
            resource_scaler(resource, error)
        except ClusterAutoscalingError as e:
            print('%s: %s' % (identifier, e))  # TODO: write to log
Example #11
0
def autoscale_local_cluster(dry_run=False):
    """Run one autoscaling pass over every configured cluster resource.

    :param dry_run: when True, the scalers are told not to apply changes
    """
    if dry_run:
        log.info("Running in dry_run mode, no changes should be made")
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        log.info("Autoscaling {0}".format(identifier))
        metrics_provider = get_cluster_metrics_provider(resource['type'])
        try:
            current, target = metrics_provider(resource['id'], mesos_state, resource)
            log.info("Target capacity: {0}, Capacity current: {1}".format(target, current))
            scaler = get_scaler(resource['type'])
            # Rank slaves for removal only when we are scaling down.
            sorted_slaves = []
            if target < current:
                sorted_slaves = sort_slaves_to_kill(mesos_state, pool=resource['pool'])
                log.debug("Slaves by kill preference: {0}".format(sorted_slaves))
            scaler(resource, current, target, sorted_slaves, dry_run)
        except ClusterAutoscalingError as e:
            log.error('%s: %s' % (identifier, e))
Example #12
0
def autoscale_local_cluster(dry_run=False):
    """Run one autoscaling pass, honouring per-pool resource settings.

    :param dry_run: when True, the scalers are told not to apply changes
    """
    if dry_run:
        log.info("Running in dry_run mode, no changes should be made")
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    all_pool_settings = system_config.get_resource_pool_settings()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        pool_settings = all_pool_settings.get(resource['pool'], {})
        log.info("Autoscaling {0}".format(identifier))
        metrics_provider = get_cluster_metrics_provider(resource['type'])
        try:
            current, target = metrics_provider(resource['id'], mesos_state, resource, pool_settings)
            log.info("Target capacity: {0}, Capacity current: {1}".format(target, current))
            scaler = get_scaler(resource['type'])
            # Rank slaves for removal only when we are scaling down.
            sorted_slaves = []
            if target < current:
                sorted_slaves = sort_slaves_to_kill(mesos_state, pool=resource['pool'])
                log.debug("Slaves by kill preference: {0}".format(sorted_slaves))
            scaler(resource, current, target, sorted_slaves, pool_settings, dry_run)
        except ClusterAutoscalingError as e:
            log.error('%s: %s' % (identifier, e))
Example #13
0
def main():
    """Check Mesos, Marathon and Chronos health at the requested verbosity.

    Verbosity 0 prints only one summary line per component; 1 adds the
    individual healthcheck results; 2+ adds resource-utilization tables per
    grouping, and 3 adds a per-slave utilization table.  Exits 0 when every
    check (including the utilization thresholds) passes, 2 otherwise.
    """
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early.
        # %s on the exception replaces the deprecated e.message
        # (removed in Python 3).
        print(PaastaColors.red("CRITICAL:  %s" % e))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    # `True if x else False` is just the condition itself.
    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    if args.verbose == 0:
        print(mesos_summary)
        print(marathon_summary)
        print(chronos_summary)
    elif args.verbose == 1:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            # Renamed from resource_info_dict: the original reused that name
            # as the inner loop variable, shadowing the dict being iterated.
            resource_info_by_value = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_value.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                # BUG FIX: accumulate instead of overwriting, so an unhealthy
                # grouping earlier in the loop still fails the run.
                healthy_exit = healthy_exit and all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
        # print() call syntax: the Python-2-only `print` statements used in
        # the original are a SyntaxError on Python 3.
        print("Master paasta_tools version: {0}".format(__version__))

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #14
0
def main():
    """Check Mesos, Marathon and Chronos health at the requested verbosity.

    Verbosity 0 prints only one summary line per component; 1 adds the
    individual healthcheck results; 2 adds per-grouping utilization tables;
    3+ additionally prints a per-slave utilization table.  Exits 0 when
    every check (including utilization thresholds) passes, 2 otherwise.
    """
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early.
        # %s on the exception replaces the deprecated e.message
        # (removed in Python 3).
        print(PaastaColors.red("CRITICAL:  %s" % e))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    # `True if x else False` is just the condition itself.
    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    if args.verbose == 0:
        print(mesos_summary)
        print(marathon_summary)
        print(chronos_summary)
    elif args.verbose == 1:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            # Renamed from resource_info_dict: the original reused that name
            # as the inner loop variable, shadowing the dict being iterated.
            resource_info_by_value = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_value.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                # BUG FIX: accumulate instead of overwriting, so an unhealthy
                # grouping earlier in the loop still fails the run.
                healthy_exit = healthy_exit and all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_by_value = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_value.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                # BUG FIX: accumulate instead of overwriting (see above).
                healthy_exit = healthy_exit and all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print_with_indent('Per Slave Utilization', 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
        all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info['total'],
                free=resource_info['free'],
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(get_table_rows_for_resource_info_dict(
                attribute_value,
                healthcheck_utilization_pairs,
                args.humanize
            ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)