def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not all([mesos_ok, marathon_ok, chronos_ok]):
        sys.exit(2)
    else:
        sys.exit(0)
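# main() above leans on a few small helpers that are not shown in this
# listing. A minimal sketch of one of them, assuming results are
# (message, ok) pairs as the literal
# [('marathon is not configured to run here', True)] suggests; the real
# status_for_results() may differ, but the later HealthCheckResult
# namedtuple (message, healthy) unpacks the same way:
def status_for_results(results):
    """Extract just the boolean half of each (message, ok) result."""
    return [ok for _, ok in results]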
def test_get_mesos_state_from_leader_raises_on_non_elected_leader():
    # Non-elected leaders do not return 'elected_time' in their state
    # because they were not elected.
    un_elected_fake_state = {
        "activated_slaves": 3,
        "cluster": "test",
        "completed_frameworks": [],
        "deactivated_slaves": 0,
        "failed_tasks": 1,
    }
    mesos.cli.master.CURRENT.state = un_elected_fake_state
    with raises(mesos_tools.MasterNotAvailableException):
        assert mesos_tools.get_mesos_state_from_leader() == un_elected_fake_state
def test_get_mesos_state_from_leader_works_on_elected_leader():
    # Elected leaders return 'elected_time' to indicate when
    # they were elected.
    good_fake_state = {
        "activated_slaves": 3,
        "cluster": "test",
        "completed_frameworks": [],
        "deactivated_slaves": 0,
        "elected_time": 1439503288.00787,
        "failed_tasks": 1,
    }
    mesos.cli.master.CURRENT.state = good_fake_state
    assert mesos_tools.get_mesos_state_from_leader() == good_fake_state
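# The two tests above pin down the contract of get_mesos_state_from_leader():
# state fetched from a non-elected master lacks 'elected_time' and must raise
# MasterNotAvailableException; state from the elected leader is returned
# unchanged. A minimal sketch consistent with those tests (the actual
# implementation in mesos_tools may differ):
def get_mesos_state_from_leader():
    """Fetch mesos state, verifying it came from the elected leader."""
    state = mesos.cli.master.CURRENT.state
    if 'elected_time' not in state:
        raise MasterNotAvailableException(
            "We asked for the current leader's state, "
            "but it was not the elected leader.")
    return state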
def autoscale_local_cluster():
    TARGET_UTILIZATION = 0.8
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        resource_metrics_provider = get_cluster_metrics_provider(resource['type'])
        try:
            utilization = resource_metrics_provider(resource['id'], mesos_state, resource['pool'])
            log.debug("Utilization for %s: %f%%" % (identifier, utilization * 100))
            error = utilization - TARGET_UTILIZATION
            resource_scaler = get_scaler(resource['type'])
            resource_scaler(resource, error)
        except ClusterAutoscalingError as e:
            log.error('%s: %s' % (identifier, e))
def get_mesos_status():
    """Gathers information about the mesos cluster.

    :return: list of (message, ok) tuples, one per healthcheck run against
    the cluster state and metrics, where the string describes the check and
    the bool indicates whether it passed
    """
    state = get_mesos_state_from_leader()
    cluster_results = run_healthchecks_with_param(state, [assert_quorum_size, assert_no_duplicate_frameworks])
    metrics = get_mesos_stats()
    metrics_results = run_healthchecks_with_param(metrics, [
        assert_cpu_health,
        assert_memory_health,
        assert_slave_health,
        assert_tasks_running,
    ])
    return cluster_results + metrics_results
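# get_mesos_status() assumes a generic healthcheck runner. A plausible
# sketch, assuming each check is a callable that takes the parameter and
# returns one (message, ok) tuple; the real run_healthchecks_with_param()
# may differ:
def run_healthchecks_with_param(param, healthchecks):
    """Run each healthcheck against param and collect the results."""
    return [healthcheck(param) for healthcheck in healthchecks]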
def autoscale_local_cluster():
    TARGET_UTILIZATION = 0.8
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        resource_metrics_provider = get_cluster_metrics_provider(resource['type'])
        try:
            utilization = resource_metrics_provider(resource['id'], mesos_state, resource['pool'])
            print "Utilization for %s: %f%%" % (identifier, utilization * 100)
            error = utilization - TARGET_UTILIZATION
            resource_scaler = get_scaler(resource['type'])
            resource_scaler(resource, error)
        except ClusterAutoscalingError as e:
            print '%s: %s' % (identifier, e)  # TODO: write to log
def autoscale_local_cluster(dry_run=False):
    if dry_run:
        log.info("Running in dry_run mode, no changes should be made")
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        log.info("Autoscaling {0}".format(identifier))
        resource_metrics_provider = get_cluster_metrics_provider(resource['type'])
        try:
            current, target = resource_metrics_provider(resource['id'], mesos_state, resource)
            log.info("Target capacity: {0}, Capacity current: {1}".format(target, current))
            resource_scaler = get_scaler(resource['type'])
            if target - current < 0:
                sorted_slaves = sort_slaves_to_kill(mesos_state, pool=resource['pool'])
                log.debug("Slaves by kill preference: {0}".format(sorted_slaves))
            else:
                sorted_slaves = []
            resource_scaler(resource, current, target, sorted_slaves, dry_run)
        except ClusterAutoscalingError as e:
            log.error('%s: %s' % (identifier, e))
def autoscale_local_cluster(dry_run=False):
    if dry_run:
        log.info("Running in dry_run mode, no changes should be made")
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    all_pool_settings = system_config.get_resource_pool_settings()
    mesos_state = get_mesos_state_from_leader()
    for identifier, resource in autoscaling_resources.items():
        pool_settings = all_pool_settings.get(resource['pool'], {})
        log.info("Autoscaling {0}".format(identifier))
        resource_metrics_provider = get_cluster_metrics_provider(resource['type'])
        try:
            current, target = resource_metrics_provider(resource['id'], mesos_state, resource, pool_settings)
            log.info("Target capacity: {0}, Capacity current: {1}".format(target, current))
            resource_scaler = get_scaler(resource['type'])
            if target - current < 0:
                sorted_slaves = sort_slaves_to_kill(mesos_state, pool=resource['pool'])
                log.debug("Slaves by kill preference: {0}".format(sorted_slaves))
            else:
                sorted_slaves = []
            resource_scaler(resource, current, target, sorted_slaves, pool_settings, dry_run)
        except ClusterAutoscalingError as e:
            log.error('%s: %s' % (identifier, e))
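# The autoscaler versions above treat metrics providers and scalers as
# pluggable components looked up by resource['type']. A hypothetical
# provider/scaler pair matching the final call signatures, purely
# illustrative: the names, the 'desired_capacity' key, and the flat-target
# logic are assumptions, not paasta_tools code. It reuses the module-level
# log assumed by the functions above.
def example_metrics_provider(resource_id, mesos_state, resource, pool_settings):
    # Return (current, target) capacity; here "current" is simply the
    # number of slaves in the resource's pool, and the target is whatever
    # the resource config asks for.
    current = len([s for s in mesos_state['slaves']
                   if s.get('attributes', {}).get('pool') == resource['pool']])
    return current, resource.get('desired_capacity', current)

def example_scaler(resource, current, target, sorted_slaves, pool_settings, dry_run):
    delta = target - current
    if dry_run or delta == 0:
        return
    if delta < 0:
        # Scaling down: take victims from the front of the kill-preference list.
        for slave in sorted_slaves[:abs(delta)]:
            log.info("Would terminate %s" % slave)
    else:
        log.info("Would add %d instances to %s" % (delta, resource['id']))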
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)
    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()
    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
        print "Master paasta_tools version: {0}".format(__version__)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
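# This version of main() swaps the bare (message, ok) tuples for a named
# result type. A one-line sketch of what HealthCheckResult presumably is,
# given the message=/healthy= keyword arguments it is built with:
from collections import namedtuple

HealthCheckResult = namedtuple('HealthCheckResult', ['message', 'healthy'])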
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)
    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()
    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print_with_indent('Per Slave Utilization', 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
        all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info_dict['total'],
                free=resource_info_dict['free'],
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(get_table_rows_for_resource_info_dict(
                attribute_value,
                healthcheck_utilization_pairs,
                args.humanize,
            ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
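# Both verbose branches above group utilization with a per-slave key
# function: key_func_for_attribute(grouping) for arbitrary slave attributes,
# and a plain lambda for hostnames. A sketch of the attribute variant,
# assuming mesos slave dicts carry custom attributes under 'attributes'
# (the real helper may handle missing attributes differently):
def key_func_for_attribute(attribute):
    def key_func(slave):
        return slave['attributes'].get(attribute, 'unknown')
    return key_func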