def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)
    try:
        service_job_mapping = build_service_job_mapping(client, configured_jobs)
        for service_instance, chronos_job in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            try:
                chronos_job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                log.info("Skipping %s because no deployments are available" % service)
                continue
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                chronos_job=chronos_job,
                client=client,
            )
            if sensu_status is not None:
                send_event(chronos_job_config, sensu_status, sensu_output)
    except chronos.ChronosAPIError as e:
        log.error("CRITICAL: Unable to contact Chronos! Error: %s" % e)
        sys.exit(2)
def main(args):
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    # get those jobs listed in configs
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        sensu_output, sensu_status = sensu_message_status_for_jobs(service, instance, job_state_pairs)
        monitoring_overrides = compose_monitoring_overrides_for_service(
            cluster=system_paasta_config.get_cluster(),
            service=service,
            instance=instance,
            soa_dir=args.soa_dir,
        )
        send_event_to_sensu(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=args.soa_dir,
        )
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()
    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)
    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose, humanize_output=args.humanize)
    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]
    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]
    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))
    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)
    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)
    if not all([mesos_ok, marathon_ok, chronos_ok]):
        sys.exit(2)
    else:
        sys.exit(0)
def main():
    args = parse_args()
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    # get_chronos_jobs_for_cluster returns (service, job)
    expected_service_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    # filter jobs not related to paasta
    # and decompose into (service, instance, tag) tuples
    paasta_jobs = filter_paasta_jobs(deployed_job_names(client))
    running_service_jobs = [chronos_tools.decompose_job_id(job) for job in paasta_jobs]
    to_delete = jobs_to_delete(expected_service_jobs, running_service_jobs)
    # recompose the job ids again for deletion
    to_delete_job_ids = [chronos_tools.compose_job_id(*job) for job in to_delete]
    task_responses = cleanup_tasks(client, to_delete_job_ids)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)
    job_responses = cleanup_jobs(client, to_delete_job_ids)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
    if len(to_delete) == 0:
        print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))
        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            print(format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures]))
        if len(job_successes) > 0:
            print(format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes]))
        if len(job_failures) > 0:
            print(format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures]))
    if len(job_failures) > 0 or len(task_failures) > 0:
        sys.exit(1)
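# The cleanup flow above round-trips job ids through
# chronos_tools.decompose_job_id and chronos_tools.compose_job_id.
# A minimal sketch of that round-trip, assuming a simple spacer-joined
# id format; the real helpers live in paasta_tools.chronos_tools and
# handle tags, spacers, and edge cases that this toy version does not.
SPACER = ' '  # assumed separator, for illustration only


def compose_job_id(service, instance, *extra):
    # join the components into a single chronos job id
    return SPACER.join((service, instance) + extra)


def decompose_job_id(job_id):
    # split a chronos job id back into its components
    return tuple(job_id.split(SPACER))


assert decompose_job_id(compose_job_id('example_service', 'main')) == ('example_service', 'main')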
def chronos_instance_status(instance_status, service, instance, verbose):
    cstatus = {}
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
    )
    cstatus['desired_state'] = job_config.get_desired_state()
    job_type = chronos_tools.get_job_type(job_config.config_dict)
    if job_type == chronos_tools.JobType.Scheduled:
        schedule_type = 'schedule'
        schedule = job_config.get_schedule()
        epsilon = job_config.get_epsilon()
        time_zone = job_config.get_schedule_time_zone()
        if time_zone == 'null' or time_zone is None:
            time_zone = 'UTC'
        cstatus['schedule'] = {}
        cstatus['schedule']['schedule'] = schedule
        cstatus['schedule']['epsilon'] = epsilon
        cstatus['schedule']['time_zone'] = time_zone
    elif job_type == chronos_tools.JobType.Dependent:
        schedule_type = 'parents'
        parents = job_config.get_parents()
        cstatus['parents'] = parents
    else:
        schedule_type = 'unknown'
    cstatus['schedule_type'] = schedule_type
    cstatus['status'] = {}
    if verbose:
        running_task_count = len(
            select_tasks_by_id(
                a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                job_config.get_job_name(),
            ),
        )
        cstatus['status']['mesos_state'] = 'running' if running_task_count else 'not_running'
    cstatus['status']['disabled_state'] = 'not_scheduled' if job_config.get_disabled() else 'scheduled'
    cstatus['status']['chronos_state'] = chronos_tools.get_chronos_status_for_job(client, service, instance)
    cstatus['command'] = job_config.get_cmd()
    last_time, last_status = chronos_tools.get_status_last_run(job_config.config_dict)
    if last_status == chronos_tools.LastRunState.Success:
        last_status = 'success'
    elif last_status == chronos_tools.LastRunState.Fail:
        last_status = 'fail'
    elif last_status == chronos_tools.LastRunState.NotRun:
        last_status = 'not_run'
    else:
        last_status = ''
    if last_status == 'not_run' or last_status == '':
        last_time = 'never'
    cstatus['last_status'] = {}
    cstatus['last_status']['result'] = last_status
    cstatus['last_status']['time'] = last_time
    return cstatus
def chronos_instance_status(service: str, instance: str, verbose: int) -> Dict[str, Any]:
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    return {
        "output": chronos_serviceinit.status_chronos_jobs(
            client,
            service,
            instance,
            settings.cluster,
            settings.soa_dir,
            verbose,
        ),
    }
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)
    try:
        service_job_mapping = build_service_job_mapping(client, configured_jobs)
        for service_instance, job_state_pairs in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            try:
                chronos_job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                paasta_print(utils.PaastaColors.cyan("Skipping %s because no deployments are available" % service))
                continue
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs,
            )
            if sensu_status is not None:
                monitoring_overrides = compose_monitoring_overrides_for_service(
                    chronos_job_config=chronos_job_config,
                    soa_dir=soa_dir,
                )
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides=monitoring_overrides,
                    status_code=sensu_status,
                    message=sensu_output,
                    soa_dir=soa_dir,
                )
    except chronos.ChronosAPIError as e:
        paasta_print(utils.PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
        sys.exit(2)
def main():
    args = parse_args()
    cluster = load_system_paasta_config().get_cluster()
    service, instance = chronos_tools.decompose_job_id(args.service_instance)
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = chronos_tools.load_chronos_job_config(
        service, instance, system_paasta_config.get_cluster(), soa_dir=args.soa_dir)
    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError) as e:
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        print(error_msg)
        raise e
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster)
            + "Error was: %s" % str(e))
        print(error_msg)
        raise e
    except chronos_tools.InvalidParentError as e:
        raise e
    # complete_job_config is a formatted version of the job, so the command is
    # formatted in the context of 'now'; replace it with the 'original' cmd so
    # it can be re-rendered
    original_command = chronos_job_config.get_cmd()
    complete_job_config['command'] = original_command
    clone = clone_job(
        complete_job_config,
        datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"),
    )
    client.add(clone)
def check_chronos_jobs():
    config = load_chronos_config()
    if not config:
        paasta_print("UNKNOWN: Failed to load chronos config")
        sys.exit(3)
    client = get_chronos_client(config)
    try:
        result = assert_chronos_scheduled_jobs(client)
    except ChronosAPIError as e:
        paasta_print("CRITICAL: Unable to connect to Chronos: %s" % e.message)
        sys.exit(2)
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: " + result.message)
        sys.exit(2)
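# check_chronos_jobs above follows the Nagios plugin exit-code convention:
# 0 for OK, 2 for CRITICAL, 3 for UNKNOWN. A minimal sketch of running it
# as a standalone check script; the __main__ guard is an assumption and
# not part of the original module.
if __name__ == '__main__':
    check_chronos_jobs()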
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)
    try:
        service_job_mapping = build_service_job_mapping(client, configured_jobs)
        for service_instance, job_state_pairs in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            chronos_job_config = load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs,
            )
            monitoring_overrides = compose_monitoring_overrides_for_service(
                chronos_job_config=chronos_job_config,
                soa_dir=soa_dir,
            )
            send_event(
                service=service,
                instance=instance,
                monitoring_overrides=monitoring_overrides,
                status_code=sensu_status,
                message=sensu_output,
                soa_dir=soa_dir,
            )
    except (ServerNotFoundError, chronos.ChronosAPIError, socket_error) as e:
        print(utils.PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
        sys.exit(2)
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)
    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        chronos_job_config = load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        sensu_output, sensu_status = sensu_message_status_for_jobs(
            chronos_job_config=chronos_job_config,
            service=service,
            instance=instance,
            cluster=cluster,
            job_state_pairs=job_state_pairs,
        )
        monitoring_overrides = compose_monitoring_overrides_for_service(
            chronos_job_config=chronos_job_config,
            soa_dir=soa_dir,
        )
        send_event(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=soa_dir,
        )
def main():
    args = parse_args()
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    service, instance = chronos_tools.decompose_job_id(args.service_instance)
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    related_jobs = chronos_tools.get_related_jobs_configs(cluster, service, instance, soa_dir=args.soa_dir)
    if not related_jobs:
        error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
            args.service_instance, cluster,
        )
        paasta_print(error_msg)
        raise NoDeploymentsAvailable
    if not args.run_all_related_jobs:
        # Strip the configuration of the related jobs;
        # that information will not be used by the rest of the flow
        related_jobs = {
            (service, instance): related_jobs[(service, instance)],
        }
    complete_job_configs = {}
    for (srv, inst) in related_jobs:
        try:
            complete_job_configs.update(
                {
                    (srv, inst): chronos_tools.create_complete_config(
                        service=srv,
                        job_name=inst,
                        soa_dir=args.soa_dir,
                    ),
                },
            )
        except (NoDeploymentsAvailable, NoDockerImageError) as e:
            error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
                chronos_tools.compose_job_id(srv, inst), cluster,
            )
            paasta_print(error_msg)
            raise e
        except NoConfigurationForServiceError as e:
            error_msg = "Could not read chronos configuration file for {} in cluster {}\nError was: {}".format(
                chronos_tools.compose_job_id(srv, inst), cluster, str(e),
            )
            paasta_print(error_msg)
            raise e
        except chronos_tools.InvalidParentError as e:
            raise e
    if not args.run_all_related_jobs:
        sorted_jobs = [(service, instance)]
    else:
        sorted_jobs = chronos_tools.topological_sort_related_jobs(cluster, service, instance, soa_dir=args.soa_dir)
    timestamp = datetime.datetime.utcnow().isoformat()
    chronos_to_add = []
    for (service, instance) in sorted_jobs:
        # complete_job_config is a formatted version of the job, so the command
        # is formatted in the context of 'now'; replace it with the 'original'
        # cmd so it can be re-rendered
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=args.soa_dir,
        )
        original_command = chronos_job_config.get_cmd()
        complete_job_config = complete_job_configs[(service, instance)]
        complete_job_config['command'] = original_command
        clone = clone_job(
            chronos_job=complete_job_config,
            timestamp=timestamp,
            force_disabled=args.force_disabled,
        )
        # modify the command to run commands for a given date
        clone = modify_command_for_date(
            chronos_job=clone,
            date=datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"),
            verbose=args.verbose,
        )
        if not args.run_all_related_jobs and chronos_tools.get_job_type(clone) == chronos_tools.JobType.Dependent:
            # If the job is a dependent job and we want to re-run only the specific instance,
            # remove the parents and update the schedule to start the job as soon as possible
            clone = set_default_schedule(remove_parents(clone))
        chronos_to_add.append(clone)
    for job_to_add in chronos_to_add:
        client.add(job_to_add)
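# The rerun flow above parses --execution-date with a fixed format string.
# A small self-contained check of the exact format it accepts; the date
# value here is a hypothetical example.
import datetime

parsed = datetime.datetime.strptime("2016-04-08T12:00:00", "%Y-%m-%dT%H:%M:%S")
assert parsed.isoformat() == "2016-04-08T12:00:00"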
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()
    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)
    mesos_state_status = get_mesos_state_status(mesos_state=mesos_state)
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)
    all_mesos_results = mesos_state_status + mesos_metrics_status
    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()
    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]
    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))
    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)
    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False
    if args.verbose == 0:
        print(mesos_summary)
        print(marathon_summary)
        print(chronos_summary)
    elif args.verbose == 1:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value, healthcheck_utilization_pairs, args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single
            # slave having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value, healthcheck_utilization_pairs, args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    print("Master paasta_tools version: {0}".format(__version__))
    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def perform_command(command, service, instance, cluster, verbose, soa_dir):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_job_config = chronos_tools.create_complete_config(service, instance, soa_dir=soa_dir)
    job_id = complete_job_config["name"]
    if command == "start":
        start_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "stop":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
            include_temporary=True,
        )
        stop_chronos_job(service, instance, client, cluster, matching_jobs, emergency=True)
    elif command == "restart":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        restart_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            matching_jobs=matching_jobs,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "status":
        # Verbose mode shows previous versions.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        sorted_matching_jobs = chronos_tools.sort_jobs(matching_jobs)
        job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        paasta_print(status_chronos_jobs(client, sorted_matching_jobs, job_config, verbose))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
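# A hedged usage sketch for perform_command above. The service, instance,
# cluster, and soa_dir values are hypothetical placeholders, not names
# from the original code.
import sys

returncode = perform_command(
    command='status',
    service='example_service',
    instance='main',
    cluster='example_cluster',
    verbose=1,
    soa_dir='/path/to/soa-configs',  # hypothetical soa-configs checkout
)
sys.exit(returncode)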
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    running_jobs = set(deployed_job_names(client))
    expected_service_jobs = {
        chronos_tools.compose_job_id(*job)
        for job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    }
    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(filter_expired_tmp_jobs(client, all_tmp_jobs, cluster=cluster, soa_dir=soa_dir))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs
    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs
    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)
    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                (service, instance) = chronos_tools.decompose_job_id(response[0])
                monitoring_tools.send_event(
                    check_name=check_chronos_job_name(service, instance),
                    service=service,
                    overrides={},
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    output="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # just move on; no need to send any kind of paasta event.
                pass
    if len(to_delete) == 0:
        paasta_print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            paasta_print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))
        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            paasta_print(format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures]))
        if len(job_successes) > 0:
            paasta_print(format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes]))
        if len(job_failures) > 0:
            paasta_print(format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures]))
    if len(job_failures) > 0 or len(task_failures) > 0:
        sys.exit(1)
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()
    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)
    mesos_state_status = get_mesos_state_status(mesos_state=mesos_state)
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)
    all_mesos_results = mesos_state_status + mesos_metrics_status
    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [HealthCheckResult(message="Marathon is not configured to run here", healthy=True)]
    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except PaastaNotConfiguredError:
        chronos_results = [HealthCheckResult(message="Chronos is not configured to run here", healthy=True)]
    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))
    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)
    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False
    if args.verbose == 0:
        print(mesos_summary)
        print(marathon_summary)
        print(chronos_summary)
    elif args.verbose == 1:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"],
                    free=resource_info_dict["free"],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print(marathon_summary)
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print(chronos_summary)
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print(mesos_summary)
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"],
                    free=resource_info_dict["free"],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print_with_indent("Per Slave Utilization", 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave["hostname"], mesos_state)
        all_rows = [["Hostname", "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single
        # slave having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info_dict["total"],
                free=resource_info_dict["free"],
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(
                get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
            )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)
    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import unicode_literals

from paasta_tools import chronos_tools

if __name__ == '__main__':
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    jobs = [job['name'] for job in client.list()]
    for job in jobs:
        client.delete(job)
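# The script above deletes every job Chronos knows about. A more cautious
# variant, sketched here, would delete only paasta-managed jobs via the
# filter_paasta_jobs helper used by the cleanup flows above; the import
# path for that helper is an assumption.
from paasta_tools import chronos_tools
from paasta_tools.cleanup_chronos_jobs import filter_paasta_jobs  # assumed location

if __name__ == '__main__':
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    paasta_jobs = filter_paasta_jobs([job['name'] for job in client.list()])
    for job in paasta_jobs:
        client.delete(job)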
def main(argv=None):
    marathon_config = None
    chronos_config = None
    args = parse_args(argv)
    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)
    mesos_state_status = metastatus_lib.get_mesos_state_status(mesos_state=mesos_state)
    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)
    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks
    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()
    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except (MarathonError, InternalServerError, ValueError) as e:
            # catch ValueError until marathon-python/pull/167 is merged and this is handled upstream
            paasta_print(PaastaColors.red(
                "CRITICAL: Unable to contact Marathon cluster at {}! "
                "Is the cluster healthy?".format(marathon_config["url"])
            ))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(
            message='Marathon is not configured to run here',
            healthy=True,
        )]
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]
    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))
    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)
    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False
    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function, mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value, healthcheck_utilization_pairs, args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = reduce(lambda x, y: x + [y], get_autoscaling_info_for_all_resources(), [headers])
            for line in format_table(table):
                print_with_indent(line, 4)
        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'], mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)']]
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single
            # slave having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value, healthcheck_utilization_pairs, args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)
    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def main():
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance." %
                  (args.service_instance, SPACER))
        sys.exit(1)
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()
    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        send_event(
            service=service,
            instance=None,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster)
            + "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warn("Skipping %s.%s: Parent job could not be found" % (service, instance))
        sys.exit(0)
    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=complete_job_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
def main():
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance." %
                  (args.service_instance, SPACER))
        sys.exit(1)
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()
    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster)
            + "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warn("Skipping %s.%s: Parent job could not be found" % (service, instance))
        sys.exit(0)
    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=complete_job_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
def main():
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance." %
                  (args.service_instance, SPACER))
        sys.exit(1)
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()
    try:
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        error_msg = "No deployments found for %s in cluster %s" % (args.service_instance, cluster)
        send_event(
            service=service,
            instance=None,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)
    except chronos_tools.InvalidChronosConfigError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster)
            + "Error was: %s" % str(e))
        log.error(error_msg)
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)
    complete_job_config = chronos_tools.create_complete_config(
        service=service,
        job_name=instance,
        soa_dir=soa_dir,
    )
    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        chronos_job_config=chronos_job_config,
        complete_job_config=complete_job_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
def main(argv: Optional[List[str]] = None) -> None:
    chronos_config = None
    args = parse_args(argv)
    system_paasta_config = load_system_paasta_config()
    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)
    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))
    try:
        mesos_state = a_sync.block(master.state)
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % '\n'.join(e.args)))
        sys.exit(2)
    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]
    marathon_results = _run_marathon_checks(marathon_clients)
    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))
    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)
    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False
    paasta_print(f"Master paasta_tools version: {__version__}")
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        print_with_indent('Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)
        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = [headers] + [
                [str(x) for x in asi]
                for asi in get_autoscaling_info_for_all_resources(mesos_state)
            ]
            for line in format_table(table):
                print_with_indent(line, 4)
        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single
            # slave having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is
            # "Agent count", which will always be 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)
    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def main(argv=None):
    chronos_config = None
    args = parse_args(argv)
    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)

    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(),
                'CPU (used/total)',
                'RAM (used/total)',
                'Disk (used/total)',
                'GPU (used/total)',
                'Agent count',
            ]]
            table_rows = []
            # iterate under a fresh name so we don't shadow resource_info_dict
            for attribute_value, resource_info in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = functools.reduce(
                lambda x, y: x + [y],
                get_autoscaling_info_for_all_resources(mesos_state),
                [headers],
            )
            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)', 'GPU (used/total)']]
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
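# Aside: the functools.reduce call above is just an append-fold that tacks each
# autoscaling row onto a list seeded with the header row. A minimal,
# self-contained illustration of the equivalence (toy rows, not real
# paasta_tools structures):
import functools

_headers = ['Pool', 'CPU (used/total)', 'RAM (used/total)']
_rows = [('default', '2/4', '1G/2G'), ('batch', '1/8', '2G/8G')]
_folded = functools.reduce(lambda x, y: x + [y], _rows, [_headers])
assert _folded == [_headers] + list(_rows)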
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance." % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
            args.service_instance,
            cluster,
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoConfigurationForServiceError as e:
        error_msg = (
            f"Could not read chronos configuration file for {args.service_instance} in cluster {cluster}\n"
            + "Error was: %s" % str(e)
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoSlavesAvailableError as e:
        error_msg = (
            f"There are no PaaSTA slaves that can run {args.service_instance} in cluster {cluster}\n"
            + "Double check the cluster and the configured constraints/pool/whitelist.\n"
            "Error was: %s" % str(e)
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warning(f"Skipping {service}.{instance}: Parent job could not be found")
        sys.exit(0)

    modified_config = config_with_historical_stats(
        chronos_client=client,
        service=service,
        instance=instance,
        job_config=complete_job_config,
    )

    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=modified_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
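# Aside: the Sensu mapping at the end of main() follows the convention that any
# truthy status from setup_job means failure. A toy sketch of that convention;
# to_sensu_status is an illustrative helper, not paasta_tools API:
import pysensu_yelp

def to_sensu_status(setup_status):
    # truthy setup status -> CRITICAL, falsy -> OK
    return pysensu_yelp.Status.CRITICAL if setup_status else pysensu_yelp.Status.OK

assert to_sensu_status(0) == pysensu_yelp.Status.OK
assert to_sensu_status(1) == pysensu_yelp.Status.CRITICAL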
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()
    chronos_config = None
    args = parse_args(argv)
    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True
        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master, mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL: %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(message="Marathon is not configured to run here", healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(message="Kubernetes is not configured to run here", healthy=True)
        ]

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            raise FatalError(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(message="Chronos is not configured to run here", healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check("Kubernetes", kube_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)

    if args.verbose > 1 and mesos_available:
        print_with_indent("Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = [headers] + [[str(x) for x in asi] for asi in get_autoscaling_info_for_all_resources(mesos_state)]
            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent("Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is
            # "Agent count", which will always be 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok, kube_results, args.verbose)

    if args.verbose > 1 and kube_available:
        print_with_indent("Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent("Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is
            # "Agent count", which will always be 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        raise FatalError(2)
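# Aside: print_output() raises FatalError(2) rather than calling sys.exit()
# itself, so a thin entry point can own process exit. A sketch of such a
# wrapper; the exit_code attribute on FatalError is an assumption here, not
# taken from the source:
import sys

class FatalError(Exception):
    def __init__(self, exit_code):
        super().__init__(exit_code)
        self.exit_code = exit_code

def main(argv=None):
    try:
        print_output(argv)  # the function defined above
    except FatalError as e:
        sys.exit(e.exit_code)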
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    running_jobs = set(deployed_job_names(client))
    expected_service_jobs = {
        chronos_tools.compose_job_id(*job)
        for job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    }

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(filter_expired_tmp_jobs(client, all_tmp_jobs))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                service, instance = chronos_tools.decompose_job_id(response[0])
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides={},
                    soa_dir=soa_dir,
                    status_code=pysensu_yelp.Status.OK,
                    message="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # just move on; no need to send any kind of paasta event.
                pass

    if len(to_delete) == 0:
        print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            print(format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures]))

        if len(job_successes) > 0:
            print(format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes]))

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            print(format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures]))

    if len(job_failures) > 0 or len(task_failures) > 0:
        sys.exit(1)
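# Aside: the deletion set above is plain set arithmetic -- everything currently
# running that is neither an expected configured job nor a still-valid
# temporary job. A toy illustration with invented job ids:
running_jobs = {'svc1 main', 'svc2 batch', 'tmp-abc old', 'tmp-def new'}
expected_service_jobs = {'svc1 main', 'svc2 batch'}
valid_tmp_jobs = {'tmp-def new'}
assert running_jobs - expected_service_jobs - valid_tmp_jobs == {'tmp-abc old'}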
def perform_command(command, service, instance, cluster, verbose, soa_dir):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: directory to read SOA configs from
    :returns: A unix-style return code
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_job_config = chronos_tools.create_complete_config(service, instance, soa_dir=soa_dir)
    job_id = complete_job_config["name"]

    if command == "start":
        start_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "stop":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
            include_temporary=True,
        )
        stop_chronos_job(service, instance, client, cluster, matching_jobs, emergency=True)
    elif command == "restart":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        restart_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            matching_jobs=matching_jobs,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "status":
        # Verbose mode shows previous versions.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        sorted_matching_jobs = chronos_tools.sort_jobs(matching_jobs)
        job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        paasta_print(status_chronos_jobs(client, sorted_matching_jobs, job_config, verbose))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
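# Aside: the elif chain in perform_command() could equivalently be written as
# a dispatch table; a minimal sketch of the idea (the handler names below are
# invented, not paasta_tools functions):
def _start(**kwargs):
    ...

def _stop(**kwargs):
    ...

_HANDLERS = {'start': _start, 'stop': _stop}

def dispatch(command, **kwargs):
    try:
        handler = _HANDLERS[command]
    except KeyError:
        raise NotImplementedError('Command %s is not implemented!' % command)
    return handler(**kwargs)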
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    print("Master paasta_tools version: {0}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)

    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function, mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            # iterate under a fresh name so we don't shadow resource_info_dict
            for attribute_value, resource_info in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def chronos(self) -> ChronosClient:
    if self._chronos is None:
        chronos_config = chronos_tools.load_chronos_config()
        self._chronos = chronos_tools.get_chronos_client(chronos_config, cached=self._cached)
    return self._chronos
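# Aside: a self-contained sketch of the lazily-memoized-client pattern the
# accessor above uses; the class and the dict standing in for the real client
# are illustrative, not paasta_tools code:
class LazyClientHolder:
    """Build an expensive client once, on first access, then reuse it."""

    def __init__(self, cached=False):
        self._cached = cached
        self._client = None

    @property
    def client(self):
        if self._client is None:
            # stand-in for load_chronos_config() + get_chronos_client(...)
            self._client = {'cached': self._cached}
        return self._client

holder = LazyClientHolder(cached=True)
assert holder.client is holder.client  # built once, reused thereafter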