def create_complete_app(context):
    with contextlib.nested(
        mock.patch('paasta_tools.marathon_tools.create_complete_config'),
        mock.patch('paasta_tools.marathon_tools.load_marathon_config',
                   return_value=context.marathon_config),
        mock.patch('paasta_tools.marathon_tools.load_system_paasta_config',
                   return_value=context.system_paasta_config),
        mock.patch('paasta_tools.bounce_lib.load_system_paasta_config',
                   return_value=context.system_paasta_config),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config',
                   return_value=context.system_paasta_config),
    ) as (
        mock_create_complete_config,
        _,
        _,
        _,
        mock_load_system_paasta_config,
    ):
        mock_create_complete_config.return_value = fake_service_config
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        print marathon_tools.load_marathon_config()
        return_tuple = setup_marathon_job.setup_service(
            service=fake_service_name,
            instance=fake_instance_name,
            client=context.marathon_client,
            marathon_config=context.marathon_config,
            service_marathon_config=fake_service_marathon_config,
            soa_dir=None,
        )
        assert return_tuple[0] == 0
        assert 'deployed' in return_tuple[1]
def service_instance_status_error(context, error_code, job_id):
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    settings.cluster = load_system_paasta_config().get_cluster()
    settings.soa_dir = context.soa_dir

    (service, instance, _, __) = decompose_job_id(job_id)

    request = testing.DummyRequest()
    request.matchdict = {'service': service, 'instance': instance}
    response = None
    try:
        response = instance_status(request)
    except InstanceFailure as exc:
        print exc.msg
        assert exc.err == int(error_code)
    except:
        raise

    assert not response
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks. We assume tasks with no healthcheck
                            # defined are healthy, and tasks with a defined healthcheck but no
                            # healthcheck results are unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    logging.basicConfig()
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)

    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=args.soa_dir,
    )

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(),
                                                config.get_username(),
                                                config.get_password())
    for service, instance in service_instances:
        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        pass
def paasta_sysdig(args):
    if not args.local:
        mesos_master = get_any_mesos_master(cluster=args.cluster)
        ssh_cmd = 'ssh -At -o LogLevel=QUIET {0} "sudo paasta {1} --local"'.format(
            mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            print output
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(shlex.split("ssh -tA {0} '{1}'".format(slave, command.strip())))
        return

    status = get_status_for_instance(cluster=args.cluster,
                                     service=args.service,
                                     instance=args.instance)
    slave = pick_slave_from_status(status=status, host=args.host)
    marathon_config = load_marathon_config()
    marathon_url = marathon_config.get_url()[0]
    marathon_user = marathon_config.get_username()
    marathon_pass = marathon_config.get_password()
    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    marathon_creds_url = marathon_parsed_url._replace(netloc="{0}:{1}@{2}".format(
        marathon_user, marathon_pass, marathon_parsed_url.netloc))
    print format_mesos_command(slave, status.marathon.app_id, mesos_url, marathon_creds_url.geturl())
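The netloc rewrite above relies on urlparse returning a namedtuple, so _replace can splice credentials into an existing URL. A minimal sketch of the same trick, with made-up values:

# Illustration of the credential-injection trick used above; all values here
# are made up. ParseResult is a namedtuple, so _replace swaps out one field.
from urlparse import urlparse  # Python 2 stdlib, as in the surrounding code

parsed = urlparse("http://marathon.example.com:8080")
with_creds = parsed._replace(netloc="{0}:{1}@{2}".format("user", "secret", parsed.netloc))
print with_creds.geturl()  # http://user:secret@marathon.example.com:8080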
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance

    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')

        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def main(argv=None):
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    # Set up a transparent cache for http API calls. With expire_after, an
    # expired response is replaced only when the same request is made again,
    # so stale storage is not a concern and remove_expired_responses is not
    # needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            print status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            )
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
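For context, a hypothetical call site for this helper; the service, instance, and cluster values below are illustrative, not taken from the source:

# Hypothetical invocation; argument values are illustrative only.
return_code = perform_command(
    command='status',
    service='example_service',
    instance='main',
    cluster='example-cluster',
    verbose=1,
    soa_dir=DEFAULT_SOA_DIR,
)
sys.exit(return_code)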
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not all([mesos_ok, marathon_ok, chronos_ok]):
        sys.exit(2)
    else:
        sys.exit(0)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks. We assume tasks with no healthcheck
                            # defined are healthy, and tasks with a defined healthcheck but no
                            # healthcheck results are unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
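The dict comprehension above folds the health assumptions into a single filter; restated as a standalone predicate (a hypothetical helper, not part of the source), the rule is:

# Hypothetical helper restating the filter above: a task is autoscalable if
# its healthcheck results mark it healthy, or if its app defines no
# healthchecks at all. Per the comment above, a defined healthcheck with no
# results is treated as unhealthy.
def is_task_autoscalable(task, app):
    return is_task_healthy(task) or not app.health_checks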
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        above_kill_threshold = float(len(apps_to_kill)) / float(len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical("Paasta was about to kill more than %s of the running services. This "
                         "is probably a BAD mistake! Run again with --force if you "
                         "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError

    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
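A quick worked example of the kill_threshold guard above, with hypothetical counts:

# Hypothetical numbers: killing 6 of 10 running apps gives a kill fraction of
# 0.6, which exceeds the default threshold of 0.5, so without --force the run
# raises DontKillEverythingError instead of deleting anything.
running_apps_count = 10
apps_to_kill_count = 6
kill_threshold = 0.5
above_kill_threshold = float(apps_to_kill_count) / float(running_apps_count) > float(kill_threshold)
assert above_kill_threshold  # 0.6 > 0.5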
def main():
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(),
                                                config.get_username(),
                                                config.get_password())
    for deployment in client.list_deployments():
        delete_deployment_if_too_old(
            client=client,
            deployment=deployment,
            max_date=args.age,
            dry_run=args.dry_run,
        )
def service_instance_status(context, app_count, job_id):
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    settings.cluster = load_system_paasta_config().get_cluster()
    settings.soa_dir = context.soa_dir

    (service, instance, _, __) = decompose_job_id(job_id)

    request = testing.DummyRequest()
    request.matchdict = {'service': service, 'instance': instance}
    response = instance_status(request)

    assert response['app_count'] == int(app_count), response
    assert response['marathon']['running_instance_count'] == response['marathon']['expected_instance_count'], response
def setup_paasta_api():
    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    # Set up a transparent cache for http API calls. With expire_after, an
    # expired response is replaced only when the same request is made again,
    # so stale storage is not a concern and remove_expired_responses is not
    # needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)
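A sketch of what the cache above buys, assuming requests_cache's usual behavior of marking cached responses with a from_cache attribute; the URL is illustrative:

# Sketch: within expire_after seconds, an identical request is answered from
# the in-memory cache rather than hitting the API again.
import requests
import requests_cache

requests_cache.install_cache("example-cache", backend="memory", expire_after=30)
first = requests.get("http://localhost:8080/v2/apps")   # hits the server
second = requests.get("http://localhost:8080/v2/apps")  # served from cache
assert getattr(second, "from_cache", False)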
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client,
            soa_dir=soa_dir,
        )
    else:
        instances = get_services_for_cluster(cluster=cluster,
                                             instance_type='marathon',
                                             soa_dir=soa_dir)
        service_instances = []
        for name, instance in instances:
            service_instances.append(compose_job_id(name, instance))
    print '\n'.join(service_instances)
    sys.exit(0)
def get_deployments():
    marathon_config = load_marathon_config()
    marathon_client = get_marathon_client(marathon_config)
    deployments = marathon_client.list_deployments()
    return deployments
def get_main_marathon_config():
    log.debug("Reading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Marathon config is: %s", marathon_config)
    return marathon_config
def main():
    args = parse_args()
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        print("Couldn't find an app named {0}".format(full_appid))
        sys.exit(1)

    service, instance, _, __ = (s.replace('--', '_') for s in decompose_job_id(full_appid))
    complete_config = marathon_tools.create_complete_config(service, instance, marathon_config)
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks([app_to_kill], drain_method)
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_tasks=old_app_live_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{0}.{1}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )
        print "Sleeping for 10 seconds to give the tasks time to drain"
        time.sleep(10)

    print("Successfully killed {0}".format(full_appid))
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(mesos_state=mesos_state)
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [HealthCheckResult(message="Marathon is not configured to run here", healthy=True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except PaastaNotConfiguredError:
        chronos_results = [HealthCheckResult(message="Chronos is not configured to run here", healthy=True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"],
                    free=resource_info_dict["free"],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"],
                    free=resource_info_dict["free"],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print_with_indent("Per Slave Utilization", 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave["hostname"], mesos_state)
        all_rows = [["Hostname", "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]

        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single
        # slave having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info_dict["total"],
                free=resource_info_dict["free"],
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(
                get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
            )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    print "Master paasta_tools version: {0}".format(__version__)
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)

    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function, mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single
            # slave having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def get_marathon_client_from_config():
    marathon_config = load_marathon_config()
    marathon_client = get_marathon_client(marathon_config.get_url(),
                                          marathon_config.get_username(),
                                          marathon_config.get_password())
    return marathon_client
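A short usage sketch for this helper; list_tasks mirrors how other callers in this section use the client:

# Illustrative usage: build a client from the local marathon config and list
# the tasks it is running, as the autoscaler functions above do.
client = get_marathon_client_from_config()
tasks = client.list_tasks()
print "Found %d running marathon tasks" % len(tasks)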
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance

    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_stdstreams = verbose > 1
            print status_mesos_tasks_verbose(app_id, get_short_task_id, tail_stdstreams)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0