def setup(self):
    """Load the marathon config and build a client from it, storing both on self."""
    config = marathon_tools.load_marathon_config()
    self.marathon_config = config
    self.marathon_client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every marathon instance in this cluster that has max_instances
    configured and whose desired state is 'start'.

    Holds a cluster-wide autoscaling lock for the duration; if another run
    already holds it, this run is silently skipped.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only autoscale instances that opted in (max_instances set)
                # and are not stopped.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                # Empty string matches all app ids.
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        # One failing service must not abort the rest of the run.
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        pass
def cleanup_apps(soa_dir):
    """Remove marathon apps that no longer correspond to a valid paasta
    service instance in this cluster.

    Compares the app ids currently running in marathon against the set of
    configured (service, instance) pairs and deletes anything unrecognized.

    :param soa_dir: The SOA config directory to read from
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    for app_id in marathon_tools.list_all_marathon_app_ids(client):
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Not a paasta-managed app id; leave it alone.
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        if (service, instance) in valid_services:
            continue
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def main():
    """Print the marathon service.instance pairs for this cluster, one per line.

    With --minimal, only instances that currently need bouncing are printed.
    Always exits 0.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client,
            soa_dir=soa_dir,
        )
    else:
        service_instances = [
            compose_job_id(name, instance)
            for name, instance in get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
        ]
    paasta_print('\n'.join(service_instances))
    sys.exit(0)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale all eligible marathon instances in this cluster.

    Eligible instances are chosen by get_configs_of_services_to_scale. A
    cluster-wide lock guards the whole run; if it is held elsewhere, the run is
    skipped with a warning.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or
                                               not marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            # Only mesos tasks matching a surviving marathon task are relevant.
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # One failing service must not abort the rest of the run.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
        pass
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance

    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :param soa_dir: SOA configuration directory to read from
    :param app_id: optional pre-computed marathon app id; derived from the
        service config when omitted
    :param delta: instance-count delta, only used by the 'scale' command
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            # No deployed image means nothing can be started/inspected.
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        if proxy_port is not None:
            # Only registered smartstack namespaces have a proxy port.
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon instance in this cluster.

    An instance is eligible when it sets max_instances, its desired state is
    'start', and its decision policy is not 'bespoke'.

    Holds a cluster-wide autoscaling lock for the duration; if another run
    already holds it, this run is silently skipped.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only autoscale instances that opted in (max_instances set)
                # and are not stopped.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                # Empty string matches all app ids.
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')
                with ZookeeperPool():
                    for config in configs:
                        if config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service, config.instance)
                                # Tasks without health check results are treated as unhealthy.
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id) and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError(
                                        "Couldn't find any healthy marathon tasks")
                                mesos_tasks = [
                                    task for task in all_mesos_tasks
                                    if task['id'] in marathon_tasks
                                ]
                                autoscale_marathon_instance(
                                    config, list(marathon_tasks.values()), mesos_tasks)
                            except Exception as e:
                                # BUG FIX: this previously did `raise e` before the log call,
                                # which made the log line unreachable and aborted autoscaling
                                # for every remaining service on the first failure. Log the
                                # error and continue with the next service instead.
                                write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
def main():
    """Run the replication check for every marathon instance in this cluster."""
    args = parse_args()
    soa_dir = args.soa_dir

    logging.basicConfig()
    log.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=args.soa_dir,
    )

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
    for service, instance in service_instances:
        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over all eligible services in this cluster.

    The whole pass is guarded by a cluster-wide lock; when another autoscaler
    holds it, this run is skipped with a warning. A failure for one service is
    logged and does not stop the remaining services from being scaled.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks, all_mesos_tasks = get_all_marathon_mesos_tasks(marathon_client)
            if not configs:
                return
            with ZookeeperPool():
                for service_config in configs:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            marathon_client,
                            all_marathon_tasks,
                            all_mesos_tasks,
                            service_config,
                        )
                        autoscale_marathon_instance(
                            service_config, list(marathon_tasks.values()), mesos_tasks)
                    except Exception as e:
                        write_to_log(config=service_config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def main(argv=None):
    """Start the paasta-api server: configure logging and settings, install an
    http response cache, then serve until interrupted."""
    args = parse_paasta_api_args()
    level = logging.DEBUG if args.debug else logging.WARNING
    logging.basicConfig(level=level)
    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
def main(argv=None):
    """Entry point for the paasta-api WSGI server; blocks until interrupted."""
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()
    marathon_config = marathon_tools.load_marathon_config()
    url = marathon_config.get_url()
    user = marathon_config.get_username()
    password = marathon_config.get_password()
    settings.marathon_client = marathon_tools.get_marathon_client(url, user, password)

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
def cleanup_apps(soa_dir):
    """Kill marathon apps whose (service, instance) pair is no longer
    configured for this cluster.

    :param soa_dir: The SOA config directory to read from
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    for app_id in marathon_tools.list_all_marathon_app_ids(client):
        log.debug("Checking app id %s", app_id)
        try:
            deformatted = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        service, instance = deformatted[0], deformatted[1]
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: SOA configuration directory to read from
    :param app_id: optional pre-computed marathon app id; derived from the
        service config when omitted
    :param delta: unused in this version (kept for interface compatibility)
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            # No deployed image means nothing can be restarted/inspected.
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            print status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            )
        if proxy_port is not None:
            # Only registered smartstack namespaces have a proxy port.
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def main():
    """Check smartstack replication for every marathon instance in the cluster."""
    args = parse_args()
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=args.soa_dir,
    )

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
    all_tasks = client.list_tasks()
    mesos_slaves = get_slaves()
    checker = SmartstackReplicationChecker(mesos_slaves, system_paasta_config)

    for service, instance in service_instances:
        check_service_replication(
            service=service,
            instance=instance,
            cluster=cluster,
            all_tasks=all_tasks,
            soa_dir=args.soa_dir,
            smartstack_replication_checker=checker,
        )
def service_instance_status_error(context, error_code, job_id): marathon_config = marathon_tools.load_marathon_config() settings.marathon_client = marathon_tools.get_marathon_client( marathon_config.get_url(), marathon_config.get_username(), marathon_config.get_password() ) settings.cluster = load_system_paasta_config().get_cluster() settings.soa_dir = context.soa_dir (service, instance, _, __) = decompose_job_id(job_id) request = testing.DummyRequest() request.matchdict = {'service': service, 'instance': instance} response = None try: response = instance_status(request) except InstanceFailure as exc: print exc.msg assert exc.err == int(error_code) except: raise assert not response
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        # Nothing deployed yet for this instance; not an error.
        log.debug("No deployments found for %s in cluster %s. Skipping." %
                  (args.service_instance, load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
                    (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        # Non-zero status from setup_service is reported as CRITICAL to sensu.
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
def setup_marathon_client():
    """Build a marathon client along with the configs used to create it.

    :returns: a (client, marathon_config, system_paasta_config) tuple
    """
    system_paasta_config = setup_system_paasta_config()
    marathon_config = marathon_tools.MarathonConfig(
        system_paasta_config.get_marathon_config())
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    return (client, marathon_config, system_paasta_config)
def get_marathon_client_from_config():
    """Load the marathon config and return a client built from it."""
    marathon_config = load_marathon_config()
    return get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        # Nothing deployed yet for this instance; not an error.
        log.debug("No deployments found for %s in cluster %s. Skipping." %
                  (args.service_instance, load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
                    (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        # Non-zero status from setup_service is reported as CRITICAL to sensu.
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
def get_marathon_client(marathon_config):
    """Given a MarathonConfig object, return a client.

    :param marathon_config: a MarathonConfig object
    :returns client: a marathon client
    """
    url = marathon_config.get_url()
    user = marathon_config.get_username()
    passwd = marathon_config.get_password()
    return marathon_tools.get_marathon_client(url, user, passwd)
def marathon(self):
    """Lazily build, memoize, and return a marathon client on first access."""
    if self._marathon is not None:
        return self._marathon
    config = marathon_tools.load_marathon_config()
    self._marathon = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
        cached=self._cached,
    )
    return self._marathon
def get_marathon_client(marathon_config):
    """Given a MarathonConfig object, return a client.

    :param marathon_config: a MarathonConfig object
    :returns client: a marathon client
    """
    return marathon_tools.get_marathon_client(marathon_config.get_url(),
                                              marathon_config.get_username(),
                                              marathon_config.get_password())
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to kill
        when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the kill fraction exceeds
        ``kill_threshold`` and ``force`` is not set
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)
    running_apps = []
    for app_id in running_app_ids:
        try:
            # deformat_job_id yields a (service, instance, git_sha, config_sha) tuple.
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn(
                "%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to kill more than kill_threshold of all running
        # apps unless explicitly forced.
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold,
            )
            raise DontKillEverythingError
    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def main():
    """Attempt to set up a list of marathon service instances given.
    Exits 1 if any service.instance deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Do the following for each service.instance:
        - Load the service instance's configuration
        - Create the complete marathon job configuration
        - Deploy/bounce the service
        - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    marathon_apps = marathon_tools.get_all_marathon_apps(client, embed_tasks=True)

    num_failed_deployments = 0
    for service_instance in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
        except InvalidJobNameError:
            log.error(
                "Invalid service instance specified. Format is service%sinstance." % SPACER)
            num_failed_deployments = num_failed_deployments + 1
        else:
            # The first element of deploy_marathon_service's return value is
            # truthy on failure; count it against the exit status.
            if deploy_marathon_service(service, instance, client, soa_dir,
                                       marathon_config, marathon_apps)[0]:
                num_failed_deployments = num_failed_deployments + 1

    requests_cache.uninstall_cache()

    log.debug("%d out of %d service.instances failed to deploy." %
              (num_failed_deployments, len(args.service_instance_list)))

    sys.exit(1 if num_failed_deployments else 0)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale all eligible marathon instances in this cluster.

    Eligible instances are chosen by get_configs_of_services_to_scale. A
    cluster-wide lock guards the whole run; when it is held elsewhere, the run
    is skipped with a warning.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_all_running_tasks()
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = config.format_marathon_app_dict()['id']
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy (unless they are "old" in which case we
                            # assume that marathon has screwed up and stopped healthchecking but that
                            # they are healthy
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {
                                task.id: task
                                for task in all_marathon_tasks
                                if task.id.startswith(job_id) and
                                (is_task_healthy(task) or
                                 not marathon_client.get_app(task.app_id).health_checks or
                                 is_old_task_missing_healthchecks(task, marathon_client))
                            }
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError(
                                    "Couldn't find any healthy marathon tasks")
                            # Only mesos tasks matching a surviving marathon task are relevant.
                            mesos_tasks = [
                                task for task in all_mesos_tasks
                                if task['id'] in marathon_tasks
                            ]
                            autoscale_marathon_instance(
                                config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # One failing service must not abort the rest of the run.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for services because the lock is held")
def main():
    """Scan marathon deployments and delete any that are too old."""
    args = parse_args()
    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
    for deployment in client.list_deployments():
        delete_deployment_if_too_old(
            client=client,
            deployment=deployment,
            max_date=args.age,
            dry_run=args.dry_run,
        )
def test_instances_status_marathon(
    mock_get_actual_deployments,
    mock_validate_service_instance,
    mock_load_marathon_service_config,
    mock_get_matching_appids,
    mock_marathon_job_status,
):
    """instance_status should include marathon details (bounce method and
    desired state) for a marathon instance."""
    settings.cluster = 'fake_cluster'
    mock_get_actual_deployments.return_value = {
        'fake_cluster.fake_instance': 'GIT_SHA',
        'fake_cluster.fake_instance2': 'GIT_SHA',
        'fake_cluster2.fake_instance': 'GIT_SHA',
        'fake_cluster2.fake_instance2': 'GIT_SHA',
    }
    mock_validate_service_instance.return_value = 'marathon'

    # Wire a throwaway marathon client into the api settings.
    mock_marathon_config = marathon_tools.MarathonConfig(
        {
            'url': 'fake_url',
            'user': '******',
            'password': '******'
        },
    )
    settings.marathon_client = marathon_tools.get_marathon_client(
        mock_marathon_config.get_url(),
        mock_marathon_config.get_username(),
        mock_marathon_config.get_password(),
    )
    mock_get_matching_appids.return_value = ['a', 'b']
    mock_service_config = marathon_tools.MarathonServiceConfig(
        service='fake_service',
        cluster='fake_cluster',
        instance='fake_instance',
        config_dict={'bounce_method': 'fake_bounce'},
        branch_dict={},
    )
    mock_load_marathon_service_config.return_value = mock_service_config
    mock_marathon_job_status.return_value = 'fake_marathon_status'

    request = testing.DummyRequest()
    request.swagger_data = {
        'service': 'fake_service',
        'instance': 'fake_instance'
    }
    response = instance.instance_status(request)
    assert response['marathon']['bounce_method'] == 'fake_bounce'
    assert response['marathon']['desired_state'] == 'start'
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every marathon instance in this cluster that has
    max_instances configured, desired state 'start', and a non-bespoke
    decision policy.

    Holds a cluster-wide autoscaling lock for the duration; if another run
    already holds it, this run is silently skipped.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only autoscale opted-in, running, non-bespoke instances.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or
                                               not marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            # Only mesos tasks matching a surviving marathon task are relevant.
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # One failing service must not abort the rest of the run.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Delete marathon apps that don't match any configured service instance
    in this cluster, with a safety threshold on how much may be killed at once.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to
        kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the kill fraction exceeds the
        threshold and force is not set
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_apps = []
    for app_id in marathon_tools.list_all_marathon_app_ids(client):
        try:
            running_apps.append(marathon_tools.deformat_job_id(app_id))
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)

    apps_to_kill = [
        app for app in running_apps if (app[0], app[1]) not in valid_services
    ]
    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    if running_apps:
        kill_fraction = float(len(apps_to_kill)) / float(len(running_apps))
        if kill_fraction > float(kill_threshold) and not force:
            log.critical("Paasta was about to kill more than %s of the running services, this "
                         "is probably a BAD mistake!, run again with --force if you "
                         "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError

    for doomed in apps_to_kill:
        delete_app(
            app_id=marathon_tools.format_job_id(*doomed),
            client=client,
            soa_dir=soa_dir,
        )
def setup_marathon_client():
    """Build a marathon client plus matching marathon and system paasta
    configs for the 'mesos-testcluster' test environment.

    :returns: a (client, marathon_config, system_paasta_config) tuple
    """
    marathon_url = _get_marathon_connection_string()
    zookeeper_url = _get_zookeeper_connection_string('mesos-testcluster')
    marathon_config = marathon_tools.MarathonConfig({
        'url': marathon_url,
        'user': None,
        'password': None,
    }, '/some_fake_path_to_marathon.json')
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    system_paasta_config = utils.SystemPaastaConfig({
        'cluster': 'testcluster',
        'docker_volumes': [],
        'docker_registry': u'docker-dev.yelpcorp.com',
        'zookeeper': zookeeper_url,
    }, '/some_fake_path_to_config_dir/')
    return (client, marathon_config, system_paasta_config)
def main():
    """Delete every marathon deployment older than the configured age.

    --verbose selects DEBUG logging (WARNING otherwise); --dry-run is
    forwarded to the per-deployment delete helper.
    """
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING)

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
    for deployment in client.list_deployments():
        delete_deployment_if_too_old(
            client=client,
            deployment=deployment,
            max_date=args.age,
            dry_run=args.dry_run,
        )
def main():
    """Attempt to set up a list of marathon service instances given.
    Exits 1 if any service.instance deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Do the following for each service.instance:
        - Load the service instance's configuration
        - Create the complete marathon job configuration
        - Deploy/bounce the service
        - Emit an event about the deployment to sensu
    """
    args = parse_args()
    soa_dir = args.soa_dir
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    marathon_apps = marathon_tools.get_all_marathon_apps(client, embed_failures=True)

    num_failed_deployments = 0
    for service_instance in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
        except InvalidJobNameError:
            log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
            num_failed_deployments += 1
        else:
            if deploy_marathon_service(service, instance, client, soa_dir, marathon_config, marathon_apps):
                num_failed_deployments += 1

    requests_cache.uninstall_cache()

    log.debug("%d out of %d service.instances failed to deploy." %
              (num_failed_deployments, len(args.service_instance_list)))

    sys.exit(1 if num_failed_deployments else 0)
def service_instance_status(context, app_count, job_id):
    """Step helper: hit the instance_status API endpoint for *job_id* and
    assert that it reports the expected app count and that the running
    instance count matches the expected instance count."""
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    settings.cluster = load_system_paasta_config().get_cluster()
    settings.soa_dir = context.soa_dir

    service, instance, _, __ = decompose_job_id(job_id)
    request = testing.DummyRequest()
    request.matchdict = {'service': service, 'instance': instance}
    response = instance_status(request)

    assert response['app_count'] == int(app_count), response
    assert response['marathon']['running_instance_count'] == response['marathon']['expected_instance_count'], response
def setup_paasta_api():
    """Initialize global API settings: cluster name, marathon client, and a
    transparent HTTP response cache."""
    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)
def setup_paasta_api():
    """Populate the global ``settings`` object (cluster, marathon client) and
    install a transparent HTTP cache for API calls."""
    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=5)
def test_instances_status(
    mock_get_actual_deployments,
    mock_validate_service_instance,
    mock_load_marathon_service_config,
    mock_get_matching_appids,
    mock_marathon_job_status,
):
    """Happy-path test for instance.instance_status on a marathon instance.

    Wires up mocks for deployments, service-instance validation, service
    config loading, matching app ids, and the marathon job status, then
    asserts the response carries the expected bounce method and desired state.
    (Mock parameters are injected by patch decorators defined above this
    block.)
    """
    settings.cluster = 'fake_cluster'
    # Several cluster.instance deployments, all pointing at the same SHA.
    mock_get_actual_deployments.return_value = {'fake_cluster.fake_instance': 'GIT_SHA',
                                                'fake_cluster.fake_instance2': 'GIT_SHA',
                                                'fake_cluster2.fake_instance': 'GIT_SHA',
                                                'fake_cluster2.fake_instance2': 'GIT_SHA'}
    mock_validate_service_instance.return_value = 'marathon'

    mock_marathon_config = marathon_tools.MarathonConfig(
        {'url': 'fake_url', 'user': '******', 'password': '******'}
    )
    settings.marathon_client = marathon_tools.get_marathon_client(
        mock_marathon_config.get_url(),
        mock_marathon_config.get_username(),
        mock_marathon_config.get_password()
    )
    # Two matching marathon app ids for this service.instance.
    mock_get_matching_appids.return_value = ['a', 'b']
    mock_service_config = marathon_tools.MarathonServiceConfig(
        service='fake_service',
        cluster='fake_cluster',
        instance='fake_instance',
        config_dict={'bounce_method': 'fake_bounce'},
        branch_dict={},
    )
    mock_load_marathon_service_config.return_value = mock_service_config
    mock_marathon_job_status.return_value = 'fake_marathon_status'

    request = testing.DummyRequest()
    request.swagger_data = {'service': 'fake_service', 'instance': 'fake_instance'}
    response = instance.instance_status(request)

    assert response['marathon']['bounce_method'] == 'fake_bounce'
    assert response['marathon']['desired_state'] == 'start'
def main(): args = parse_args() soa_dir = args.soa_dir cluster = args.cluster if args.minimal: marathon_config = load_marathon_config() marathon_client = get_marathon_client( url=marathon_config.get_url(), user=marathon_config.get_username(), passwd=marathon_config.get_password(), ) service_instances = get_service_instances_that_need_bouncing( marathon_client=marathon_client, soa_dir=soa_dir) else: instances = get_services_for_cluster(cluster=cluster, instance_type='marathon', soa_dir=soa_dir) service_instances = [] for name, instance in instances: service_instances.append(compose_job_id(name, instance)) print '\n'.join(service_instances) sys.exit(0)
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :param soa_dir: SOA configuration directory to read from
    :param app_id: marathon app id; derived from the complete config when not
        supplied
    :param delta: instance-count delta, used only by the 'scale' command
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            # No deployed docker image means there is nothing to operate on.
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')

        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        # Smartstack status only makes sense when the instance is registered
        # behind a proxy port.
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def main():
    """Bounce the named marathon app down to zero tasks and wait until it is
    gone, draining its tasks via the service's configured drain method first.

    Exits 1 if no app with the given name is currently running.
    """
    args = parse_args()
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        print("Couldn't find an app named {0}".format(full_appid))
        sys.exit(1)

    # Marathon app ids encode '_' as '--'; undo that to recover the real
    # service and instance names.
    service, instance, _, __ = (s.replace('--', '_') for s in decompose_job_id(full_appid))
    complete_config = marathon_tools.create_complete_config(service, instance, marathon_config)
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    # Keep bouncing (with no happy new tasks) until marathon reports the app
    # as gone.
    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks([app_to_kill], drain_method)
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_tasks=old_app_live_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{0}.{1}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )
        print "Sleeping for 10 seconds to give the tasks time to drain"
        time.sleep(10)

    print("Sucessfully killed {0}".format(full_appid))
def main():
    """Gracefully kill the named marathon app: drain its tasks using the
    service's configured drain method, then bounce it down until marathon no
    longer reports it as running.

    Exits 1 if no app with the given name is currently running.
    """
    args = parse_args()
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        print("Couldn't find an app named {0}".format(full_appid))
        sys.exit(1)

    # Marathon app ids encode '_' as '--'; undo that to recover the real
    # service and instance names.
    service, instance, _, __ = (s.replace('--', '_') for s in decompose_job_id(full_appid))
    complete_config = marathon_tools.create_complete_config(service, instance, marathon_config)
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    # Repeat the 'down' bounce until marathon no longer reports the app.
    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks([app_to_kill], drain_method)
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_tasks=old_app_live_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{0}.{1}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )
        print "Sleeping for 10 seconds to give the tasks time to drain"
        time.sleep(10)

    print("Sucessfully killed {0}".format(full_appid))
def setup_marathon_client():
    """Create a marathon client from the system paasta config.

    :returns: a (client, marathon_config, system_paasta_config) tuple
    """
    system_paasta_config = setup_system_paasta_config()
    marathon_config = marathon_tools.MarathonConfig(system_paasta_config.get_marathon_config())
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    return (client, marathon_config, system_paasta_config)