def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    """Run one autoscaling pass over the given Marathon service configs.

    When ``autoscaling_is_paused()`` is true a warning is logged and nothing
    else happens. Otherwise the Marathon apps (with embedded tasks) and the
    running Mesos tasks are fetched once and reused for every config.

    An exception raised while handling one config is written to that
    config's log at debug level and does not stop the remaining configs.

    :param service_configs: the service instance configs to autoscale
    :param system_paasta_config: system config used to locate the Marathon
        servers; also passed through to the filtering and autoscaling helpers
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    marathon_clients = get_marathon_clients(
        get_marathon_servers(system_paasta_config))
    apps_with_clients = get_marathon_apps_with_clients(
        marathon_clients.get_all_clients(), embed_tasks=True)
    all_mesos_tasks = a_sync.block(get_all_running_tasks)

    # The plain app list is the same for every config, so build it once
    # instead of once per loop iteration.
    # NOTE(review): assumes filter_autoscaling_tasks does not mutate the list
    # it receives -- confirm.
    marathon_apps = [app for (app, _client) in apps_with_clients]

    with ZookeeperPool():
        for config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    marathon_apps,
                    all_mesos_tasks,
                    config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as e:
                # Deliberately best-effort: one failing service must not
                # prevent the others from being autoscaled.
                write_to_log(config=config, line="Caught Exception %s" % e, level="debug")
def get_at_risk_service_instances(self, draining_hosts) -> List[ServiceInstance]:
    """Build bounce-queue entries for instances with tasks on draining hosts.

    Every Marathon app known to ``self.marathon_clients`` is inspected; a task
    is "at risk" when its ``host`` is contained in ``draining_hosts``. Each
    distinct (service, instance) pair yields exactly one ``ServiceInstance``,
    in the order the pair is first seen.

    :param draining_hosts: container of hosts, tested with ``in``
    :returns: one ``ServiceInstance`` per affected (service, instance)
    """
    marathon_apps_with_clients = get_marathon_apps_with_clients(
        clients=self.marathon_clients.get_all_clients(),
        embed_tasks=True,
    )
    at_risk_tasks = [
        task
        for app, _client in marathon_apps_with_clients
        for task in app.tasks
        if task.host in draining_hosts
    ]
    self.log.info(f"At risk tasks: {at_risk_tasks}")

    service_instances: List[ServiceInstance] = []
    # (service, instance) pairs already queued. A set gives constant-time
    # duplicate checks instead of rescanning service_instances per task.
    seen = set()
    for task in at_risk_tasks:
        app_id = task.app_id.strip('/')
        service, instance, _, __ = deformat_job_id(app_id)
        key = (service, instance)
        # no need to add the same instance to the bounce queue more than once
        if key in seen:
            continue
        seen.add(key)
        # https://github.com/python/mypy/issues/2852
        service_instances.append(ServiceInstance(  # type: ignore
            service=service,
            instance=instance,
            cluster=self.config.get_cluster(),
            bounce_by=int(time.time()),
            watcher=type(self).__name__,
            bounce_timers=None,
            failures=0,
        ))
    return service_instances
def get_service_instances_that_need_bouncing(marathon_clients, soa_dir):
    """Yield the service instances whose Marathon state differs from config.

    An instance needs bouncing when its (app id, client) pair exists only in
    the desired set or only in the running set, when the running app's
    instance count differs from the desired formatted config, or when
    ``get_num_at_risk_tasks`` reports a non-zero count for the app.

    :param marathon_clients: object providing ``get_current_client_for_service``
        and ``get_all_clients``
    :param soa_dir: passed to ``get_desired_marathon_configs``
    :returns: a generator of short job ids with ``'--'`` replaced by ``'_'``
    """
    desired_marathon_configs_formatted, desired_job_configs = get_desired_marathon_configs(
        soa_dir)

    desired_ids_and_clients = {
        (app_id, marathon_clients.get_current_client_for_service(job_config))
        for app_id, job_config in desired_job_configs.items()
    }

    current_apps_with_clients = {}
    for app, client in get_marathon_apps_with_clients(
            marathon_clients.get_all_clients()):
        current_apps_with_clients[(app.id.lstrip('/'), client)] = app

    actual_ids_and_clients = set(current_apps_with_clients.keys())
    # Pairs present on exactly one side: missing apps and stale apps alike.
    mismatched_ids_and_clients = actual_ids_and_clients.symmetric_difference(
        desired_ids_and_clients)

    apps_that_need_bouncing = set()
    for app_id, _client in mismatched_ids_and_clients:
        apps_that_need_bouncing.add(long_job_id_to_short_job_id(app_id))

    draining_hosts = get_draining_hosts()
    for (app_id, _client), app in current_apps_with_clients.items():
        short_app_id = long_job_id_to_short_job_id(app_id)
        if short_app_id in apps_that_need_bouncing:
            continue
        wrong_instance_count = (
            app.instances != desired_marathon_configs_formatted[app_id]['instances']
        )
        if wrong_instance_count or get_num_at_risk_tasks(app, draining_hosts) != 0:
            apps_that_need_bouncing.add(short_app_id)

    return (app_id.replace('--', '_') for app_id in apps_that_need_bouncing)
def get_at_risk_service_instances(
        self, draining_hosts: List[str]) -> List[ServiceInstance]:
    """Build bounce-queue entries for instances with tasks on draining hosts.

    Every Marathon app known to ``self.marathon_clients`` is inspected; a task
    is "at risk" when its ``host`` is contained in ``draining_hosts``. Each
    distinct (service, instance) pair yields exactly one ``ServiceInstance``,
    in the order the pair is first seen. All timestamps on the new entry are
    taken from ``time.time()`` at creation.

    :param draining_hosts: hosts that are being drained
    :returns: one ``ServiceInstance`` per affected (service, instance)
    """
    marathon_apps_with_clients = get_marathon_apps_with_clients(
        clients=self.marathon_clients.get_all_clients(), embed_tasks=True)
    at_risk_tasks = [
        task
        for app, _client in marathon_apps_with_clients
        for task in app.tasks
        if task.host in draining_hosts
    ]
    self.log.info(f"At risk tasks: {at_risk_tasks}")

    service_instances: List[ServiceInstance] = []
    # (service, instance) pairs already queued. A set gives constant-time
    # duplicate checks instead of rescanning service_instances per task.
    seen = set()
    for task in at_risk_tasks:
        app_id = task.app_id.strip("/")
        service, instance, _, __ = deformat_job_id(app_id)
        key = (service, instance)
        # no need to add the same instance to the bounce queue more than once
        if key in seen:
            continue
        seen.add(key)
        service_instances.append(
            ServiceInstance(
                service=service,
                instance=instance,
                bounce_by=time.time(),
                wait_until=time.time(),
                watcher=type(self).__name__,
                failures=0,
                enqueue_time=time.time(),
                bounce_start_time=time.time(),
            ))
    return service_instances
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible service in the current cluster.

    Nothing is done (beyond a warning) when autoscaling is paused or when the
    autoscaling lock is already held. With the lock held, the configs to scale
    are loaded; if there are none the function returns without querying
    Marathon or Mesos. Otherwise apps and running tasks are fetched once and
    each config is autoscaled in turn. An exception raised for one config is
    written to that config's log and does not stop the others.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    try:
        with create_autoscaling_lock():
            system_paasta_config = load_system_paasta_config()
            cluster = system_paasta_config.get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if not configs:
                # Nothing to scale: skip the Marathon and Mesos queries.
                return

            marathon_clients = get_marathon_clients(
                get_marathon_servers(system_paasta_config))
            apps_with_clients = get_marathon_apps_with_clients(
                marathon_clients.get_all_clients(), embed_tasks=True)
            all_mesos_tasks = get_all_running_tasks()

            # Same list for every config, so build it once up front.
            # NOTE(review): assumes filter_autoscaling_tasks does not mutate
            # the list it receives -- confirm.
            marathon_apps = [app for (app, _client) in apps_with_clients]

            with ZookeeperPool():
                for config in configs:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            marathon_apps,
                            all_mesos_tasks,
                            config,
                        )
                        autoscale_marathon_instance(
                            config, list(marathon_tasks.values()), mesos_tasks)
                    except Exception as e:
                        # Deliberately best-effort: keep going with the
                        # remaining services.
                        write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for services because the lock is held")
def status_marathon_job_verbose(service, instance, clients, cluster, soa_dir, job_config):
    """Collect verbose status for every marathon app matching service.instance.

    The exact app id is not assumed: any app that matches the service and
    instance is reported, so that during a bounce both the old and the new
    app show up.

    :returns: a tuple of (all tasks of the matching apps, the per-app outputs
        joined with newlines)
    """
    apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
    )
    matching_apps = marathon_tools.get_matching_apps_with_clients(
        service, instance, apps_with_clients)

    collected_tasks = []
    sections = []
    for app, client in matching_apps:
        app_tasks, app_output = get_verbose_status_of_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        collected_tasks.extend(app_tasks)
        sections.append(app_output)
    return collected_tasks, "\n".join(sections)
def main() -> None:
    """Deploy each marathon service instance named on the command line.

    Steps:
    - parse arguments and configure logging (DEBUG when verbose, else WARNING)
    - install an in-memory requests cache for the duration of the run
    - connect to marathon and fetch all apps with their tasks once
    - for every service.instance argument, decompose the id and deploy it

    Exits with status 1 if any service.instance was invalid or failed to
    deploy, 0 otherwise.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    system_paasta_config = load_system_paasta_config()
    marathon_servers = marathon_tools.get_marathon_servers(system_paasta_config)
    clients = marathon_tools.get_marathon_clients(marathon_servers)
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients(), embed_tasks=True
    )

    num_failed_deployments = 0
    for service_instance in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
        except InvalidJobNameError:
            log.error(
                "Invalid service instance specified. Format is service%sinstance."
                % SPACER
            )
            num_failed_deployments += 1
            continue

        deploy_result = deploy_marathon_service(
            service, instance, clients, soa_dir, marathon_apps_with_clients
        )
        # First element is the status code; non-zero means failure.
        if deploy_result[0]:
            num_failed_deployments += 1

    requests_cache.uninstall_cache()

    log.debug(
        "%d out of %d service.instances failed to deploy."
        % (num_failed_deployments, len(args.service_instance_list))
    )

    exit_code = 1 if num_failed_deployments else 0
    sys.exit(exit_code)
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both the apps currently in marathon and the valid
    (service, instance) pairs for this cluster, and deletes every running app
    whose (service, instance) is not valid. Apps whose id cannot be parsed by
    ``deformat_job_id`` are skipped with a warning and never deleted.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the fraction of apps to kill exceeds
        ``kill_threshold`` and ``force`` is false
    """
    log.info("Loading marathon configuration")
    system_paasta_config = load_system_paasta_config()
    log.info("Connecting to marathon")
    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(system_paasta_config))

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    all_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients())

    app_ids_with_clients = []
    for (app, client) in all_apps_with_clients:
        try:
            app_id = marathon_tools.deformat_job_id(app.id.lstrip('/'))
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; use warning() like the rest
            # of the code base.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping."
                % app.id)
            continue
        app_ids_with_clients.append((app_id, client))

    apps_to_kill = [
        ((service, instance, git_sha, config_sha), client)
        for (service, instance, git_sha, config_sha), client in app_ids_with_clients
        if (service, instance) not in valid_services
    ]

    log.debug("Running apps: %s" % app_ids_with_clients)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    # Safety valve: refuse to wipe out a large share of the running apps
    # unless explicitly forced. Guarded so an empty cluster cannot divide
    # by zero.
    if app_ids_with_clients:
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(app_ids_with_clients)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold,
            )
            raise DontKillEverythingError

    for id_tuple, client in apps_to_kill:
        app_id = marathon_tools.format_job_id(*id_tuple)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def when_setup_service_initiated(context):
    """Run ``setup_marathon_job.setup_service`` for the service in ``context``.

    A number of paasta_tools functions are patched for the duration of the
    call (see the ``mock.patch`` chain below). ``setup_service`` is then
    attempted up to 120 times; an attempt that raises ``MarathonHttpError`` is
    retried after ``time.sleep(0.5)``. The first attempt that does not raise
    must return code 0 (asserted), after which the function returns.

    :param context: object providing ``service``, ``instance``, ``cluster``,
        ``max_tasks``, ``system_paasta_config``, ``marathon_clients`` and
        ``new_marathon_service_config``
    :raises Exception: when all 120 attempts raised ``MarathonHttpError``
    """
    with mock.patch(
        'paasta_tools.bounce_lib.get_happy_tasks',
        autospec=True,
        # Wrap function call so we can select a subset of tasks or test
        # intermediate steps, like when an app is not completely up
        side_effect=lambda app, _, __, ___, **kwargs: get_happy_tasks(
            app, context.service, "fake_nerve_ns", context.system_paasta_config,
        )[:context.max_tasks],
    ), mock.patch(
        'paasta_tools.bounce_lib.bounce_lock_zookeeper', autospec=True,
    ), mock.patch(
        'paasta_tools.bounce_lib.time.sleep', autospec=True,
    ), mock.patch(
        'paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True,
    ) as mock_load_system_paasta_config, mock.patch(
        'paasta_tools.setup_marathon_job._log', autospec=True,
    ), mock.patch(
        'paasta_tools.marathon_tools.get_config_hash', autospec=True, return_value='confighash',
    ), mock.patch(
        'paasta_tools.marathon_tools.get_code_sha_from_dockerurl', autospec=True, return_value='newapp',
    ), mock.patch(
        'paasta_tools.utils.InstanceConfig.get_docker_url', autospec=True, return_value='busybox',
    ), mock.patch(
        'paasta_tools.mesos_maintenance.get_principal', autospec=True,
    ) as mock_get_principal, mock.patch(
        'paasta_tools.mesos_maintenance.get_secret', autospec=True,
    ) as mock_get_secret:
        # Feed the patched principal/secret getters with the credentials
        # loaded from the given secrets file.
        credentials = mesos_maintenance.load_credentials(mesos_secrets='/etc/mesos-slave-secret')
        mock_get_principal.return_value = credentials.principal
        mock_get_secret.return_value = credentials.secret
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        # 120 * 0.5 = 60 seconds
        # NOTE(review): if paasta_tools.bounce_lib does `import time`, the
        # patch target 'paasta_tools.bounce_lib.time.sleep' above is the
        # shared time.sleep, so the sleep below may itself be mocked and the
        # loop may not actually wait -- confirm.
        for _ in range(120):
            try:
                # Re-fetch the apps on every attempt so setup_service sees
                # the current state.
                marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
                    clients=context.marathon_clients.get_all_clients(),
                    embed_tasks=True,
                )
                (code, message, bounce_again) = setup_marathon_job.setup_service(
                    service=context.service,
                    instance=context.instance,
                    clients=context.marathon_clients,
                    marathon_apps_with_clients=marathon_apps_with_clients,
                    job_config=context.new_marathon_service_config,
                    soa_dir='/nail/etc/services',
                )
                assert code == 0, message
                return
            except MarathonHttpError:
                time.sleep(0.5)
        # Only reached when every attempt raised MarathonHttpError.
        raise Exception("Unable to acquire app lock for setup_marathon_job.setup_service")
def marathon_instance_status(
    instance_status: Mapping[str, Any],
    service: str,
    instance: str,
    verbose: int,
    include_smartstack: bool,
    include_mesos: bool,
) -> Mapping[str, Any]:
    """Assemble the marathon status mapping for one service instance.

    The result always contains the entries produced by ``marathon_job_status``.
    A ``"smartstack"`` entry is added when requested and the service namespace
    config defines ``"proxy_port"``; a ``"mesos"`` entry is added when
    requested.

    :param instance_status: not read by this function
    :param service: service name
    :param instance: instance name
    :param verbose: verbosity level; values above 0 request individual
        smartstack backends
    :param include_smartstack: whether to add smartstack status
    :param include_mesos: whether to add mesos status
    :returns: the assembled status mapping
    """
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, settings.cluster, soa_dir=settings.soa_dir
    )
    service_clients = settings.marathon_clients.get_all_clients_for_service(job_config)
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=service_clients,
        embed_tasks=True,
        service_name=service,
    )
    matching_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients
    )

    mstatus: Dict[str, Any] = {}
    job_status = marathon_job_status(
        service, instance, job_config, matching_apps_with_clients, verbose
    )
    mstatus.update(job_status)

    if include_smartstack:
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service,
            namespace=job_config.get_nerve_namespace(),
            soa_dir=settings.soa_dir,
        )
        if "proxy_port" in service_namespace_config:
            # Flatten the tasks of every matching app into one list.
            tasks = []
            for app, _ in matching_apps_with_clients:
                tasks.extend(app.tasks)
            want_backends = verbose > 0
            mstatus["smartstack"] = marathon_smartstack_status(
                service,
                instance,
                job_config,
                service_namespace_config,
                tasks,
                should_return_individual_backends=want_backends,
            )

    if include_mesos:
        mstatus["mesos"] = marathon_mesos_status(service, instance, verbose)

    return mstatus
def status_marathon_job_verbose(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    cluster: str,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    dashboards: Dict[marathon_tools.MarathonClient, str],
) -> Tuple[List[MarathonTask], str]:
    """Collect verbose status for every marathon app matching service.instance.

    The exact app id is not assumed: any app that matches the service and
    instance is reported, so that during a bounce both the old and the new
    app show up. When autoscaling info is available it is rendered as a table
    ahead of the per-app output.

    :returns: a tuple of (all tasks of the matching apps, all output sections
        joined with newlines)
    """
    service_clients = clients.get_all_clients_for_service(job_config)
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=service_clients,
        embed_tasks=True,
    )

    all_tasks: List[MarathonTask] = []
    all_output: List[str] = []

    autoscaling_info = get_autoscaling_info(clients, job_config)
    if autoscaling_info:
        all_output.append(" Autoscaling Info:")
        # Column titles come from the namedtuple field names.
        headers = []
        for field in ServiceAutoscalingInfo._fields:
            headers.append(field.replace("_", " ").capitalize())
        table_lines = format_table([headers, autoscaling_info])
        all_output.append('\n'.join(" %s" % line for line in table_lines))

    matching_apps = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients)
    for app, client in matching_apps:
        app_tasks, app_output = get_verbose_status_of_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
        )
        all_tasks.extend(app_tasks)
        all_output.append(app_output)

    return all_tasks, "\n".join(all_output)
def get_autoscaling_info(marathon_clients, service_config):
    """Describe what the autoscaler would currently do for a service.

    Returns ``None`` unless the service has a truthy max instance count and
    its desired state is ``'start'``. Otherwise the current utilization and
    the resulting target instance count are computed with ``noop`` set in the
    autoscaling params. If the metrics provider has no data, both values are
    reported as the string ``"Exception"``.

    :param marathon_clients: object providing ``get_all_clients``
    :param service_config: the service instance's config
    :returns: a ``ServiceAutoscalingInfo`` of strings, or ``None``
    """
    is_autoscaled = (
        service_config.get_max_instances()
        and service_config.get_desired_state() == 'start'
    )
    if not is_autoscaled:
        return None

    apps_with_clients = get_marathon_apps_with_clients(
        marathon_clients.get_all_clients(), embed_tasks=True)
    all_mesos_tasks = get_all_running_tasks()
    autoscaling_params = service_config.get_autoscaling_params()
    autoscaling_params['noop'] = True
    system_paasta_config = load_system_paasta_config()

    try:
        marathon_apps = [app for (app, _client) in apps_with_clients]
        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
            marathon_apps,
            all_mesos_tasks,
            service_config,
        )
        utilization = get_utilization(
            marathon_service_config=service_config,
            system_paasta_config=system_paasta_config,
            autoscaling_params=autoscaling_params,
            log_utilization_data={},
            marathon_tasks=list(marathon_tasks.values()),
            mesos_tasks=mesos_tasks,
        )
        error = get_error_from_utilization(
            utilization=utilization,
            setpoint=autoscaling_params['setpoint'],
            current_instances=service_config.get_instances(),
        )
        new_instance_count = get_new_instance_count(
            utilization=utilization,
            error=error,
            autoscaling_params=autoscaling_params,
            current_instances=service_config.get_instances(),
            marathon_service_config=service_config,
            num_healthy_instances=len(marathon_tasks),
        )
        current_utilization = "{:.1f}%".format(utilization * 100)
    except MetricsProviderNoDataError:
        # No metrics available: report placeholders instead of failing.
        current_utilization = "Exception"
        new_instance_count = "Exception"

    return ServiceAutoscalingInfo(
        current_instances=str(service_config.get_instances()),
        max_instances=str(service_config.get_max_instances()),
        min_instances=str(service_config.get_min_instances()),
        current_utilization=current_utilization,
        target_instances=str(new_instance_count),
    )
def process_service_instance(self, service_instance):
    """Deploy one queued service instance and report whether to bounce again.

    The marathon apps are fetched from every client, the ``setup_marathon``
    timer wraps the call to ``deploy_marathon_service``, and the outcome is
    logged. When another bounce is needed the ``processed_by_worker`` timer is
    started; otherwise the ``bounce_length`` timer is stopped.

    :param service_instance: object with ``service`` and ``instance`` attributes
    :returns: ``BounceResults(bounce_again_in_seconds, return_code, bounce_timers)``
    """
    bounce_timers = self.setup_timers(service_instance)
    service = service_instance.service
    instance = service_instance.instance
    self.log.info("{} processing {}.{}".format(self.name, service, instance))

    # TODO: change this to use get_all_clients_for_service() instead.
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        self.marathon_clients.get_all_clients(), embed_tasks=True)

    bounce_timers.setup_marathon.start()
    return_code, bounce_again_in_seconds = deploy_marathon_service(
        service=service,
        instance=instance,
        clients=self.marathon_clients,
        soa_dir=marathon_tools.DEFAULT_SOA_DIR,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )
    bounce_timers.setup_marathon.stop()

    completed_msg = "setup marathon completed with exit code {} for {}.{}".format(
        return_code, service, instance,
    )
    self.log.info(completed_msg)

    if not bounce_again_in_seconds:
        bounce_timers.bounce_length.stop()
        self.log.info("{}.{} in steady state".format(service, instance))
    else:
        bounce_timers.processed_by_worker.start()
        self.log.info(
            "{}.{} not in steady state so bouncing again in {} "
            "seconds".format(service, instance, bounce_again_in_seconds))

    return BounceResults(bounce_again_in_seconds, return_code, bounce_timers)
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Optional[Collection[Tuple[MarathonApp, MarathonClient]]],
) -> Tuple[int, Optional[float]]:
    """Deploy the given service instance and process the result.

    The whole operation runs under the per-instance zookeeper bounce lock.
    A sensu event is sent with the outcome of ``setup_service``; if one of the
    handled exceptions is raised, a CRITICAL event carrying the traceback is
    sent instead.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps_with_clients: (app, client) pairs for the marathon
        apps to consider; when None they are fetched from the clients that
        serve this instance
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again.
        status is 1 when the configuration could not be read or one of the
        handled exceptions was raised, and 0 in every other case (including
        "no deployments" and "lock already held").
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                # Not an error: there is simply nothing to deploy here.
                log.debug(
                    "No deployments found for %s.%s in cluster %s. Skipping."
                    % (service, instance, load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                    (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            # Callers may pass a pre-fetched app list; only query marathon
            # when they did not.
            if marathon_apps_with_clients is None:
                marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
                    clients=clients.get_all_clients_for_service(
                        job_config=service_instance_config),
                    embed_tasks=True,
                )

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                    )
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                # NOTE(review): `status` only selects the sensu status; the
                # returned code is 0 even when setup_service reported a
                # non-zero status -- confirm this is intended.
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig, NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        # Another process is already bouncing this instance.
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
def status_marathon_job(
    service: str,
    instance: str,
    cluster: str,
    soa_dir: str,
    dashboards: Dict[marathon_tools.MarathonClient, str],
    normal_instance_count: int,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    desired_app_id: str,
    verbose: int,
) -> Tuple[List[MarathonTask], str]:
    """Render the marathon status of one service instance.

    Every marathon app matching service.instance contributes its tasks, its
    running-instance count and a block of output. The deploy status shown in
    the summary is that of the app whose id equals ``desired_app_id`` (leading
    slashes ignored); if no such app is found it stays "Waiting for bounce".
    With ``verbose > 0`` and autoscaling info available, an autoscaling table
    is included.

    :returns: a tuple of (tasks of all matching apps, summary followed by the
        other output sections, joined with newlines)
    """
    service_clients = clients.get_all_clients_for_service(job_config)
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=service_clients,
        embed_tasks=True,
        service_name=service,
    )

    all_tasks = []
    # Slot 0 is a placeholder, filled with the status_marathon_job_human
    # summary once the totals are known.
    all_output = [""]
    running_instances = 0

    if verbose > 0:
        autoscaling_info = get_autoscaling_info(marathon_apps_with_clients, job_config)
        if autoscaling_info:
            all_output.append(" Autoscaling Info:")
            headers = []
            for field in ServiceAutoscalingInfo._fields:
                headers.append(field.replace("_", " ").capitalize())
            table_lines = format_table(
                [headers, humanize_autoscaling_info(autoscaling_info)]
            )
            all_output.append("\n".join(" %s" % line for line in table_lines))

    deploy_status_for_desired_app = "Waiting for bounce"
    matching_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients
    )
    for app, client in matching_apps_with_clients:
        all_tasks.extend(app.tasks)
        app_deploy_status, app_running_instances, app_output = status_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            verbose=verbose,
        )
        is_desired_app = app.id.lstrip("/") == desired_app_id.lstrip("/")
        if is_desired_app:
            deploy_status_for_desired_app = marathon_tools.MarathonDeployStatus.tostring(
                app_deploy_status
            )

        running_instances += app_running_instances
        all_output.append(app_output)

    all_output[0] = status_marathon_job_human(
        service=service,
        instance=instance,
        deploy_status=deploy_status_for_desired_app,
        desired_app_id=desired_app_id,
        app_count=len(matching_apps_with_clients),
        running_instances=running_instances,
        normal_instance_count=normal_instance_count,
    )

    return all_tasks, "\n".join(all_output)