def status_marathon_job_verbose(service, instance, clients, cluster, soa_dir, job_config):
    """Returns detailed information about the marathon apps for a service and instance.
    Does not make assumptions about what the *exact* appid is, but instead does a fuzzy
    match on any marathon apps that match the given service.instance"""
    all_tasks = []
    all_output = []
    # For verbose mode, we want to see *any* matching app, as it may not be the
    # one that we think should be deployed. For example, during a bounce we want
    # to see both the old and the new apps.
    relevant_clients = clients.get_all_clients_for_service(job_config)
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        relevant_clients, embed_tasks=True)
    for app, client in marathon_tools.get_matching_apps_with_clients(
            service, instance, marathon_apps_with_clients):
        tasks, output = get_verbose_status_of_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        all_tasks.extend(tasks)
        all_output.append(output)
    return all_tasks, "\n".join(all_output)
def marathon_instance_status(
    instance_status: Mapping[str, Any],
    service: str,
    instance: str,
    verbose: int,
    include_smartstack: bool,
    include_mesos: bool,
) -> Mapping[str, Any]:
    mstatus: Dict[str, Any] = {}

    job_config = marathon_tools.load_marathon_service_config(
        service, instance, settings.cluster, soa_dir=settings.soa_dir
    )
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=settings.marathon_clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
        service_name=service,
    )
    matching_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients
    )

    mstatus.update(
        marathon_job_status(
            service, instance, job_config, matching_apps_with_clients, verbose
        )
    )

    if include_smartstack:
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service,
            namespace=job_config.get_nerve_namespace(),
            soa_dir=settings.soa_dir,
        )
        if "proxy_port" in service_namespace_config:
            tasks = [
                task for app, _ in matching_apps_with_clients for task in app.tasks
            ]
            mstatus["smartstack"] = marathon_smartstack_status(
                service,
                instance,
                job_config,
                service_namespace_config,
                tasks,
                should_return_individual_backends=verbose > 0,
            )

    if include_mesos:
        mstatus["mesos"] = marathon_mesos_status(service, instance, verbose)

    return mstatus
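# --- Hypothetical usage sketch (not part of PaaSTA) ---------------------------
# Shows the shape of a call to marathon_instance_status() above. The service and
# instance names are made up, and a configured paasta API `settings` object
# (cluster, soa_dir, marathon clients) is assumed to already exist, as it does
# wherever this module actually runs.
def _example_marathon_instance_status() -> None:
    status = marathon_instance_status(
        instance_status={},          # base status dict supplied by the caller (assumption)
        service="example_service",   # hypothetical
        instance="main",             # hypothetical
        verbose=0,
        include_smartstack=True,
        include_mesos=True,
    )
    # The result carries the marathon_job_status() fields, plus "smartstack"
    # (only when the service namespace defines a proxy_port) and "mesos".
    print(status.get("smartstack"), status.get("mesos"))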
def status_marathon_job_verbose(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    cluster: str,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    dashboards: Dict[marathon_tools.MarathonClient, str],
) -> Tuple[List[MarathonTask], str]:
    """Returns detailed information about the marathon apps for a service and instance.
    Does not make assumptions about what the *exact* appid is, but instead does a fuzzy
    match on any marathon apps that match the given service.instance"""
    all_tasks: List[MarathonTask] = []
    all_output: List[str] = []
    # For verbose mode, we want to see *any* matching app, as it may not be the
    # one that we think should be deployed. For example, during a bounce we want
    # to see both the old and the new apps.
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
    )

    autoscaling_info = get_autoscaling_info(clients, job_config)
    if autoscaling_info:
        all_output.append(" Autoscaling Info:")
        headers = [
            field.replace("_", " ").capitalize()
            for field in ServiceAutoscalingInfo._fields
        ]
        table = [headers, autoscaling_info]
        all_output.append('\n'.join(
            [" %s" % line for line in format_table(table)]))

    for app, client in marathon_tools.get_matching_apps_with_clients(
            service, instance, marathon_apps_with_clients):
        tasks, output = get_verbose_status_of_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
        )
        all_tasks.extend(tasks)
        all_output.append(output)
    return all_tasks, "\n".join(all_output)
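# --- Illustrative sketch (not part of PaaSTA) ---------------------------------
# The "Autoscaling Info" table above derives its headers from
# ServiceAutoscalingInfo._fields. The namedtuple below is a stand-in with
# hypothetical field names, used only to show that transformation.
from collections import namedtuple

_FakeAutoscalingInfo = namedtuple(
    "_FakeAutoscalingInfo", ["current_instances", "max_instances", "min_instances"]
)
_example_headers = [
    field.replace("_", " ").capitalize() for field in _FakeAutoscalingInfo._fields
]
# _example_headers == ['Current instances', 'Max instances', 'Min instances']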
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""

    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,  # pass the caller's log level through
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )
    new_client = clients.get_current_client_for_service(job_config)
    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" % len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app, service, nerve_ns, system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % (
            drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." % (
                new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            new_client.scale_app(
                app_id=new_app.id,
                instances=config['instances'] + num_at_risk_tasks,
                force=True,
            )
        # If we have more than the specified number of instances running, we will
        # want to drain some of them. We will start by draining any tasks running
        # on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents the extra instances that are configured in marathon
            # but don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new
            # task in that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client), []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % (
                bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time", None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
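# --- Illustrative sketch (not part of PaaSTA) ---------------------------------
# The scale-down branch of deploy_service() above removes tasks in a fixed
# priority order: draining tasks first, then unhappy, then at-risk, and only
# then happy tasks. pick_tasks_to_remove() restates that bookkeeping in
# isolation; the function name and bucket layout are hypothetical.
from typing import Dict, List


def pick_tasks_to_remove(
    buckets: Dict[str, List[str]], num_tasks_to_scale: int
) -> Dict[str, List[str]]:
    """Choose which tasks to take from each bucket, in priority order.

    >>> pick_tasks_to_remove(
    ...     {"draining": ["d1"], "unhappy": [], "at_risk": ["a1", "a2"], "happy": ["h1", "h2"]},
    ...     num_tasks_to_scale=3,
    ... )
    {'draining': ['d1'], 'unhappy': [], 'at_risk': ['a1', 'a2'], 'happy': []}
    """
    to_remove: Dict[str, List[str]] = {}
    for state in ("draining", "unhappy", "at_risk", "happy"):
        tasks = buckets.get(state, [])
        take = min(len(tasks), num_tasks_to_scale)
        to_remove[state] = tasks[:take]
        num_tasks_to_scale -= take
    return to_remove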
def status_marathon_job(
    service: str,
    instance: str,
    cluster: str,
    soa_dir: str,
    dashboards: Dict[marathon_tools.MarathonClient, str],
    normal_instance_count: int,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    desired_app_id: str,
    verbose: int,
) -> Tuple[List[MarathonTask], str]:
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
        service_name=service,
    )
    all_tasks = []
    all_output = [
        ""
    ]  # One entry that will be replaced with status_marathon_job_human output later.
    running_instances = 0

    if verbose > 0:
        autoscaling_info = get_autoscaling_info(marathon_apps_with_clients, job_config)
        if autoscaling_info:
            all_output.append(" Autoscaling Info:")
            headers = [
                field.replace("_", " ").capitalize()
                for field in ServiceAutoscalingInfo._fields
            ]
            table = [headers, humanize_autoscaling_info(autoscaling_info)]
            all_output.append(
                "\n".join([" %s" % line for line in format_table(table)])
            )

    deploy_status_for_desired_app = "Waiting for bounce"
    matching_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients
    )
    for app, client in matching_apps_with_clients:
        all_tasks.extend(app.tasks)
        (
            deploy_status_for_current_app,
            running_instances_for_current_app,
            out,
        ) = status_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            verbose=verbose,
        )
        if app.id.lstrip("/") == desired_app_id.lstrip("/"):
            deploy_status_for_desired_app = marathon_tools.MarathonDeployStatus.tostring(
                deploy_status_for_current_app
            )

        running_instances += running_instances_for_current_app
        all_output.append(out)

    all_output[0] = status_marathon_job_human(
        service=service,
        instance=instance,
        deploy_status=deploy_status_for_desired_app,
        desired_app_id=desired_app_id,
        app_count=len(matching_apps_with_clients),
        running_instances=running_instances,
        normal_instance_count=normal_instance_count,
    )

    return all_tasks, "\n".join(all_output)