def check_service_replication(
    instance_config: MarathonServiceConfig,
    all_tasks_or_pods: Sequence[MarathonTask],
    replication_checker: MesosSmartstackEnvoyReplicationChecker,
) -> Optional[bool]:
    """Checks a service's replication levels based on how the service's replication
    should be monitored. (smartstack/envoy or mesos)

    :param instance_config: an instance of MarathonServiceConfig
    :param all_tasks_or_pods: all Marathon tasks for the service
    :param replication_checker: an instance of MesosSmartstackEnvoyReplicationChecker
    """
    expected_count = instance_config.get_instances()
    log.info("Expecting %d total tasks for %s" % (expected_count, instance_config.job_id))
    proxy_port = get_proxy_port_for_instance(instance_config)

    registrations = instance_config.get_registrations()
    # if the primary registration does not match the service_instance name then
    # the best we can do is check marathon for replication (for now).
    if proxy_port is not None and registrations[0] == instance_config.job_id:
        is_well_replicated = monitoring_tools.check_replication_for_instance(
            instance_config=instance_config,
            expected_count=expected_count,
            replication_checker=replication_checker,
        )
        return is_well_replicated
    else:
        check_healthy_marathon_tasks_for_service_instance(
            instance_config=instance_config,
            expected_count=expected_count,
            all_tasks=all_tasks_or_pods,
        )
        return None
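# A minimal usage sketch (not from the source): how check_service_replication
# might be driven for a batch of instance configs. The driver function name and
# its signature are hypothetical; the return-value semantics in the comments
# follow directly from the function above.
def check_all_services_replication(
    instance_configs: Sequence[MarathonServiceConfig],
    all_tasks: Sequence[MarathonTask],
    replication_checker: MesosSmartstackEnvoyReplicationChecker,
) -> None:
    for instance_config in instance_configs:
        result = check_service_replication(
            instance_config=instance_config,
            all_tasks_or_pods=all_tasks,
            replication_checker=replication_checker,
        )
        # result is True/False for smartstack/envoy-monitored instances, or
        # None when replication was checked via healthy marathon tasks instead.
        log.info("%s well replicated: %s" % (instance_config.job_id, result))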
def status_desired_state(
    service: str,
    instance: str,
    client: marathon_tools.MarathonClient,
    job_config: marathon_tools.MarathonServiceConfig,
) -> str:
    status = get_bouncing_status(service, instance, client, job_config)
    desired_state = desired_state_human(
        job_config.get_desired_state(), job_config.get_instances()
    )
    return f"Desired State: {status} and {desired_state}"
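# A hedged illustration of the string this returns. The "Desired State: ... and ..."
# skeleton is fixed by the f-string above; the component values below are
# hypothetical examples of what get_bouncing_status() and desired_state_human()
# might produce:
#
#     Desired State: Running and Started (3 instances)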
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    try:
        with create_autoscaling_lock(
            marathon_service_config.service, marathon_service_config.instance
        ):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=current_instances,
            )
            num_healthy_instances = len(marathon_tasks)
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=num_healthy_instances,
                persist_data=(not task_data_insufficient),
            )
            safe_downscaling_threshold = int(current_instances * 0.7)
            _record_autoscaling_decision(
                marathon_service_config=marathon_service_config,
                autoscaling_params=autoscaling_params,
                utilization=utilization,
                log_utilization_data=log_utilization_data,
                error=error,
                current_instances=current_instances,
                num_healthy_instances=num_healthy_instances,
                new_instance_count=new_instance_count,
                safe_downscaling_threshold=safe_downscaling_threshold,
                task_data_insufficient=task_data_insufficient,
            )
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    write_to_log(
                        config=marathon_service_config,
                        line="Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                        "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                        "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                        "we make a decision to scale down.",
                        level="debug",
                    )
                    return
                else:
                    set_instances_for_marathon_service(
                        service=marathon_service_config.service,
                        instance=marathon_service_config.instance,
                        instance_count=new_instance_count,
                    )
                    write_to_log(
                        config=marathon_service_config,
                        line="Scaling from %d to %d instances (%s)"
                        % (current_instances, new_instance_count, humanize_error(error)),
                        level="event",
                    )
            else:
                write_to_log(
                    config=marathon_service_config,
                    line="Staying at %d instances (%s)"
                    % (current_instances, humanize_error(error)),
                    level="debug",
                )
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for {service}.{instance} because the lock is held".format(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
            )
        )
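# A simplified sketch of the control relationship implied above; this is an
# assumption for illustration, NOT the real get_error_from_utilization /
# get_new_instance_count logic, which live elsewhere and also apply deadbands,
# min/max clamping, and the 0.7 safe_downscaling_threshold.
def _sketch_proportional_scaling(
    utilization: float, setpoint: float, current_instances: int
) -> int:
    # Error is the relative distance of observed utilization from the setpoint;
    # the instance count is scaled proportionally to close that gap.
    error = (utilization - setpoint) / setpoint
    return max(1, int(round(current_instances * (1 + error))))

# _sketch_proportional_scaling(0.9, 0.6, 10) -> 15  (over setpoint: scale up)
# _sketch_proportional_scaling(0.3, 0.6, 10) -> 5   (under setpoint: scale down)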
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: Optional[str] = None,
) -> int:
    """Performs a restart/status on an instance

    :param command: String of "restart" or "status"
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: root directory of the service configs
    :param clients: a MarathonClients object holding the configured clients
    :param job_config: the instance's MarathonServiceConfig
    :param app_id: optional marathon app id; derived from job_config when omitted
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id
            )
            return 1

    normal_instance_count = job_config.get_instances()

    current_client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, current_client, cluster)
    elif command == "status":
        paasta_print(
            status_desired_state(service, instance, current_client, job_config)
        )
        dashboards = get_marathon_dashboard_links(clients, system_config)
        tasks, out = status_marathon_job(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            normal_instance_count=normal_instance_count,
            clients=clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=verbose,
        )
        paasta_print(out)
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=job_config.get_nerve_namespace(), soa_dir=soa_dir
        )

        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count, verbose)
        )

        proxy_port = service_namespace_config.get("proxy_port")
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service, instance, cluster
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    service_namespace_config=service_namespace_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                )
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
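# A minimal calling sketch (hypothetical, not from the source): how a CLI entry
# point might wire up the arguments perform_command expects. The
# load_marathon_service_config / get_marathon_servers / get_marathon_clients
# helpers are assumed to exist in marathon_tools as used elsewhere in PaaSTA.
def _sketch_cli_entry(service: str, instance: str, cluster: str, soa_dir: str) -> int:
    job_config = marathon_tools.load_marathon_service_config(
        service=service, instance=instance, cluster=cluster, soa_dir=soa_dir
    )
    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(load_system_paasta_config())
    )
    # Unix-style exit code: 0 on success, 1 when the Docker image is missing.
    return perform_command(
        command="status",
        service=service,
        instance=instance,
        cluster=cluster,
        verbose=1,
        soa_dir=soa_dir,
        clients=clients,
        job_config=job_config,
    )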
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    try:
        with create_autoscaling_lock(
            marathon_service_config.service, marathon_service_config.instance
        ):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params['setpoint'],
                current_instances=current_instances,
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=len(marathon_tasks),
            )
            safe_downscaling_threshold = int(current_instances * 0.7)
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    write_to_log(
                        config=marathon_service_config,
                        line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                             'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                             'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                             'we make a decision to scale down.',
                    )
                    return
                if new_instance_count == safe_downscaling_threshold:
                    write_to_log(
                        config=marathon_service_config,
                        line='Autoscaler clamped: %s' % str(log_utilization_data),
                        level='debug',
                    )
                write_to_log(
                    config=marathon_service_config,
                    line='Scaling from %d to %d instances (%s)' % (
                        current_instances, new_instance_count, humanize_error(error),
                    ),
                )
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=new_instance_count,
                )
            else:
                write_to_log(
                    config=marathon_service_config,
                    line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
                    level='debug',
                )
            meteorite_dims = {
                'service_name': marathon_service_config.service,
                'decision_policy': autoscaling_params[DECISION_POLICY_KEY],  # type: ignore
                'paasta_cluster': marathon_service_config.cluster,
                'instance_name': marathon_service_config.instance,
            }
            if yelp_meteorite:
                gauge = yelp_meteorite.create_gauge('paasta.service.instances', meteorite_dims)
                gauge.set(new_instance_count)
                gauge = yelp_meteorite.create_gauge('paasta.service.max_instances', meteorite_dims)
                gauge.set(marathon_service_config.get_max_instances())
                gauge = yelp_meteorite.create_gauge('paasta.service.min_instances', meteorite_dims)
                gauge.set(marathon_service_config.get_min_instances())
    except LockHeldException:
        log.warning("Skipping autoscaling run for {service}.{instance} because the lock is held".format(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
        ))
def marathon_job_status(
    service: str,
    instance: str,
    job_config: marathon_tools.MarathonServiceConfig,
    marathon_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]],
    verbose: int,
) -> MutableMapping[str, Any]:
    job_status_fields: MutableMapping[str, Any] = {
        "app_statuses": [],
        "app_count": len(marathon_apps_with_clients),
        "desired_state": job_config.get_desired_state(),
        "bounce_method": job_config.get_bounce_method(),
        "expected_instance_count": job_config.get_instances(),
        "active_shas": list(get_active_shas_for_marathon_apps(marathon_apps_with_clients)),
    }

    try:
        desired_app_id = job_config.format_marathon_app_dict()["id"]
    except NoDockerImageError:
        error_msg = "Docker image is not in deployments.json."
        job_status_fields["error_message"] = error_msg
        return job_status_fields

    job_status_fields["desired_app_id"] = desired_app_id

    deploy_status_for_desired_app = None
    dashboard_links = get_marathon_dashboard_links(
        settings.marathon_clients, settings.system_paasta_config
    )
    tasks_running = 0
    for app, marathon_client in marathon_apps_with_clients:
        deploy_status = marathon_tools.get_marathon_app_deploy_status(
            marathon_client, app
        )

        app_status = marathon_app_status(
            app,
            marathon_client,
            dashboard_links.get(marathon_client) if dashboard_links else None,
            deploy_status,
            list_tasks=verbose > 0,
        )
        job_status_fields["app_statuses"].append(app_status)

        if app.id.lstrip("/") == desired_app_id.lstrip("/"):
            deploy_status_for_desired_app = marathon_tools.MarathonDeployStatus.tostring(
                deploy_status
            )
            tasks_running += app.tasks_running

    job_status_fields["deploy_status"] = (
        deploy_status_for_desired_app or "Waiting for bounce"
    )
    job_status_fields["running_instance_count"] = tasks_running

    if verbose > 0:
        autoscaling_info = get_autoscaling_info(marathon_apps_with_clients, job_config)
        if autoscaling_info is not None:
            autoscaling_info_dict = autoscaling_info._asdict()

            for field in ("current_utilization", "target_instances"):
                if autoscaling_info_dict[field] is None:
                    del autoscaling_info_dict[field]

            job_status_fields["autoscaling_info"] = autoscaling_info_dict

    return job_status_fields
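# A hedged illustration of the mapping marathon_job_status returns. The keys
# come from the function above; the values shown are hypothetical examples, and
# the shape of the "active_shas" entries is an assumption based on the
# get_active_shas_for_marathon_apps call:
#
# {
#     "app_statuses": [...],            # one marathon_app_status() result per app
#     "app_count": 2,
#     "desired_state": "start",
#     "bounce_method": "crossover",
#     "expected_instance_count": 3,
#     "active_shas": [("abc123", "config000")],
#     "desired_app_id": "service.instance.gitabc123.configconfig000",
#     "deploy_status": "Running",       # or "Waiting for bounce"
#     "running_instance_count": 3,
#     "autoscaling_info": {...},        # only when verbose > 0
# }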