Exemple #1
0
def setup_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    marathon_apps_with_clients: Sequence[Tuple[MarathonApp, MarathonClient]],
    soa_dir: str,
) -> Tuple[int, str, Optional[float]]:
    """Build the desired Marathon app for a service instance and hand it to deploy_service.

    The app definition comes from ``job_config.format_marathon_app_dict()``; the
    bounce, drain, registration and health settings are read from ``job_config``
    and forwarded to ``deploy_service``, whose result is returned unchanged.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param job_config: The MarathonServiceConfig of the service instance
    :param marathon_apps_with_clients: (MarathonApp, MarathonClient) pairs, passed through to deploy_service
    :param soa_dir: The soa-configs directory used to load the service namespace config
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event.
              When no Docker image is available this is (1, <error message>, None).
    """
    log.info("Setting up instance %s for service %s", instance, service)

    try:
        app_definition = job_config.format_marathon_app_dict()
    except NoDockerImageError:
        # Nothing can be deployed without an image; report the failure to the caller.
        failure_text = (
            "Docker image for {0}.{1} not in deployments.json. Exiting. Has Jenkins deployed it?\n"
        ).format(service, instance)
        log.error(failure_text)
        return 1, failure_text, None

    desired_app_id = app_definition['id']
    namespace_config = marathon_tools.load_service_namespace_config(
        service=service,
        namespace=job_config.get_nerve_namespace(),
        soa_dir=soa_dir,
    )
    log.info("Desired Marathon instance id: %s", desired_app_id)

    # Gather the deploy settings from the job config up front so the final call
    # reads as a plain hand-off.
    bounce_method = job_config.get_bounce_method()
    drain_method_name = job_config.get_drain_method(namespace_config)
    drain_method_params = job_config.get_drain_method_params(namespace_config)
    nerve_ns = job_config.get_nerve_namespace()
    registrations = job_config.get_registrations()
    bounce_health_params = job_config.get_bounce_health_params(namespace_config)
    bounce_margin_factor = job_config.get_bounce_margin_factor()

    return deploy_service(
        service=service,
        instance=instance,
        marathon_jobid=desired_app_id,
        config=app_definition,
        clients=clients,
        marathon_apps_with_clients=marathon_apps_with_clients,
        bounce_method=bounce_method,
        drain_method_name=drain_method_name,
        drain_method_params=drain_method_params,
        nerve_ns=nerve_ns,
        registrations=registrations,
        bounce_health_params=bounce_health_params,
        soa_dir=soa_dir,
        job_config=job_config,
        bounce_margin_factor=bounce_margin_factor,
    )
def filter_autoscaling_tasks(
    marathon_apps: Sequence[MarathonApp],
    all_mesos_tasks: Sequence[Task],
    config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Mapping[str, MarathonTask], Sequence[Task]]:
    """Find the tasks that are serving traffic. We care about this because many tasks have a period of high CPU when
    they first start up, during which they warm up code, load and process data, etc., and we don't want this high load
    to drag our overall load estimate upwards. Allowing these tasks to count towards overall load could cause a cycle of
    scaling up, seeing high load due to new warming-up containers, scaling up, until we hit max_instances.

    However, accidentally omitting a task that actually is serving traffic will cause us to underestimate load; this is
    generally much worse than overestimating, since it can cause us to incorrectly scale down or refuse to scale up when
    necessary. For this reason, we look at several sources of health information, and if they disagree, assume the task
    is serving traffic.

    :param marathon_apps: Marathon apps to inspect; only apps whose id (leading "/" stripped) starts with this
                          service instance's job id prefix are considered
    :param all_mesos_tasks: Mesos tasks, each indexable by "id"
    :param config: The config of the service instance being autoscaled
    :param system_paasta_config: System config, passed through to the smartstack lookup
    :returns: A tuple of (healthy marathon tasks keyed by task id, the mesos tasks whose "id" is one of those keys)
    :raises MetricsProviderNoDataError: if no task is considered healthy
    """
    job_id_prefix = "{}{}".format(
        format_job_id(service=config.service, instance=config.instance),
        MESOS_TASK_SPACER,
    )

    # Pass the prefix as a logger argument so formatting is deferred to the logging framework.
    log.info("Inspecting %s for autoscaling", job_id_prefix)

    relevant_tasks_by_app: Dict[MarathonApp, List[MarathonTask]] = {
        app: app.tasks
        for app in marathon_apps
        if app.id.lstrip("/").startswith(job_id_prefix)
    }

    # Get a dict of healthy tasks, we assume tasks with no healthcheck defined are healthy.
    # We assume tasks with no healthcheck results but a defined healthcheck to be unhealthy, unless they are "old" in
    # which case we assume that Marathon has screwed up and stopped healthchecking but that they are healthy.
    healthy_marathon_tasks: Dict[str, MarathonTask] = {}
    for app, app_tasks in relevant_tasks_by_app.items():
        for task in app_tasks:
            if (
                is_task_healthy(task)
                or not app.health_checks
                or is_old_task_missing_healthchecks(task, app)
            ):
                healthy_marathon_tasks[task.id] = task

    nerve_ns = config.get_nerve_namespace()
    service_namespace_config = load_service_namespace_config(
        service=config.service,
        namespace=nerve_ns,
    )
    if service_namespace_config.is_in_smartstack():
        # Smartstack is a second source of health information: any task it
        # reports is counted as healthy even if Marathon disagreed above.
        candidate_tasks = [
            candidate
            for app_tasks in relevant_tasks_by_app.values()
            for candidate in app_tasks
        ]
        for task in filter_tasks_in_smartstack(
            tasks=candidate_tasks,
            service=config.service,
            nerve_ns=nerve_ns,
            system_paasta_config=system_paasta_config,
            max_hosts_to_query=20,
            haproxy_min_fraction_up=0.01,  # Be very liberal. See docstring above for rationale.
        ):
            healthy_marathon_tasks[task.id] = task

    if not healthy_marathon_tasks:
        raise MetricsProviderNoDataError(
            "Couldn't find any healthy marathon tasks")

    mesos_tasks = [
        mesos_task
        for mesos_task in all_mesos_tasks
        if mesos_task["id"] in healthy_marathon_tasks
    ]
    return (healthy_marathon_tasks, mesos_tasks)
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: Optional[str] = None,
) -> int:
    """Performs a restart or status command on a Marathon service instance

    :param command: String of "restart" or "status"
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: soa-configs directory, used when reporting status
    :param clients: A MarathonClients object; the client for this service is selected from it
    :param job_config: The MarathonServiceConfig of the service instance
    :param app_id: Marathon app id to act on; when empty or None it is derived from job_config
    :returns: A unix-style return code: 0 on success, 1 if the app id had to be
              derived but no Docker image is available for the instance
    :raises NotImplementedError: if command is neither "restart" nor "status"
    """
    system_config = load_system_paasta_config()

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                f"Docker image for {job_id} not in deployments.json. Exiting. Has Jenkins deployed it?"
            )
            return 1

    normal_instance_count = job_config.get_instances()

    current_client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, current_client, cluster)
    elif command == "status":
        paasta_print(
            status_desired_state(service, instance, current_client, job_config)
        )
        dashboards = get_marathon_dashboard_links(clients, system_config)
        tasks, out = status_marathon_job(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            normal_instance_count=normal_instance_count,
            clients=clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=verbose,
        )
        paasta_print(out)
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=job_config.get_nerve_namespace(), soa_dir=soa_dir
        )

        # NOTE(review): soa_dir is not forwarded to status_mesos_tasks here - confirm that is intended.
        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count, verbose)
        )

        # Smartstack backends are only reported when the namespace config has a proxy_port.
        proxy_port = service_namespace_config.get("proxy_port")
        if proxy_port is not None:
            # NOTE(review): soa_dir is not forwarded to this helper either - confirm its default is acceptable.
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service, instance, cluster
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    service_namespace_config=service_namespace_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                )
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError(f"Command {command} is not implemented!")
    return 0