Beispiel #1
0
def emit_replication_metrics(
    replication_infos: Mapping[str, Mapping[str, Mapping[str, int]]],
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    dry_run: bool = False,
) -> None:
    """Report available, critical and expected backend counts as gauges.

    One set of three gauges is produced per service discovery provider found
    in ``replication_infos``; nothing is produced when the mapping is empty.

    :param replication_infos: provider -> location -> job id -> number of
        available backends
    :param instance_config: supplies the service/cluster/instance/pool
        dimensions, the job id to look up and the critical percentage
    :param expected_count: total number of backends expected for the instance
    :param dry_run: print the values that would be sent instead of touching
        yelp_meteorite
    """
    for provider, replication_info in replication_infos.items():
        meteorite_dims = {
            "paasta_service": instance_config.service,
            "paasta_cluster": instance_config.cluster,
            "paasta_instance": instance_config.instance,
            "paasta_pool": instance_config.get_pool(),
            "service_discovery_provider": provider,
        }

        # Sum this job's backends over every location; locations that do not
        # list the job contribute 0.
        num_available_backends = sum(
            available_backends.get(instance_config.job_id, 0)
            for available_backends in replication_info.values()
        )
        _emit_gauge(
            "paasta.service.available_backends",
            meteorite_dims,
            num_available_backends,
            dry_run,
        )

        # The critical threshold is a percentage of the expected count; the
        # division by 100.0 makes this gauge a float.
        critical_percentage = instance_config.get_replication_crit_percentage()
        num_critical_backends = critical_percentage * expected_count / 100.0
        _emit_gauge(
            "paasta.service.critical_backends",
            meteorite_dims,
            num_critical_backends,
            dry_run,
        )

        _emit_gauge(
            "paasta.service.expected_backends",
            meteorite_dims,
            expected_count,
            dry_run,
        )


def _emit_gauge(
    metric_name: str,
    dimensions: Mapping[str, str],
    value: float,
    dry_run: bool,
) -> None:
    """Set one gauge to ``value``, or only print it when ``dry_run`` is set."""
    if dry_run:
        print(f"Would've sent value {value} for metric '{metric_name}'")
        return
    yelp_meteorite.create_gauge(metric_name, dimensions).set(value)
Beispiel #2
0
def check_under_replication(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
) -> Tuple[bool, str, str]:
    """Decide whether an instance (or one of its sub-components) is under-replicated.

    :param instance_config: config of the instance being checked; provides the
        job id, the critical percentage and the service/instance/cluster names
    :param expected_count: number of replicas expected
    :param num_available: number of replicas currently available
    :param sub_component: optional name mentioned in the summary line
    :returns: ``(under_replicated, output, description)`` - the verdict from
        ``is_under_replicated``, a one-line summary, and a longer
        human-readable explanation
    """
    threshold_pct = instance_config.get_replication_crit_percentage()
    job_id = instance_config.job_id

    # The summary stays on one short line and the context goes into the
    # description: Slack-Sensu messages are capped at 400 characters by
    # default (output included), and an overlong output pushes the runbook
    # and tip out of the message.
    if sub_component is None:
        output = "{} has {}/{} replicas available (threshold: {}%)".format(
            job_id, num_available, expected_count, threshold_pct
        )
    else:
        output = "{} has {}/{} replicas of {} available (threshold: {}%)".format(
            job_id, num_available, expected_count, sub_component, threshold_pct
        )

    under_replicated, _ratio = is_under_replicated(
        num_available, expected_count, threshold_pct
    )

    if not under_replicated:
        healthy_description = (
            "{} is well-replicated because it has over {}% of its "
            "expected replicas up."
        ).format(job_id, threshold_pct)
        return under_replicated, output, healthy_description

    alert_template = (
        "This replication alert means that PaaSTA can't keep the\n"
        "requested number of replicas up and healthy in the cluster for "
        "the instance {service}.{instance}.\n"
        "\n"
        "Reasons this might be happening:\n"
        "\n"
        "  The service may simply be unhealthy. There also may not be enough resources\n"
        "  in the cluster to support the requested instance count.\n"
        "\n"
        "Things you can do:\n"
        "\n"
        "  * Increase the instance count\n"
        "  * Fix the cause of the unhealthy service. Try running:\n"
        "\n"
        "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
    )
    alert_description = alert_template.format(
        service=instance_config.service,
        instance=instance_config.instance,
        cluster=instance_config.cluster,
    )
    return under_replicated, output, alert_description
Beispiel #3
0
def check_under_replication(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
) -> Tuple[bool, str]:
    """Decide whether an instance (or one of its sub-components) is under-replicated.

    :param instance_config: config of the instance being checked; provides the
        job id, the critical percentage and the service/instance/cluster names
    :param expected_count: number of instances expected
    :param num_available: number of instances currently available
    :param sub_component: optional name mentioned in the summary line
    :returns: ``(under_replicated, output)`` - the verdict from
        ``is_under_replicated`` and a human-readable text; when the check
        fails the text also carries an explanation and remediation hints
    """
    threshold_pct = instance_config.get_replication_crit_percentage()
    job_id = instance_config.job_id

    if sub_component is None:
        summary = (
            "Service %s has %d out of %d expected instances available! (threshold: %d%%)"
            % (job_id, num_available, expected_count, threshold_pct)
        )
    else:
        summary = (
            "Service %s has %d out of %d expected instances of %s available! (threshold: %d%%)"
            % (job_id, num_available, expected_count, sub_component, threshold_pct)
        )

    under_replicated, _ratio = is_under_replicated(
        num_available, expected_count, threshold_pct
    )
    if not under_replicated:
        return under_replicated, summary

    # Only a failing check gets the long explanation appended.
    names = {
        "service": instance_config.service,
        "instance": instance_config.instance,
        "cluster": instance_config.cluster,
    }
    remediation = (
        "\n\n"
        "What this alert means:\n"
        "\n"
        "  This replication alert means that the service PaaSTA can't keep the\n"
        "  requested number of copies up and healthy in the cluster.\n"
        "\n"
        "Reasons this might be happening:\n"
        "\n"
        "  The service may simply be unhealthy. There also may not be enough resources\n"
        "  in the cluster to support the requested instance count.\n"
        "\n"
        "Things you can do:\n"
        "\n"
        "  * Increase the instance count\n"
        "  * Fix the cause of the unhealthy service. Try running:\n"
        "\n"
        "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
    ) % names
    return under_replicated, summary + remediation
Beispiel #4
0
def check_replication_for_instance(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    replication_checker: ReplicationChecker,
) -> bool:
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param instance_config: an instance of LongRunningServiceConfig
    :param expected_count: total number of instances expected; for each provider
        it is divided by the number of reported locations and truncated to an
        integer to get the per-location expectation
    :param replication_checker: an instance of ReplicationChecker
    :returns: True when no provider reported under-replication, False otherwise
    """

    crit_threshold = instance_config.get_replication_crit_percentage()
    job_id = instance_config.job_id

    log.info("Checking instance %s in service discovery providers", job_id)
    replication_infos = replication_checker.get_replication_for_instance(
        instance_config)

    log.debug(f"Got replication info for {job_id}: {replication_infos}")
    if yelp_meteorite is not None:
        emit_replication_metrics(
            replication_infos,
            instance_config,
            expected_count,
        )

    provider_outputs = []
    service_is_under_replicated = False
    failed_service_discovery_providers = set()
    for service_discovery_provider, replication_info in replication_infos.items(
    ):
        if not replication_info:
            # A provider that knows no location at all is treated as a failure.
            output = (
                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
            ) % (job_id, service_discovery_provider)
            log.error(output)
            service_is_under_replicated = True
            failed_service_discovery_providers.add(service_discovery_provider)
        else:
            expected_count_per_location = int(expected_count /
                                              len(replication_info))
            critical_lines = []
            ok_lines = []

            for location, available_backends in sorted(
                    replication_info.items()):
                num_available_in_location = available_backends.get(job_id, 0)
                under_replicated, ratio = is_under_replicated(
                    num_available_in_location,
                    expected_count_per_location,
                    crit_threshold,
                )
                line_args = (
                    job_id,
                    num_available_in_location,
                    expected_count_per_location,
                    location,
                    service_discovery_provider,
                    ratio,
                )
                if under_replicated:
                    critical_lines.append(
                        "- Service %s has %d out of %d expected instances in %s according to %s (CRITICAL: %d%%)\n"
                        % line_args)
                    failed_service_discovery_providers.add(
                        service_discovery_provider)
                else:
                    ok_lines.append(
                        "- Service %s has %d out of %d expected instances in %s according to %s (OK: %d%%)\n"
                        % line_args)

            # Critical locations come first; healthy ones follow under their
            # own heading when both kinds are present.
            output = "".join(critical_lines)
            if critical_lines and ok_lines:
                output += "\n\n"
                output += "The following locations are OK:\n"
            output += "".join(ok_lines)

            if critical_lines:
                service_is_under_replicated = True
                log.error(output)
            else:
                log.info(output)
        provider_outputs.append(output)

    combined_output = "".join(provider_outputs)

    if service_is_under_replicated:
        # Sorted so the alert text does not depend on set iteration order,
        # which can differ from one run to the next.
        failed_service_discovery_providers_list = ",".join(
            sorted(failed_service_discovery_providers))
        combined_output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that a %(service_discovery_provider)s powered loadbalancer\n"
            "  doesn't have enough healthy backends. Not having enough healthy backends\n"
            "  means that clients of that service will get 503s (http) or connection refused\n"
            "  (tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
            "\n"
            "  * Widen %(service_discovery_provider)s discovery settings\n"
            "  * Increase the instance count\n"
            "\n") % {
                "service": instance_config.service,
                "instance": instance_config.instance,
                "cluster": instance_config.cluster,
                "service_discovery_provider":
                failed_service_discovery_providers_list,
            }
        status = pysensu_yelp.Status.CRITICAL
    else:
        status = pysensu_yelp.Status.OK

    send_replication_event(instance_config=instance_config,
                           status=status,
                           output=combined_output)

    return not service_is_under_replicated
Beispiel #5
0
def check_replication_for_instance(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    replication_checker: ReplicationChecker,
    dry_run: bool = False,
) -> bool:
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param instance_config: an instance of LongRunningServiceConfig
    :param expected_count: total number of instances expected; for each provider
        it is divided by the number of reported locations and truncated to an
        integer to get the per-location expectation
    :param replication_checker: an instance of ReplicationChecker
    :param dry_run: Print Sensu event and metrics instead of emitting them
    :returns: True when no provider reported under-replication, False otherwise
    """

    crit_threshold = instance_config.get_replication_crit_percentage()
    job_id = instance_config.job_id

    log.info("Checking instance %s in service discovery providers", job_id)
    replication_infos = replication_checker.get_replication_for_instance(
        instance_config)

    log.debug(f"Got replication info for {job_id}: {replication_infos}")
    if yelp_meteorite is not None:
        emit_replication_metrics(
            replication_infos,
            instance_config,
            expected_count,
            dry_run=dry_run,
        )

    # Collect the text of every provider. Reading the loop variable after the
    # loop would raise UnboundLocalError when there are no providers, and with
    # several providers would report only the last one.
    provider_outputs = []
    service_is_under_replicated = False
    failed_service_discovery_providers = set()
    for service_discovery_provider, replication_info in replication_infos.items(
    ):
        if not replication_info:
            # A provider that knows no location at all is treated as a failure.
            output = (
                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
            ) % (job_id, service_discovery_provider)
            log.error(output)
            service_is_under_replicated = True
            failed_service_discovery_providers.add(service_discovery_provider)
        else:
            expected_count_per_location = int(expected_count /
                                              len(replication_info))
            output_critical = []
            output_ok = []

            for location, available_backends in sorted(
                    replication_info.items()):
                num_available_in_location = available_backends.get(job_id, 0)
                under_replicated, ratio = is_under_replicated(
                    num_available_in_location,
                    expected_count_per_location,
                    crit_threshold,
                )
                line_args = (
                    job_id,
                    num_available_in_location,
                    expected_count_per_location,
                    location,
                    service_discovery_provider,
                    ratio,
                )
                if under_replicated:
                    output_critical.append(
                        "{} has {}/{} replicas in {} according to {} (CRITICAL: {}%)\n"
                        .format(*line_args))
                    failed_service_discovery_providers.add(
                        service_discovery_provider)
                else:
                    output_ok.append(
                        "{} has {}/{} replicas in {} according to {} (OK: {}%)\n"
                        .format(*line_args))

            # Critical locations come first; healthy ones follow after a
            # short lead-in when both kinds are present.
            output = ", ".join(output_critical)
            if output_critical and output_ok:
                output += ". The following locations are OK: "
            output += ", ".join(output_ok)

            if output_critical:
                service_is_under_replicated = True
                log.error(output)
            else:
                log.info(output)
        provider_outputs.append(output)

    # With a single provider this equals that provider's text, so the event
    # is unchanged for that case.
    combined_output = "".join(provider_outputs)

    if service_is_under_replicated:
        # Sorted so the alert text does not depend on set iteration order,
        # which can differ from one run to the next.
        failed_service_discovery_providers_list = ",".join(
            sorted(failed_service_discovery_providers))
        description = (
            "This replication alert means that a {service_discovery_provider} powered loadbalancer\n"
            "doesn't have enough healthy backends. Not having enough healthy backends\n"
            "means that clients of that service will get 503s (http) or connection refused\n"
            "(tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s {service} -i {instance} -c {cluster}\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
            "\n"
            "  * Widen {service_discovery_provider} discovery settings\n"
            "  * Increase the instance count\n"
            "\n"
        ).format(
            service=instance_config.service,
            instance=instance_config.instance,
            cluster=instance_config.cluster,
            service_discovery_provider=failed_service_discovery_providers_list,
        )
        status = pysensu_yelp.Status.CRITICAL
    else:
        description = ("{} is well-replicated because it has over {}% of its "
                       "expected replicas up.").format(job_id, crit_threshold)
        status = pysensu_yelp.Status.OK

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=combined_output,
        description=description,
        dry_run=dry_run,
    )
    return not service_is_under_replicated