Example #1
def emit_replication_metrics(
    replication_infos: Mapping[str, Mapping[str, Mapping[str, int]]],
    instance_config: LongRunningServiceConfig,
    expected_count: int,
) -> None:
    for provider, replication_info in replication_infos.items():
        meteorite_dims = {
            "paasta_service": instance_config.service,
            "paasta_cluster": instance_config.cluster,
            "paasta_instance": instance_config.instance,
            "paasta_pool": instance_config.get_pool(),
            "service_discovery_provider": provider,
        }

        num_available_backends = 0
        for available_backends in replication_info.values():
            num_available_backends += available_backends.get(
                instance_config.job_id, 0)
        available_backends_gauge = yelp_meteorite.create_gauge(
            "paasta.service.available_backends", meteorite_dims)
        available_backends_gauge.set(num_available_backends)

        critical_percentage = instance_config.get_replication_crit_percentage()
        num_critical_backends = critical_percentage * expected_count / 100.0
        critical_backends_gauge = yelp_meteorite.create_gauge(
            "paasta.service.critical_backends", meteorite_dims)
        critical_backends_gauge.set(num_critical_backends)

        expected_backends_gauge = yelp_meteorite.create_gauge(
            "paasta.service.expected_backends", meteorite_dims)
        expected_backends_gauge.set(expected_count)
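
For reference, a hypothetical input illustrating the nested shape the annotation above expects (provider -> location -> "service.instance" -> available backend count). The provider, location, and service names here are made up; in practice this structure comes from a ReplicationChecker via check_replication_for_instance (Example #15).

# Hypothetical data only, shaped like Mapping[str, Mapping[str, Mapping[str, int]]].
sample_replication_infos = {
    "smartstack": {
        "uswest1-prod": {"my_service.main": 3},
        "uswest2-prod": {"my_service.main": 2},
    },
}
# emit_replication_metrics(sample_replication_infos, instance_config, expected_count=6)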
Example #2
def smartstack_status(
    service: str,
    instance: str,
    job_config: LongRunningServiceConfig,
    service_namespace_config: ServiceNamespaceConfig,
    pods: Sequence[V1Pod],
    settings: Any,
    should_return_individual_backends: bool = False,
) -> Mapping[str, Any]:

    registration = job_config.get_registrations()[0]
    instance_pool = job_config.get_pool()

    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=kubernetes_tools.get_all_nodes(settings.kubernetes_client),
        system_paasta_config=settings.system_paasta_config,
    )
    node_hostname_by_location = smartstack_replication_checker.get_allowed_locations_and_hosts(
        job_config
    )

    expected_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service=service,
        namespace=instance,
        cluster=settings.cluster,
        instance_type_class=KubernetesDeploymentConfig,
    )
    expected_count_per_location = int(
        expected_smartstack_count / len(node_hostname_by_location)
    )
    smartstack_status: MutableMapping[str, Any] = {
        "registration": registration,
        "expected_backends_per_location": expected_count_per_location,
        "locations": [],
    }

    for location, hosts in node_hostname_by_location.items():
        synapse_host = smartstack_replication_checker.get_first_host_in_pool(
            hosts, instance_pool
        )
        sorted_backends = sorted(
            smartstack_tools.get_backends(
                registration,
                synapse_host=synapse_host,
                synapse_port=settings.system_paasta_config.get_synapse_port(),
                synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
            ),
            key=lambda backend: backend["status"],
            reverse=True,  # put 'UP' backends above 'MAINT' backends
        )

        matched_backends_and_pods = match_backends_and_pods(sorted_backends, pods)
        location_dict = smartstack_tools.build_smartstack_location_dict(
            location, matched_backends_and_pods, should_return_individual_backends
        )
        smartstack_status["locations"].append(location_dict)

    return smartstack_status
Example #3
async def job_status(
    kstatus: MutableMapping[str, Any],
    client: kubernetes_tools.KubeClient,
    job_config: LongRunningServiceConfig,
    pod_list: Sequence[V1Pod],
    replicaset_list: Sequence[V1ReplicaSet],
    verbose: int,
    namespace: str,
) -> None:
    app_id = job_config.get_sanitised_deployment_name()
    kstatus["app_id"] = app_id
    kstatus["pods"] = []
    kstatus["replicasets"] = []
    if verbose > 0:
        num_tail_lines = calculate_tail_lines(verbose)

        kstatus["pods"] = await asyncio.gather(
            *[pod_info(pod, client, num_tail_lines) for pod in pod_list])

        for replicaset in replicaset_list:
            try:
                ready_replicas = replicaset.status.ready_replicas
                if ready_replicas is None:
                    ready_replicas = 0
            except AttributeError:
                ready_replicas = 0

            kstatus["replicasets"].append({
                "name":
                replicaset.metadata.name,
                "replicas":
                replicaset.spec.replicas,
                "ready_replicas":
                ready_replicas,
                "create_timestamp":
                replicaset.metadata.creation_timestamp.timestamp(),
            })

    kstatus["expected_instance_count"] = job_config.get_instances()

    app = kubernetes_tools.get_kubernetes_app_by_name(name=app_id,
                                                      kube_client=client,
                                                      namespace=namespace)
    desired_instances = (job_config.get_instances()
                         if job_config.get_desired_state() != "stop" else 0)
    deploy_status, message = await kubernetes_tools.get_kubernetes_app_deploy_status(
        app=app,
        kube_client=client,
        desired_instances=desired_instances,
    )
    kstatus["deploy_status"] = kubernetes_tools.KubernetesDeployStatus.tostring(
        deploy_status
    )
    kstatus["deploy_status_message"] = message
    kstatus["running_instance_count"] = (app.status.ready_replicas
                                         if app.status.ready_replicas else 0)
    kstatus["create_timestamp"] = app.metadata.creation_timestamp.timestamp()
    kstatus["namespace"] = app.metadata.namespace
Example #4
def autoscaling_status(
    kube_client: kubernetes_tools.KubeClient,
    job_config: LongRunningServiceConfig,
    namespace: str,
):
    status = {}
    hpa = kube_client.autoscaling.read_namespaced_horizontal_pod_autoscaler(
        name=job_config.get_sanitised_deployment_name(), namespace=namespace
    )
    status["min_instances"] = hpa.spec.min_replicas
    status["max_instances"] = hpa.spec.max_replicas
    # Parse metrics sources, based on
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V2beta1ExternalMetricSource.md#v2beta1externalmetricsource
    metric_stats = []
    parser = HPAMetricsParser(hpa)
    if hpa.spec.metrics is not None:
        for metric_spec in hpa.spec.metrics:
            metric_stats.append(parser.parse_target(metric_spec))
    if hpa.status.current_metrics is not None:
        for metric_spec in hpa.status.current_metrics:
            metric_stats.append(parser.parse_current(metric_spec))
    status["metrics"] = metric_stats
    status["desired_replicas"] = hpa.status.desired_replicas
    status["last_scale_time"] = (
        hpa.status.last_scale_time.replace(tzinfo=pytz.UTC).isoformat()
        if getattr(hpa.status, "last_scale_time")
        else "N/A"
    )
    return status
Example #5
    def get_allowed_locations_and_hosts(
        self, instance_config: LongRunningServiceConfig
    ) -> Dict[str, Sequence[DiscoveredHost]]:
        """Returns a dict of locations and lists of corresponding mesos slaves
        where deployment of the instance is allowed.

        :param instance_config: An instance of MarathonServiceConfig
        :returns: A dict {"uswest1-prod": [DiscoveredHost(), DiscoveredHost(), ...]}
        """
        discover_location_type = marathon_tools.load_service_namespace_config(
            service=instance_config.service,
            namespace=instance_config.get_nerve_namespace(),
            soa_dir=instance_config.soa_dir,
        ).get_discover()
        attribute_to_slaves = mesos_tools.get_mesos_slaves_grouped_by_attribute(
            slaves=self._mesos_slaves, attribute=discover_location_type
        )
        ret: Dict[str, Sequence[DiscoveredHost]] = {}
        for attr, slaves in attribute_to_slaves.items():
            ret[attr] = [
                DiscoveredHost(
                    hostname=slave["hostname"], pool=slave["attributes"]["pool"]
                )
                for slave in slaves
            ]
        return ret
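
DiscoveredHost is constructed above but not defined in this excerpt; a minimal sketch consistent with the keyword arguments used (the actual PaaSTA definition may differ):

from typing import NamedTuple


class DiscoveredHost(NamedTuple):
    # Assumed fields, mirroring DiscoveredHost(hostname=..., pool=...) above.
    hostname: str
    pool: str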
Example #6
    def get_replication_for_instance(
        self, instance_config: LongRunningServiceConfig
    ) -> Dict[str, Dict[str, Dict[str, int]]]:
        """Returns the number of registered instances in each discoverable
        location for each service discovery provider.

        :param instance_config: An instance of MarathonServiceConfig.
        :returns: a dict {'service_discovery_provider': {'location_type': {'service.instance': int}}}
        """
        replication_infos = {}
        for provider in self._service_discovery_providers:
            replication_info = {}
            attribute_host_dict = self.get_allowed_locations_and_hosts(instance_config)
            instance_pool = instance_config.get_pool()
            for location, hosts in attribute_host_dict.items():
                # Try to get information from all available hosts in the pool before giving up
                hostnames = self.get_hostnames_in_pool(hosts, instance_pool)
                for hostname in hostnames:
                    try:
                        replication_info[location] = self._get_replication_info(
                            location, hostname, instance_config, provider
                        )
                        break
                    except Exception as e:
                        log.warn(
                            f"Error while getting replication info for {location} from {hostname}: {e}"
                        )
                        if hostname == hostnames[-1]:
                            # Last hostname failed, giving up
                            raise
            replication_infos[provider.NAME] = replication_info
        return replication_infos
Example #7
def check_under_replication(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
) -> Tuple[bool, str, str]:
    """Check if a component/sub_component is under-replicated and returns both the result of the check in the form of a
    boolean and a human-readable text to be used in logging or monitoring events.
    """
    crit_threshold = instance_config.get_replication_crit_percentage()

    # Keep output short, with rest of context in description. This is because
    # by default, Slack-Sensu messages have a 400 char limit, incl. the output.
    # If it is too long, the runbook and tip won't show up.
    if sub_component is not None:
        output = ("{} has {}/{} replicas of {} available (threshold: {}%)").format(
            instance_config.job_id,
            num_available,
            expected_count,
            sub_component,
            crit_threshold,
        )
    else:
        output = ("{} has {}/{} replicas available (threshold: {}%)").format(
            instance_config.job_id, num_available, expected_count, crit_threshold
        )

    under_replicated, _ = is_under_replicated(
        num_available, expected_count, crit_threshold
    )
    if under_replicated:
        description = (
            "This replication alert means that PaaSTA can't keep the\n"
            "requested number of replicas up and healthy in the cluster for "
            "the instance {service}.{instance}.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply be unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
        ).format(
            service=instance_config.service,
            instance=instance_config.instance,
            cluster=instance_config.cluster,
        )
    else:
        description = (
            "{} is well-replicated because it has over {}% of its "
            "expected replicas up."
        ).format(instance_config.job_id, crit_threshold)
    return under_replicated, output, description
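
is_under_replicated is used by check_under_replication but is not shown here; a plausible sketch of its contract, returning a boolean plus the availability percentage (the real implementation may differ):

from typing import Tuple


def is_under_replicated(
    num_available: int, expected_count: int, crit_threshold: int
) -> Tuple[bool, float]:
    # Percentage of expected replicas that are actually available.
    if expected_count == 0:
        ratio = 100.0
    else:
        ratio = (num_available / float(expected_count)) * 100
    # Under-replicated when availability falls below the critical threshold.
    return ratio < crit_threshold, ratio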
Example #8
async def autoscaling_status(
    kube_client: kubernetes_tools.KubeClient,
    job_config: LongRunningServiceConfig,
    namespace: str,
) -> KubernetesAutoscalingStatusDict:
    hpa = await kubernetes_tools.get_hpa(
        kube_client,
        name=job_config.get_sanitised_deployment_name(),
        namespace=namespace,
    )
    if hpa is None:
        return KubernetesAutoscalingStatusDict(
            min_instances=-1,
            max_instances=-1,
            metrics=[],
            desired_replicas=-1,
            last_scale_time="unknown (could not find HPA object)",
        )

    # Parse metrics sources, based on
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V2beta2ExternalMetricSource.md#v2beta2externalmetricsource
    parser = HPAMetricsParser(hpa)

    # https://github.com/python/mypy/issues/7217
    metrics_by_name: DefaultDict[str, HPAMetricsDict] = defaultdict(
        lambda: HPAMetricsDict()
    )

    if hpa.spec.metrics is not None:
        for metric_spec in hpa.spec.metrics:
            parsed = parser.parse_target(metric_spec)
            metrics_by_name[parsed["name"]].update(parsed)

    if hpa.status.current_metrics is not None:
        for metric_spec in hpa.status.current_metrics:
            parsed = parser.parse_current(metric_spec)
            if parsed is not None:
                metrics_by_name[parsed["name"]].update(parsed)

    metric_stats = list(metrics_by_name.values())

    last_scale_time = (
        hpa.status.last_scale_time.replace(tzinfo=pytz.UTC).isoformat()
        if getattr(hpa.status, "last_scale_time")
        else "N/A"
    )

    return KubernetesAutoscalingStatusDict(
        min_instances=hpa.spec.min_replicas,
        max_instances=hpa.spec.max_replicas,
        metrics=metric_stats,
        desired_replicas=hpa.status.desired_replicas,
        last_scale_time=last_scale_time,
    )
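
KubernetesAutoscalingStatusDict is returned above but not defined in this excerpt; a hypothetical TypedDict sketch inferred from the fields the function populates (not the actual source definition):

from typing import Any, List, TypedDict  # TypedDict requires Python 3.8+


class KubernetesAutoscalingStatusDict(TypedDict):
    min_instances: int
    max_instances: int
    metrics: List[Any]
    desired_replicas: int
    last_scale_time: str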
Example #9
def check_under_replication(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
) -> Tuple[bool, str]:
    """Check if a component/sub_component is under-replicated and returns both the result of the check in the form of a
    boolean and a human-readable text to be used in logging or monitoring events.
    """
    crit_threshold = instance_config.get_replication_crit_percentage()
    if sub_component is not None:
        output = (
            "Service %s has %d out of %d expected instances of %s available! (threshold: %d%%)"
        ) % (
            instance_config.job_id,
            num_available,
            expected_count,
            sub_component,
            crit_threshold,
        )
    else:
        output = (
            "Service %s has %d out of %d expected instances available! (threshold: %d%%)"
        ) % (instance_config.job_id, num_available, expected_count,
             crit_threshold)
    under_replicated, _ = is_under_replicated(num_available, expected_count,
                                              crit_threshold)
    if under_replicated:
        output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that the service PaaSTA can't keep the\n"
            "  requested number of copies up and healthy in the cluster.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply be unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            "service": instance_config.service,
            "instance": instance_config.instance,
            "cluster": instance_config.cluster,
        }
    return under_replicated, output
Example #10
    def get_replication_for_instance(
        self, instance_config: LongRunningServiceConfig
    ) -> Dict[str, Dict[str, int]]:
        """Returns the number of registered instances in each discoverable location.

        :param instance_config: An instance of MarathonServiceConfig.
        :returns: a dict {'location_type': {'service.instance': int}}
        """
        replication_info = {}
        attribute_host_dict = self.get_allowed_locations_and_hosts(
            instance_config)
        instance_pool = instance_config.get_pool()
        for location, hosts in attribute_host_dict.items():
            hostname = self.get_first_host_in_pool(hosts, instance_pool)
            replication_info[location] = self._get_replication_info(
                location, hostname, instance_config)
        return replication_info
Example #11
def autoscaling_status(
    kube_client: kubernetes_tools.KubeClient,
    job_config: LongRunningServiceConfig,
    namespace: str,
) -> KubernetesAutoscalingStatusDict:
    try:
        hpa = kube_client.autoscaling.read_namespaced_horizontal_pod_autoscaler(
            name=job_config.get_sanitised_deployment_name(),
            namespace=namespace)
    except ApiException as e:
        if e.status == 404:
            return KubernetesAutoscalingStatusDict(
                min_instances=-1,
                max_instances=-1,
                metrics=[],
                desired_replicas=-1,
                last_scale_time="unknown (could not find HPA object)",
            )
        else:
            raise

    # Parse metrics sources, based on
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V2beta1ExternalMetricSource.md#v2beta1externalmetricsource
    metric_stats = []
    parser = HPAMetricsParser(hpa)
    if hpa.spec.metrics is not None:
        for metric_spec in hpa.spec.metrics:
            metric_stats.append(parser.parse_target(metric_spec))
    if hpa.status.current_metrics is not None:
        for metric_spec in hpa.status.current_metrics:
            metric_stats.append(parser.parse_current(metric_spec))
    last_scale_time = (
        hpa.status.last_scale_time.replace(tzinfo=pytz.UTC).isoformat()
        if getattr(hpa.status, "last_scale_time")
        else "N/A"
    )

    return KubernetesAutoscalingStatusDict(
        min_instances=hpa.spec.min_replicas,
        max_instances=hpa.spec.max_replicas,
        metrics=metric_stats,
        desired_replicas=hpa.status.desired_replicas,
        last_scale_time=last_scale_time,
    )
Example #12
    def get_allowed_locations_and_hosts(
        self, instance_config: LongRunningServiceConfig
    ) -> Dict[str, Sequence[DiscoveredHost]]:
        discover_location_type = kubernetes_tools.load_service_namespace_config(
            service=instance_config.service,
            namespace=instance_config.get_nerve_namespace(),
            soa_dir=instance_config.soa_dir,
        ).get_discover()

        attribute_to_nodes = kubernetes_tools.get_nodes_grouped_by_attribute(
            nodes=self.nodes, attribute=discover_location_type)
        ret: Dict[str, Sequence[DiscoveredHost]] = {}
        for attr, nodes in attribute_to_nodes.items():
            ret[attr] = [
                DiscoveredHost(
                    hostname=node.metadata.labels["yelp.com/hostname"],
                    pool=node.metadata.labels["yelp.com/pool"],
                ) for node in nodes
            ]
        return ret
Example #13
async def job_status(
    kstatus: MutableMapping[str, Any],
    client: kubernetes_tools.KubeClient,
    job_config: LongRunningServiceConfig,
    pod_list: Sequence[V1Pod],
    replicaset_list: Sequence[V1ReplicaSet],
    verbose: int,
    namespace: str,
) -> None:
    app_id = job_config.get_sanitised_deployment_name()
    kstatus["app_id"] = app_id
    kstatus["pods"] = []
    kstatus["replicasets"] = []
    if verbose > 0:
        num_tail_lines = calculate_tail_lines(verbose)

        for pod in pod_list:
            container_statuses = pod.status.container_statuses or []
            containers = [
                dict(
                    name=container.name,
                    tail_lines=await get_tail_lines_for_kubernetes_container(
                        client, pod, container, num_tail_lines,
                    ),
                )
                for container in container_statuses
            ]
            kstatus["pods"].append(
                {
                    "name": pod.metadata.name,
                    "host": kubernetes_tools.get_pod_hostname(client, pod),
                    "deployed_timestamp": pod.metadata.creation_timestamp.timestamp(),
                    "phase": pod.status.phase,
                    "ready": kubernetes_tools.is_pod_ready(pod),
                    "containers": containers,
                    "reason": pod.status.reason,
                    "message": pod.status.message,
                }
            )
        for replicaset in replicaset_list:
            try:
                ready_replicas = replicaset.status.ready_replicas
                if ready_replicas is None:
                    ready_replicas = 0
            except AttributeError:
                ready_replicas = 0

            kstatus["replicasets"].append(
                {
                    "name": replicaset.metadata.name,
                    "replicas": replicaset.spec.replicas,
                    "ready_replicas": ready_replicas,
                    "create_timestamp": replicaset.metadata.creation_timestamp.timestamp(),
                }
            )

    kstatus["expected_instance_count"] = job_config.get_instances()

    app = kubernetes_tools.get_kubernetes_app_by_name(
        name=app_id, kube_client=client, namespace=namespace
    )
    deploy_status = kubernetes_tools.get_kubernetes_app_deploy_status(
        app=app, desired_instances=job_config.get_instances()
    )
    kstatus["deploy_status"] = kubernetes_tools.KubernetesDeployStatus.tostring(
        deploy_status
    )
    kstatus["running_instance_count"] = (
        app.status.ready_replicas if app.status.ready_replicas else 0
    )
    kstatus["create_timestamp"] = app.metadata.creation_timestamp.timestamp()
    kstatus["namespace"] = app.metadata.namespace
Example #14
def mesh_status(
    service: str,
    service_mesh: ServiceMesh,
    instance: str,
    job_config: LongRunningServiceConfig,
    service_namespace_config: ServiceNamespaceConfig,
    pods: Sequence[V1Pod],
    settings: Any,
    should_return_individual_backends: bool = False,
) -> Mapping[str, Any]:

    registration = job_config.get_registrations()[0]
    instance_pool = job_config.get_pool()

    replication_checker = KubeSmartstackEnvoyReplicationChecker(
        nodes=kubernetes_tools.get_all_nodes(settings.kubernetes_client),
        system_paasta_config=settings.system_paasta_config,
    )
    node_hostname_by_location = replication_checker.get_allowed_locations_and_hosts(
        job_config)

    expected_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service=service,
        namespace=job_config.get_nerve_namespace(),
        cluster=settings.cluster,
        instance_type_class=KubernetesDeploymentConfig,
    )
    expected_count_per_location = int(expected_smartstack_count /
                                      len(node_hostname_by_location))
    mesh_status: MutableMapping[str, Any] = {
        "registration": registration,
        "expected_backends_per_location": expected_count_per_location,
        "locations": [],
    }

    for location, hosts in node_hostname_by_location.items():
        host = replication_checker.get_first_host_in_pool(hosts, instance_pool)
        if service_mesh == ServiceMesh.SMARTSTACK:
            mesh_status["locations"].append(
                _build_smartstack_location_dict(
                    synapse_host=host,
                    synapse_port=settings.system_paasta_config.get_synapse_port(),
                    synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
                    registration=registration,
                    pods=pods,
                    location=location,
                    should_return_individual_backends=should_return_individual_backends,
                )
            )
        elif service_mesh == ServiceMesh.ENVOY:
            mesh_status["locations"].append(
                _build_envoy_location_dict(
                    envoy_host=host,
                    envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
                    envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
                    registration=registration,
                    pods=pods,
                    location=location,
                    should_return_individual_backends=should_return_individual_backends,
                )
            )
    return mesh_status
Example #15
def check_replication_for_instance(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    replication_checker: ReplicationChecker,
) -> bool:
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param instance_config: an instance of MarathonServiceConfig
    :param replication_checker: an instance of ReplicationChecker
    """

    crit_threshold = instance_config.get_replication_crit_percentage()

    log.info("Checking instance %s in service discovery providers",
             instance_config.job_id)
    replication_infos = replication_checker.get_replication_for_instance(
        instance_config)

    log.debug(
        f"Got replication info for {instance_config.job_id}: {replication_infos}"
    )
    if yelp_meteorite is not None:
        emit_replication_metrics(
            replication_infos,
            instance_config,
            expected_count,
        )

    combined_output = ""
    service_is_under_replicated = False
    failed_service_discovery_providers = set()
    for service_discovery_provider, replication_info in replication_infos.items():
        if len(replication_info) == 0:
            output = (
                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
            ) % (instance_config.job_id, service_discovery_provider)
            log.error(output)
            service_is_under_replicated = True
            failed_service_discovery_providers.add(service_discovery_provider)
        else:
            expected_count_per_location = int(expected_count /
                                              len(replication_info))
            output = ""
            output_critical = ""
            output_ok = ""
            under_replication_per_location = []

            for location, available_backends in sorted(
                    replication_info.items()):
                num_available_in_location = available_backends.get(
                    instance_config.job_id, 0)
                under_replicated, ratio = is_under_replicated(
                    num_available_in_location,
                    expected_count_per_location,
                    crit_threshold,
                )
                if under_replicated:
                    output_critical += (
                        "- Service %s has %d out of %d expected instances in %s according to %s (CRITICAL: %d%%)\n"
                        % (
                            instance_config.job_id,
                            num_available_in_location,
                            expected_count_per_location,
                            location,
                            service_discovery_provider,
                            ratio,
                        ))
                    failed_service_discovery_providers.add(
                        service_discovery_provider)
                else:
                    output_ok += (
                        "- Service %s has %d out of %d expected instances in %s according to %s (OK: %d%%)\n"
                        % (
                            instance_config.job_id,
                            num_available_in_location,
                            expected_count_per_location,
                            location,
                            service_discovery_provider,
                            ratio,
                        ))
                under_replication_per_location.append(under_replicated)

            output += output_critical
            if output_critical and output_ok:
                output += "\n\n"
                output += "The following locations are OK:\n"
            output += output_ok

            service_is_under_replicated_anywhere = any(
                under_replication_per_location)
            service_is_under_replicated |= service_is_under_replicated_anywhere
            if service_is_under_replicated_anywhere:
                log.error(output)
            else:
                log.info(output)
        combined_output += output

    if service_is_under_replicated:
        failed_service_discovery_providers_list = ",".join(
            failed_service_discovery_providers)
        combined_output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that a %(service_discovery_provider)s powered loadbalancer\n"
            "  doesn't have enough healthy backends. Not having enough healthy backends\n"
            "  means that clients of that service will get 503s (http) or connection refused\n"
            "  (tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
            "\n"
            "  * Widen %(service_discovery_provider)s discovery settings\n"
            "  * Increase the instance count\n"
            "\n") % {
                "service": instance_config.service,
                "instance": instance_config.instance,
                "cluster": instance_config.cluster,
                "service_discovery_provider":
                failed_service_discovery_providers_list,
            }
        status = pysensu_yelp.Status.CRITICAL
    else:
        status = pysensu_yelp.Status.OK

    send_replication_event(instance_config=instance_config,
                           status=status,
                           output=combined_output)

    return not service_is_under_replicated
Example #16
def check_replication_for_instance(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    replication_checker: ReplicationChecker,
    dry_run: bool = False,
) -> bool:
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param instance_config: an instance of MarathonServiceConfig
    :param replication_checker: an instance of ReplicationChecker
    :param dry_run: Print Sensu event and metrics instead of emitting them
    """

    crit_threshold = instance_config.get_replication_crit_percentage()

    log.info("Checking instance %s in service discovery providers",
             instance_config.job_id)
    replication_infos = replication_checker.get_replication_for_instance(
        instance_config)

    log.debug(
        f"Got replication info for {instance_config.job_id}: {replication_infos}"
    )
    if yelp_meteorite is not None:
        emit_replication_metrics(
            replication_infos,
            instance_config,
            expected_count,
            dry_run=dry_run,
        )

    service_is_under_replicated = False
    failed_service_discovery_providers = set()
    for service_discovery_provider, replication_info in replication_infos.items():
        if len(replication_info) == 0:
            output = (
                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
            ) % (instance_config.job_id, service_discovery_provider)
            log.error(output)
            service_is_under_replicated = True
            failed_service_discovery_providers.add(service_discovery_provider)
        else:
            expected_count_per_location = int(expected_count /
                                              len(replication_info))
            output_critical = []
            output_ok = []
            under_replication_per_location = []

            for location, available_backends in sorted(
                    replication_info.items()):
                num_available_in_location = available_backends.get(
                    instance_config.job_id, 0)
                under_replicated, ratio = is_under_replicated(
                    num_available_in_location,
                    expected_count_per_location,
                    crit_threshold,
                )
                if under_replicated:
                    output_critical.append(
                        "{} has {}/{} replicas in {} according to {} (CRITICAL: {}%)\n"
                        .format(
                            instance_config.job_id,
                            num_available_in_location,
                            expected_count_per_location,
                            location,
                            service_discovery_provider,
                            ratio,
                        ))
                    failed_service_discovery_providers.add(
                        service_discovery_provider)
                else:
                    output_ok.append(
                        "{} has {}/{} replicas in {} according to {} (OK: {}%)\n"
                        .format(
                            instance_config.job_id,
                            num_available_in_location,
                            expected_count_per_location,
                            location,
                            service_discovery_provider,
                            ratio,
                        ))
                under_replication_per_location.append(under_replicated)

            output = ", ".join(output_critical)
            if output_critical and output_ok:
                output += ". The following locations are OK: "
            output += ", ".join(output_ok)

            service_is_under_replicated_anywhere = any(
                under_replication_per_location)
            service_is_under_replicated |= service_is_under_replicated_anywhere
            if service_is_under_replicated_anywhere:
                log.error(output)
            else:
                log.info(output)

    if service_is_under_replicated:
        failed_service_discovery_providers_list = ",".join(
            failed_service_discovery_providers)
        description = (
            "This replication alert means that a {service_discovery_provider} powered loadbalancer\n"
            "doesn't have enough healthy backends. Not having enough healthy backends\n"
            "means that clients of that service will get 503s (http) or connection refused\n"
            "(tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s {service} -i {instance} -c {cluster}\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
            "\n"
            "  * Widen {service_discovery_provider} discovery settings\n"
            "  * Increase the instance count\n"
            "\n"
        ).format(
            service=instance_config.service,
            instance=instance_config.instance,
            cluster=instance_config.cluster,
            service_discovery_provider=failed_service_discovery_providers_list,
        )
        status = pysensu_yelp.Status.CRITICAL
    else:
        description = ("{} is well-replicated because it has over {}% of its "
                       "expected replicas up.").format(instance_config.job_id,
                                                       crit_threshold)
        status = pysensu_yelp.Status.OK

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
    return not service_is_under_replicated