コード例 #1
0
ファイル: test_utils.py プロジェクト: ese/paasta
def test_is_under_replicated_zero():
    """An expected count of zero must not be reported as under-replicated.

    The call is expected to return ``(False, 100.0)``.
    """
    # Positional order: num_available, expected_count, crit_threshold.
    result = utils.is_under_replicated(1, 0, 50)
    assert result == (False, 100.0)
コード例 #2
0
ファイル: test_utils.py プロジェクト: ese/paasta
def test_is_under_replicated_critical():
    """Zero available out of one expected is reported as under-replicated.

    With a threshold of 50 the call is expected to return ``(True, 0.0)``.
    """
    # Positional order: num_available, expected_count, crit_threshold.
    outcome = utils.is_under_replicated(0, 1, 50)
    assert outcome == (True, 0.0)
コード例 #3
0
ファイル: paasta_maintenance.py プロジェクト: somic/paasta
def synapse_replication_is_low(service, instance, system_paasta_config, local_backends):
    """Tell whether the locally visible replication of an instance's registration is low.

    The registration of ``service``/``instance`` is looked up first; its
    service and namespace replace the ones passed in for every later lookup.
    The locally reported backend count is compared against the expected count
    per location using a fixed critical threshold of 80.

    :param service: service name used to look up the registration
    :param instance: instance name used to look up the registration and in the log line
    :param system_paasta_config: object providing the synapse host, port and
        haproxy URL format, and passed through to the smartstack lookup
    :param local_backends: not used by this function
    :returns: the under-replicated flag returned by ``utils.is_under_replicated``
    """
    crit_threshold = 80

    registration = read_registration_for_service_instance(
        service=service, instance=instance,
    )
    # We only actually care about the replication of where we're registering
    service, namespace, _, __ = utils.decompose_job_id(registration)

    replication_by_location = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    total_expected = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace,
    )
    # NOTE(review): this divides by the number of locations returned above and
    # would raise ZeroDivisionError if that mapping is empty - confirm callers
    # guarantee at least one location.
    expected_count_per_location = int(total_expected / len(replication_by_location))

    synapse_name = utils.compose_job_id(service, namespace)
    synapse_host = system_paasta_config.get_default_synapse_host()
    synapse_port = system_paasta_config.get_synapse_port()
    haproxy_url_format = system_paasta_config.get_synapse_haproxy_url_format()
    local_replication = get_replication_for_services(
        synapse_host=synapse_host,
        synapse_port=synapse_port,
        synapse_haproxy_url_format=haproxy_url_format,
        services=[synapse_name],
    )

    # A service missing from the local report counts as zero available.
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, _ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold,
    )
    message = 'Service %s.%s has %d out of %d expected instances' % (
        service, instance, num_available, expected_count_per_location,
    )
    log.info(message)
    return under_replicated
コード例 #4
0
def send_event_if_under_replication(
    service,
    instance,
    cluster,
    expected_count,
    num_available,
    soa_dir,
):
    """Log and send a replication event for one service instance.

    The critical threshold is read from the instance's marathon service
    config. An event is always sent: with status CRITICAL when
    ``is_under_replicated`` flags the instance, with status OK otherwise.

    :param service: service name
    :param instance: instance name; also used as the event namespace
    :param cluster: cluster name
    :param expected_count: number of instances expected to be available
    :param num_available: number of instances currently available
    :param soa_dir: passed through to ``send_event``
    """
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()

    template = (
        'Service %s has %d out of %d expected instances available!\n'
        '(threshold: %d%%)'
    )
    output = template % (full_name, num_available, expected_count, crit_threshold)

    under_replicated, _ = is_under_replicated(num_available, expected_count, crit_threshold)
    if not under_replicated:
        log.info(output)
        status = pysensu_yelp.Status.OK
    else:
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL

    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output,
    )
コード例 #5
0
def synapse_replication_is_low(service, instance, system_paasta_config,
                               local_backends):
    """Tell whether the locally visible replication of an instance's namespace is low.

    The namespace of ``service``/``instance`` is looked up, the expected count
    for that namespace is split evenly (truncated to int) across the reported
    smartstack locations, and the locally reported backend count is compared
    against it with a fixed critical threshold of 80.

    :param service: service name
    :param instance: instance name, used for the namespace lookup and the log line
    :param system_paasta_config: object providing the synapse host, port and
        haproxy URL format, and passed through to the smartstack lookup
    :param local_backends: not used by this function
    :returns: the under-replicated flag returned by ``utils.is_under_replicated``
    """
    crit_threshold = 80

    namespace = read_namespace_for_service_instance(
        service=service, instance=instance,
    )
    replication_by_location = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    total_expected = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace,
    )
    # NOTE(review): would raise ZeroDivisionError if no locations are
    # returned above - confirm callers guarantee at least one.
    location_count = len(replication_by_location)
    expected_count_per_location = int(total_expected / location_count)

    synapse_name = "%s.%s" % (service, namespace)
    synapse_host = system_paasta_config.get_default_synapse_host()
    synapse_port = system_paasta_config.get_synapse_port()
    haproxy_url_format = system_paasta_config.get_synapse_haproxy_url_format()
    local_replication = get_replication_for_services(
        synapse_host=synapse_host,
        synapse_port=synapse_port,
        synapse_haproxy_url_format=haproxy_url_format,
        services=[synapse_name],
    )

    # A service missing from the local report counts as zero available.
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, _ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold,
    )
    message = 'Service %s.%s has %d out of %d expected instances' % (
        service, instance, num_available, expected_count_per_location,
    )
    log.info(message)
    return under_replicated
コード例 #6
0
ファイル: paasta_maintenance.py プロジェクト: ycaihua/paasta
def synapse_replication_is_low(service, instance, system_paasta_config,
                               local_backends):
    """Tell whether the locally visible replication of an instance's registration is low.

    The registration of ``service``/``instance`` is resolved and decomposed;
    the resulting service and namespace are used for every later lookup. The
    locally reported backend count is compared with the expected count per
    location using a fixed critical threshold of 80.

    :param service: service name used to look up the registration
    :param instance: instance name used to look up the registration and in the log line
    :param system_paasta_config: object providing the synapse host, port and
        haproxy URL format, and passed through to the smartstack lookup
    :param local_backends: not used by this function
    :returns: the under-replicated flag returned by ``utils.is_under_replicated``
    """
    crit_threshold = 80

    job_id = read_registration_for_service_instance(service=service,
                                                    instance=instance)
    # We only actually care about the replication of where we're registering
    service, namespace, _, __ = utils.decompose_job_id(job_id)

    info_per_location = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    overall_expected = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace)
    # NOTE(review): an empty result above would make this division raise
    # ZeroDivisionError - confirm that cannot happen for registered services.
    expected_count_per_location = int(overall_expected / len(info_per_location))

    synapse_name = utils.compose_job_id(service, namespace)
    host = system_paasta_config.get_default_synapse_host()
    port = system_paasta_config.get_synapse_port()
    url_format = system_paasta_config.get_synapse_haproxy_url_format()
    local_replication = get_replication_for_services(
        synapse_host=host,
        synapse_port=port,
        synapse_haproxy_url_format=url_format,
        services=[synapse_name],
    )

    # Absent from the local report means zero backends available.
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, _unused_ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log_line = 'Service %s.%s has %d out of %d expected instances' % (
        service, instance, num_available, expected_count_per_location)
    log.info(log_line)
    return under_replicated
コード例 #7
0
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
):
    """Check whether an instance's number of available backends is too low,
    emitting an event to Sensu based on the fraction available and the threshold defined in
    the corresponding yelpsoa config.

    Instances announced under a namespace other than their own name are
    skipped: nothing is checked and no event is sent.

    :param service: A string like example_service
    :param instance: The instance name; it is only checked when it equals its own nerve namespace
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param expected_count: Total expected instance count; it is split evenly
        (truncated to int) across the locations smartstack reports
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it" % (instance, namespace))
        return
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service, namespace=namespace, soa_dir=soa_dir, blacklist=monitoring_blacklist)
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        # .items() rather than the Python-2-only .iteritems(): it behaves the
        # same under sorted() and also works on Python 3.
        for location, available_backends in sorted(smartstack_replication_info.items()):
            # A location that does not list this job counts as zero available.
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            label = 'CRITICAL' if under_replicated else 'OK'
            output += '- Service %s has %d out of %d expected instances in %s (%s: %d%%)\n' % (
                full_name, num_available_in_location, expected_count_per_location, location, label, ratio)
            under_replication_per_location.append(under_replicated)

        # One under-replicated location is enough to make the whole check critical.
        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)
コード例 #8
0
def send_replication_event_if_under_replication(
    instance_config,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
):
    """Log and send a replication event for an instance (or one of its sub-components).

    The critical threshold comes from ``instance_config``. A replication event
    is always sent: CRITICAL, with an explanatory text appended to the output,
    when ``is_under_replicated`` flags the instance; OK otherwise.

    :param instance_config: config object providing ``job_id``, ``service``,
        ``instance``, ``cluster`` and ``get_replication_crit_percentage()``
    :param expected_count: number of instances expected to be available
    :param num_available: number of instances currently available
    :param sub_component: optional component name mentioned in the output
    """
    crit_threshold = instance_config.get_replication_crit_percentage()

    if sub_component is None:
        summary_template = (
            "Service %s has %d out of %d expected instances available!\n"
            "(threshold: %d%%)"
        )
        output = summary_template % (
            instance_config.job_id,
            num_available,
            expected_count,
            crit_threshold,
        )
    else:
        summary_template = (
            "Service %s has %d out of %d expected instances of %s available!\n"
            "(threshold: %d%%)"
        )
        output = summary_template % (
            instance_config.job_id,
            num_available,
            expected_count,
            sub_component,
            crit_threshold,
        )

    under_replicated, _ = is_under_replicated(
        num_available, expected_count, crit_threshold
    )

    if not under_replicated:
        log.info(output)
        status = pysensu_yelp.Status.OK
    else:
        explanation_template = (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that the service PaaSTA can't keep the\n"
            "  requested number of copies up and healthy in the cluster.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply be unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        )
        substitutions = {
            "service": instance_config.service,
            "instance": instance_config.instance,
            "cluster": instance_config.cluster,
        }
        output = output + explanation_template % substitutions
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL

    send_replication_event(
        instance_config=instance_config, status=status, output=output
    )
コード例 #9
0
def check_under_replication(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
) -> Tuple[bool, str, str]:
    """Decide whether a component/sub_component is under-replicated.

    :param instance_config: config providing ``job_id``, ``service``,
        ``instance``, ``cluster`` and ``get_replication_crit_percentage()``
    :param expected_count: number of replicas expected to be available
    :param num_available: number of replicas currently available
    :param sub_component: optional component name mentioned in the output
    :returns: a tuple ``(under_replicated, output, description)``: the flag
        from ``is_under_replicated``, a short one-line summary, and a longer
        human-readable text for logging or monitoring events
    """
    crit_threshold = instance_config.get_replication_crit_percentage()

    # Keep output short, with rest of context in description. This is because
    # by default, Slack-Sensu messages have a 400 char limit, incl. the output.
    # If it is too long, the runbook and tip won't show up.
    if sub_component is None:
        output = "{} has {}/{} replicas available (threshold: {}%)".format(
            instance_config.job_id, num_available, expected_count, crit_threshold
        )
    else:
        output = "{} has {}/{} replicas of {} available (threshold: {}%)".format(
            instance_config.job_id,
            num_available,
            expected_count,
            sub_component,
            crit_threshold,
        )

    under_replicated, _ = is_under_replicated(
        num_available, expected_count, crit_threshold
    )

    if not under_replicated:
        healthy_template = (
            "{} is well-replicated because it has over {}% of its "
            "expected replicas up."
        )
        description = healthy_template.format(instance_config.job_id, crit_threshold)
        return under_replicated, output, description

    alert_template = (
        "This replication alert means that PaaSTA can't keep the\n"
        "requested number of replicas up and healthy in the cluster for "
        "the instance {service}.{instance}.\n"
        "\n"
        "Reasons this might be happening:\n"
        "\n"
        "  The service may simply be unhealthy. There also may not be enough resources\n"
        "  in the cluster to support the requested instance count.\n"
        "\n"
        "Things you can do:\n"
        "\n"
        "  * Increase the instance count\n"
        "  * Fix the cause of the unhealthy service. Try running:\n"
        "\n"
        "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
    )
    description = alert_template.format(
        service=instance_config.service,
        instance=instance_config.instance,
        cluster=instance_config.cluster,
    )
    return under_replicated, output, description
コード例 #10
0
def send_event_if_under_replication(
    service,
    instance,
    cluster,
    expected_count,
    num_available,
    soa_dir,
):
    """Log and send a replication event for one service instance.

    The critical threshold is read from the instance's marathon service
    config. An event is always sent: CRITICAL, with an explanatory text
    appended to the output, when ``is_under_replicated`` flags the instance;
    OK otherwise.

    :param service: service name
    :param instance: instance name; also used as the event namespace
    :param cluster: cluster name
    :param expected_count: number of instances expected to be available
    :param num_available: number of instances currently available
    :param soa_dir: passed through to ``send_event``
    """
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    output = ('Service %s has %d out of %d expected instances available!\n' +
              '(threshold: %d%%)') % (full_name, num_available, expected_count,
                                      crit_threshold)
    under_replicated, _ = is_under_replicated(num_available, expected_count,
                                              crit_threshold)
    if under_replicated:
        # The explanation is only attached to critical events.
        output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that the service PaaSTA can't keep the\n"
            "  requested number of copies up and healthy in the cluster.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply be unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            'service': service,
            'instance': instance,
            'cluster': cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output,
    )
コード例 #11
0
def check_under_replication(
    instance_config,
    expected_count: int,
    num_available: int,
    sub_component: Optional[str] = None,
) -> Tuple[bool, str]:
    """Decide whether a component/sub_component is under-replicated.

    :param instance_config: config providing ``job_id``, ``service``,
        ``instance``, ``cluster`` and ``get_replication_crit_percentage()``
    :param expected_count: number of instances expected to be available
    :param num_available: number of instances currently available
    :param sub_component: optional component name mentioned in the output
    :returns: a tuple ``(under_replicated, output)``: the flag from
        ``is_under_replicated`` and a human-readable text for logging or
        monitoring events; an explanation is appended to the text when the
        flag is set
    """
    crit_threshold = instance_config.get_replication_crit_percentage()

    if sub_component is None:
        summary_template = (
            "Service %s has %d out of %d expected instances available!\n"
            "(threshold: %d%%)"
        )
        output = summary_template % (
            instance_config.job_id,
            num_available,
            expected_count,
            crit_threshold,
        )
    else:
        summary_template = (
            "Service %s has %d out of %d expected instances of %s available!\n"
            "(threshold: %d%%)"
        )
        output = summary_template % (
            instance_config.job_id,
            num_available,
            expected_count,
            sub_component,
            crit_threshold,
        )

    under_replicated, _ = is_under_replicated(
        num_available, expected_count, crit_threshold
    )
    if not under_replicated:
        return (under_replicated, output)

    explanation_template = (
        "\n\n"
        "What this alert means:\n"
        "\n"
        "  This replication alert means that the service PaaSTA can't keep the\n"
        "  requested number of copies up and healthy in the cluster.\n"
        "\n"
        "Reasons this might be happening:\n"
        "\n"
        "  The service may simply be unhealthy. There also may not be enough resources\n"
        "  in the cluster to support the requested instance count.\n"
        "\n"
        "Things you can do:\n"
        "\n"
        "  * Increase the instance count\n"
        "  * Fix the cause of the unhealthy service. Try running:\n"
        "\n"
        "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
    )
    substitutions = {
        "service": instance_config.service,
        "instance": instance_config.instance,
        "cluster": instance_config.cluster,
    }
    output = output + explanation_template % substitutions
    return (under_replicated, output)
コード例 #12
0
def send_event_if_under_replication(
    service,
    instance,
    cluster,
    expected_count,
    num_available,
    soa_dir,
):
    """Log and send a replication event for one service instance.

    The critical threshold is read from the instance's marathon service
    config. An event is always sent: CRITICAL, with an explanatory text
    appended to the output, when ``is_under_replicated`` flags the instance;
    OK otherwise.

    :param service: service name
    :param instance: instance name; also used as the event namespace
    :param cluster: cluster name
    :param expected_count: number of instances expected to be available
    :param num_available: number of instances currently available
    :param soa_dir: passed through to ``send_event``
    """
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    output = ('Service %s has %d out of %d expected instances available!\n' +
              '(threshold: %d%%)') % (full_name, num_available, expected_count, crit_threshold)
    under_replicated, _ = is_under_replicated(num_available, expected_count, crit_threshold)
    if under_replicated:
        # The explanation is only attached to critical events.
        output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that the service PaaSTA can't keep the\n"
            "  requested number of copies up and healthy in the cluster.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply be unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            'service': service,
            'instance': instance,
            'cluster': cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output)
コード例 #13
0
def check_under_registered_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    cr_name: str,
) -> Tuple[bool, str, str]:
    """Check whether too few taskmanagers are reported by the Flink jobmanager overview.

    The check counts as unhealthy when ``cr_name`` is empty, when fetching or
    evaluating the overview raises ``ValueError``, or when
    ``is_under_replicated`` flags the reported count.

    :param instance_config: config providing ``job_id``, ``service``,
        ``instance``, ``cluster`` and ``get_replication_crit_percentage()``
    :param expected_count: number of taskmanagers expected
    :param cr_name: name passed to ``flink_tools.get_flink_jobmanager_overview``;
        an empty string means the dashboard is treated as unavailable
    :returns: a tuple ``(unhealthy, output, description)`` with the boolean
        result, a short summary and a longer human-readable text for logging
        or monitoring events
    """
    if cr_name == "":
        unhealthy = True
        output = f"Dashboard of service {instance_config.job_id} is not available"
    else:
        try:
            overview = flink_tools.get_flink_jobmanager_overview(
                cr_name, instance_config.cluster
            )
            # A missing "taskmanagers" key is treated as zero reported.
            num_reported = overview.get("taskmanagers", 0)
            crit_threshold = instance_config.get_replication_crit_percentage()
            output = (
                f"{instance_config.job_id} has {num_reported}/{expected_count} "
                f"taskmanagers reported by dashboard (threshold: {crit_threshold}%)"
            )
            unhealthy, _ = is_under_replicated(
                num_reported, expected_count, crit_threshold
            )
        except ValueError as e:
            # Any ValueError from the steps above marks the check unhealthy.
            unhealthy = True
            output = (
                f"Dashboard of service {instance_config.job_id} is not available ({e})"
            )

    if not unhealthy:
        description = f"{instance_config.job_id} taskmanager is available"
        return unhealthy, output, description

    description = f"""
This alert means that the Flink dashboard is not reporting the expected
number of taskmanagers.

Reasons this might be happening:

  The service may simply be unhealthy. There also may not be enough resources
  in the cluster to support the requested instance count.

Things you can do:

  * Fix the cause of the unhealthy service. Try running:

     paasta status -s {instance_config.service} -i {instance_config.instance} -c {instance_config.cluster} -vv

"""
    return unhealthy, output, description
コード例 #14
0
def haproxy_backend_report(normal_instance_count, up_backends):
    """Build a human readable, colorized report of the up backends of a
    service that is in smartstack.

    :param normal_instance_count: the expected number of backends
    :param up_backends: the number of backends currently up
    :returns: a one-line string; red "Critical" with count and percentage when
        ``is_under_replicated`` flags the service, green "Healthy" with the
        count otherwise
    """
    # TODO: Take into account a configurable threshold, PAASTA-1102
    crit_threshold = 50
    under_replicated, ratio = is_under_replicated(
        num_available=up_backends,
        expected_count=normal_instance_count,
        crit_threshold=crit_threshold,
    )
    if not under_replicated:
        status = PaastaColors.green("Healthy")
        counts_text = "(%d/%d)" % (up_backends, normal_instance_count)
        count = PaastaColors.green(counts_text)
    else:
        status = PaastaColors.red("Critical")
        counts_text = "(%d/%d, %d%%)" % (up_backends, normal_instance_count, ratio)
        count = PaastaColors.red(counts_text)
    up_string = PaastaColors.bold('UP')
    report_template = "%s - in haproxy with %s total backends %s in this namespace."
    return report_template % (status, count, up_string)
コード例 #15
0
def haproxy_backend_report(normal_instance_count, up_backends):
    """Return a colorized one-line summary of a smartstack service's up backends.

    :param normal_instance_count: the expected number of backends
    :param up_backends: the number of backends currently up
    :returns: the summary string; it is red and includes the percentage when
        ``is_under_replicated`` flags the service, green otherwise
    """
    # TODO: Take into account a configurable threshold, PAASTA-1102
    crit_threshold = 50
    under_replicated, ratio = is_under_replicated(
        num_available=up_backends,
        expected_count=normal_instance_count,
        crit_threshold=crit_threshold,
    )
    if not under_replicated:
        status = PaastaColors.green("Healthy")
        count = PaastaColors.green(
            "(%d/%d)" % (up_backends, normal_instance_count),
        )
    else:
        status = PaastaColors.red("Critical")
        count = PaastaColors.red(
            "(%d/%d, %d%%)" % (up_backends, normal_instance_count, ratio),
        )
    up_string = PaastaColors.bold('UP')
    pieces = (status, count, up_string)
    return "%s - in haproxy with %s total backends %s in this namespace." % pieces
コード例 #16
0
def send_event_if_not_enough_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    num_reported: Optional[int],
    strerror: Optional[str],
) -> None:
    """Log and send a replication event about the taskmanagers reported by the dashboard.

    When ``strerror`` is not ``None`` the dashboard is reported as unavailable
    and the event is CRITICAL. Otherwise the reported count is checked with
    ``is_under_replicated`` and the event is CRITICAL or OK accordingly.
    Critical events get an explanation and a ``paasta status`` hint appended.

    :param instance_config: config providing ``job_id``, ``service``,
        ``instance``, ``cluster`` and ``get_replication_crit_percentage()``
    :param expected_count: number of taskmanagers expected
    :param num_reported: number of taskmanagers reported; only read when
        ``strerror`` is ``None``
    :param strerror: error text describing why the dashboard could not be
        read, or ``None`` when it was read successfully
    """
    # Use the same "is not None" test for both the message and the status, so
    # an empty error string cannot yield an "is not available" message with an
    # OK status.
    dashboard_unavailable = strerror is not None
    under_replicated = False
    if not dashboard_unavailable:
        crit_threshold = instance_config.get_replication_crit_percentage()
        output = (
            "Service %s has %d out of %d expected instances of %s reported by dashboard!\n"
            + "(threshold: %d%%)") % (
                instance_config.job_id,
                num_reported,
                expected_count,
                "taskmanager",
                crit_threshold,
            )
        under_replicated, _ = is_under_replicated(num_reported, expected_count,
                                                  crit_threshold)
    else:
        output = ("Dashboard of service %s is not available!\n" + "(%s)") % (
            instance_config.job_id,
            strerror,
        )
    if under_replicated or dashboard_unavailable:
        output += _event_explanation()
        output += (
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            "service": instance_config.service,
            "instance": instance_config.instance,
            "cluster": instance_config.cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_replication_event(instance_config=instance_config,
                           status=status,
                           output=output)
コード例 #17
0
ファイル: test_utils.py プロジェクト: neurogenesis/paasta
def test_is_under_replicated_critical():
    """With nothing available out of one expected, the result must be ``(True, 0.0)``."""
    expected = (True, 0.0)
    # Positional order: num_available, expected_count, crit_threshold.
    observed = utils.is_under_replicated(0, 1, 50)
    assert observed == expected
コード例 #18
0
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    smartstack_replication_checker,
):
    """Compare an instance's available Smartstack backends per location with
    the expected count and emit a Sensu event with the result.

    Nothing is checked and no event is sent when the instance's primary
    registration differs from its own job id. Otherwise an event is always
    sent: CRITICAL when there is no replication info at all or when any
    location is under-replicated according to the threshold from the
    instance's config, OK otherwise.

    :param service: A string like example_service
    :param instance: A PaaSTA instance, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param expected_count: total expected instance count; split evenly
        (truncated to int) across the reported locations
    :param smartstack_replication_checker: an instance of SmartstackReplicationChecker
    """
    full_name = compose_job_id(service, instance)

    primary_registration = marathon_tools.read_registration_for_service_instance(
        service, instance, soa_dir=soa_dir, cluster=cluster,
    )
    if primary_registration != full_name:
        skip_message = (
            '%s is announced under: %s. '
            'Not checking replication for it'
        ) % (full_name, primary_registration)
        log.debug(skip_message)
        return

    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()

    log.info('Checking instance %s in smartstack', full_name)
    replication_by_location = smartstack_replication_checker.get_replication_for_instance(job_config)
    log.debug('Got smartstack replication info for %s: %s' % (full_name, replication_by_location))

    location_count = len(replication_by_location)
    if location_count == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = (
            'Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
            'is valid!\n'
        ) % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / location_count)
        critical_lines = []
        ok_lines = []

        for location, available_backends in sorted(replication_by_location.items()):
            # A location that does not list this job counts as zero available.
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location,
                expected_count_per_location,
                crit_threshold,
            )
            line_values = (
                full_name,
                num_available_in_location,
                expected_count_per_location,
                location,
                ratio,
            )
            if under_replicated:
                critical_lines.append(
                    '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % line_values
                )
            else:
                ok_lines.append(
                    '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % line_values
                )

        # Critical locations are listed first; the OK ones follow under their
        # own heading when both kinds are present.
        output_critical = ''.join(critical_lines)
        output_ok = ''.join(ok_lines)
        output = output_critical
        if output_critical and output_ok:
            output += '\n\n'
            output += 'The following locations are OK:\n'
        output += output_ok

        # A critical line exists exactly when some location was under-replicated.
        if critical_lines:
            status = pysensu_yelp.Status.CRITICAL
            explanation_template = (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * You can view the logs for the job with:\n"
                "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            )
            output += explanation_template % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)

    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output,
    )
コード例 #19
0
def check_replication_for_instance(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    replication_checker: ReplicationChecker,
    dry_run: bool = False,
) -> bool:
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    Every service discovery provider returned by the replication checker is
    inspected. The instance counts as under-replicated as soon as one provider
    has no replication info at all, or reports too few backends in any location.

    :param instance_config: the LongRunningServiceConfig of the instance to check
    :param expected_count: total number of expected instances; it is split evenly
                           across the locations reported by each provider
    :param replication_checker: an instance of ReplicationChecker
    :param dry_run: Print Sensu event and metrics instead of emitting them
    :returns: True when the instance is not under-replicated anywhere, False otherwise
    """

    crit_threshold = instance_config.get_replication_crit_percentage()

    log.info("Checking instance %s in service discovery providers",
             instance_config.job_id)
    replication_infos = replication_checker.get_replication_for_instance(
        instance_config)

    log.debug(
        f"Got replication info for {instance_config.job_id}: {replication_infos}"
    )
    if yelp_meteorite is not None:
        emit_replication_metrics(
            replication_infos,
            instance_config,
            expected_count,
            dry_run=dry_run,
        )

    service_is_under_replicated = False
    failed_service_discovery_providers = set()
    # Collect one report per provider. Previously only the report of the last
    # provider survived the loop, and an empty `replication_infos` left the
    # variable unbound, crashing with UnboundLocalError when sending the event.
    provider_outputs = []
    for service_discovery_provider, replication_info in replication_infos.items(
    ):
        if not replication_info:
            provider_output = (
                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
            ) % (instance_config.job_id, service_discovery_provider)
            log.error(provider_output)
            service_is_under_replicated = True
            failed_service_discovery_providers.add(service_discovery_provider)
        else:
            # The expected instances are assumed to be spread evenly over the
            # locations this provider knows about.
            expected_count_per_location = int(expected_count /
                                              len(replication_info))
            output_critical = []
            output_ok = []

            for location, available_backends in sorted(
                    replication_info.items()):
                num_available_in_location = available_backends.get(
                    instance_config.job_id, 0)
                under_replicated, ratio = is_under_replicated(
                    num_available_in_location,
                    expected_count_per_location,
                    crit_threshold,
                )
                if under_replicated:
                    output_critical.append(
                        "{} has {}/{} replicas in {} according to {} (CRITICAL: {}%)\n"
                        .format(
                            instance_config.job_id,
                            num_available_in_location,
                            expected_count_per_location,
                            location,
                            service_discovery_provider,
                            ratio,
                        ))
                    failed_service_discovery_providers.add(
                        service_discovery_provider)
                else:
                    output_ok.append(
                        "{} has {}/{} replicas in {} according to {} (OK: {}%)\n"
                        .format(
                            instance_config.job_id,
                            num_available_in_location,
                            expected_count_per_location,
                            location,
                            service_discovery_provider,
                            ratio,
                        ))

            # Critical locations go first so they are the first thing a reader sees.
            provider_output = ", ".join(output_critical)
            if output_critical and output_ok:
                provider_output += ". The following locations are OK: "
            provider_output += ", ".join(output_ok)

            # A critical line exists exactly when some location is under-replicated.
            if output_critical:
                service_is_under_replicated = True
                log.error(provider_output)
            else:
                log.info(provider_output)
        provider_outputs.append(provider_output)

    # Each per-provider report already ends with a newline, so plain
    # concatenation keeps them on separate lines. With a single provider this
    # is exactly the text that was sent before.
    output = "".join(provider_outputs)

    if service_is_under_replicated:
        # Sorted so the alert text does not depend on set iteration order.
        failed_service_discovery_providers_list = ",".join(
            sorted(failed_service_discovery_providers))
        description = (
            "This replication alert means that a {service_discovery_provider} powered loadbalancer\n"
            "doesn't have enough healthy backends. Not having enough healthy backends\n"
            "means that clients of that service will get 503s (http) or connection refused\n"
            "(tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s {service} -i {instance} -c {cluster}\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s {service} -i {instance} -c {cluster} -vv\n"
            "\n"
            "  * Widen {service_discovery_provider} discovery settings\n"
            "  * Increase the instance count\n"
            "\n"
        ).format(
            service=instance_config.service,
            instance=instance_config.instance,
            cluster=instance_config.cluster,
            service_discovery_provider=failed_service_discovery_providers_list,
        )
        status = pysensu_yelp.Status.CRITICAL
    else:
        description = ("{} is well-replicated because it has over {}% of its "
                       "expected replicas up.").format(instance_config.job_id,
                                                       crit_threshold)
        status = pysensu_yelp.Status.OK

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
    return not service_is_under_replicated
コード例 #20
0
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    Nothing is checked (and no event is sent) when the instance is announced
    under a nerve namespace whose name differs from the instance name.

    :param service: A string like example_service
    :param instance: The instance of the service to check, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param expected_count: total number of expected instances; it is split evenly
                           across the locations SmartStack reports
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it", instance, namespace)
        return
    full_name = compose_job_id(service, instance)
    # NOTE(review): soa_dir is not forwarded to this loader although it is
    # passed to the other lookups -- confirm that this is intended.
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s', full_name, smartstack_replication_info)

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        # The expected instances are assumed to be spread evenly over the locations.
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        # .items() rather than the Python-2-only .iteritems(): sorting gives the
        # same result and the code keeps working on Python 3.
        for location, available_backends in sorted(smartstack_replication_info.items()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        # A single under-replicated location is enough to make the whole check critical.
        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            ) % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)
コード例 #21
0
def check_replication_for_instance(
    instance_config: LongRunningServiceConfig,
    expected_count: int,
    replication_checker: ReplicationChecker,
) -> bool:
    """Report to Sensu whether an instance has enough available backends.

    The replication checker is asked for the instance's backends per service
    discovery provider and location. Each location is compared against its
    share of ``expected_count`` using the critical percentage taken from the
    instance config. One Sensu event is sent, carrying the reports of all
    providers.

    :param instance_config: the service instance config to check
    :param expected_count: total number of expected instances, divided evenly
                           between the locations of each provider
    :param replication_checker: an instance of ReplicationChecker
    :returns: True if no provider reported under-replication, else False
    """
    crit_threshold = instance_config.get_replication_crit_percentage()

    log.info("Checking instance %s in service discovery providers",
             instance_config.job_id)
    replication_infos = replication_checker.get_replication_for_instance(
        instance_config)

    log.debug(
        f"Got replication info for {instance_config.job_id}: {replication_infos}"
    )
    if yelp_meteorite is not None:
        emit_replication_metrics(replication_infos, instance_config,
                                 expected_count)

    report_parts = []  # pieces of the event output, in emission order
    failing_providers = set()
    under_replicated_somewhere = False

    for provider, info_by_location in replication_infos.items():
        # A provider without any data is treated as a failure in itself.
        if len(info_by_location) == 0:
            provider_report = (
                "Service %s has no %s replication info. Make sure the discover key in the corresponding config (e.g. smartstack.yaml for Smartstack) is valid!\n"
            ) % (instance_config.job_id, provider)
            log.error(provider_report)
            under_replicated_somewhere = True
            failing_providers.add(provider)
            report_parts.append(provider_report)
            continue

        per_location_target = int(expected_count / len(info_by_location))
        critical_lines = []
        ok_lines = []

        for location, backends in sorted(info_by_location.items()):
            available_here = backends.get(instance_config.job_id, 0)
            is_critical, ratio = is_under_replicated(
                available_here,
                per_location_target,
                crit_threshold,
            )
            line_values = (
                instance_config.job_id,
                available_here,
                per_location_target,
                location,
                provider,
                ratio,
            )
            if is_critical:
                critical_lines.append(
                    "- Service %s has %d out of %d expected instances in %s according to %s (CRITICAL: %d%%)\n"
                    % line_values)
                failing_providers.add(provider)
            else:
                ok_lines.append(
                    "- Service %s has %d out of %d expected instances in %s according to %s (OK: %d%%)\n"
                    % line_values)

        # Critical locations first; healthy ones follow under their own heading.
        sections = list(critical_lines)
        if critical_lines and ok_lines:
            sections.append("\n\n")
            sections.append("The following locations are OK:\n")
        sections.extend(ok_lines)
        provider_report = "".join(sections)

        # There is a critical line exactly when a location is under-replicated.
        if critical_lines:
            under_replicated_somewhere = True
            log.error(provider_report)
        else:
            log.info(provider_report)
        report_parts.append(provider_report)

    if under_replicated_somewhere:
        failing_providers_text = ",".join(failing_providers)
        alert_help = (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that a %(service_discovery_provider)s powered loadbalancer\n"
            "  doesn't have enough healthy backends. Not having enough healthy backends\n"
            "  means that clients of that service will get 503s (http) or connection refused\n"
            "  (tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
            "\n"
            "  * Widen %(service_discovery_provider)s discovery settings\n"
            "  * Increase the instance count\n"
            "\n") % dict(
                service=instance_config.service,
                instance=instance_config.instance,
                cluster=instance_config.cluster,
                service_discovery_provider=failing_providers_text,
            )
        report_parts.append(alert_help)
        status = pysensu_yelp.Status.CRITICAL
    else:
        status = pysensu_yelp.Status.OK

    send_replication_event(instance_config=instance_config,
                           status=status,
                           output="".join(report_parts))

    return not under_replicated_somewhere
def check_smartstack_replication_for_instance(
        instance_config, expected_count,
        smartstack_replication_checker) -> bool:
    """Report to Sensu whether an instance has enough backends in SmartStack.

    The checker is asked for the instance's available backends per location.
    Each location is compared against its share of ``expected_count`` using
    the critical percentage taken from the instance config. A Sensu event
    with the resulting report is sent.

    :param instance_config: the service instance config to check
    :param expected_count: total number of expected instances, divided evenly
                           between the reported locations
    :param smartstack_replication_checker: an instance of SmartstackReplicationChecker
    :returns: True if no location is under-replicated, else False
    """
    crit_threshold = instance_config.get_replication_crit_percentage()

    log.info("Checking instance %s in smartstack", instance_config.job_id)
    replication_by_location = (
        smartstack_replication_checker.get_replication_for_instance(
            instance_config))

    debug_message = "Got smartstack replication info for %s: %s" % (
        instance_config.job_id, replication_by_location)
    log.debug(debug_message)
    if yelp_meteorite is not None:
        emit_replication_metrics(replication_by_location, instance_config,
                                 expected_count)

    # No data at all is reported as critical straight away.
    if len(replication_by_location) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = (
            "Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml "
            "is valid!\n") % instance_config.job_id
        log.error(output)
        send_replication_event(instance_config=instance_config,
                               status=status,
                               output=output)
        return False

    per_location_target = int(expected_count / len(replication_by_location))
    critical_lines = []
    ok_lines = []

    for location, backends in sorted(replication_by_location.items()):
        available_here = backends.get(instance_config.job_id, 0)
        is_critical, ratio = is_under_replicated(
            available_here, per_location_target, crit_threshold)
        line_values = (
            instance_config.job_id,
            available_here,
            per_location_target,
            location,
            ratio,
        )
        if is_critical:
            critical_lines.append(
                "- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n"
                % line_values)
        else:
            ok_lines.append(
                "- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n"
                % line_values)

    # Critical locations first; healthy ones follow under their own heading.
    sections = list(critical_lines)
    if critical_lines and ok_lines:
        sections.append("\n\n")
        sections.append("The following locations are OK:\n")
    sections.extend(ok_lines)

    # There is a critical line exactly when a location is under-replicated.
    under_replicated_somewhere = bool(critical_lines)
    if under_replicated_somewhere:
        status = pysensu_yelp.Status.CRITICAL
        sections.append((
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
            "  doesn't have enough healthy backends. Not having enough healthy backends\n"
            "  means that clients of that service will get 503s (http) or connection refused\n"
            "  (tcp) when trying to connect to it.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply not have enough copies or it could simply be\n"
            "  unhealthy in that location. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * You can view the logs for the job with:\n"
            "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
            "\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
            "\n"
            "  * Widen SmartStack discovery settings\n"
            "  * Increase the instance count\n"
            "\n") % dict(
                service=instance_config.service,
                instance=instance_config.instance,
                cluster=instance_config.cluster,
            ))
        output = "".join(sections)
        log.error(output)
    else:
        status = pysensu_yelp.Status.OK
        output = "".join(sections)
        log.info(output)

    send_replication_event(instance_config=instance_config,
                           status=status,
                           output=output)

    return not under_replicated_somewhere
コード例 #23
0
ファイル: test_utils.py プロジェクト: neurogenesis/paasta
def test_is_under_replicated_zero():
    """An expected count of zero must yield (False, 100.0) from is_under_replicated."""
    expected = (False, float(100))
    # Arguments: 1 instance available, 0 expected, critical threshold of 50.
    assert utils.is_under_replicated(1, 0, 50) == expected