Ejemplo n.º 1
0
def test_load_smartstack_info_for_service(system_paasta_config):
    """Smoke test: load_smartstack_info_for_service runs with its lookups patched out.

    :param system_paasta_config: system configuration object supplied by the caller
    """
    namespace_config_patch = mock.patch(
        'paasta_tools.smartstack_tools.marathon_tools.load_service_namespace_config',
        autospec=True,
    )
    replication_patch = mock.patch(
        'paasta_tools.smartstack_tools.get_smartstack_replication_for_attribute',
        autospec=True,
    )
    call_kwargs = {
        'service': 'service',
        'namespace': 'namespace',
        'soa_dir': 'fake',
        'blacklist': [],
        'system_paasta_config': system_paasta_config,
    }
    # Both patches are entered together and undone when the block exits.
    with namespace_config_patch, replication_patch:
        # No assertions yet: the call only has to complete without raising.
        smartstack_tools.load_smartstack_info_for_service(**call_kwargs)
Ejemplo n.º 2
0
def synapse_replication_is_low(service, instance, system_paasta_config, local_backends):
    """Report whether the local synapse sees too few backends for a service instance.

    The instance's registration is resolved first, and replication is measured
    for the registered service/namespace rather than for the instance itself.
    The expected instance count is split evenly across the locations returned
    by load_smartstack_info_for_service and compared with what the default
    synapse host reports.

    :param service: service name
    :param instance: instance name whose registration is looked up
    :param system_paasta_config: system configuration; provides the synapse host,
        port and haproxy URL format
    :param local_backends: not used by this function; kept so existing callers
        keep working
    :returns: True when replication is considered low, False otherwise
    """
    # Threshold handed to utils.is_under_replicated (presumably a percentage).
    crit_threshold = 80
    reg_svc, reg_namespace, _, __ = utils.decompose_job_id(
        read_registration_for_service_instance(
            service=service, instance=instance
        )
    )
    # We only actually care about the replication of where we're registering
    service, namespace = reg_svc, reg_namespace

    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    if not smartstack_replication_info:
        # Without any location the per-location split below would divide by
        # zero. Report low replication rather than crashing, which is the
        # conservative answer when nothing can be verified.
        log.warning(
            'Service %s.%s has no Smartstack replication info; treating replication as low',
            service, namespace,
        )
        return True

    expected_count = get_expected_instance_count_for_namespace(service=service, namespace=namespace)
    expected_count_per_location = int(expected_count / len(smartstack_replication_info))

    synapse_name = utils.compose_job_id(service, namespace)
    local_replication = get_replication_for_services(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(),
        services=[synapse_name],
    )
    # A service missing from the synapse response counts as zero backends.
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, _ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log.info(
        'Service %s.%s has %d out of %d expected instances',
        service, instance, num_available, expected_count_per_location,
    )
    return under_replicated
Ejemplo n.º 3
0
def synapse_replication_is_low(service, instance, system_paasta_config,
                               local_backends):
    """Report whether the local synapse sees too few backends for a service instance.

    Replication is measured for the service/namespace the instance registers
    under. The expected instance count is split evenly across the locations
    returned by load_smartstack_info_for_service and compared with the number
    of backends the default synapse host reports.

    :param service: service name
    :param instance: instance name whose registration is looked up
    :param system_paasta_config: system configuration; provides the synapse host,
        port and haproxy URL format
    :param local_backends: not used by this function; kept so existing callers
        keep working
    :returns: True when replication is considered low, False otherwise
    """
    # Threshold handed to utils.is_under_replicated (presumably a percentage).
    crit_threshold = 80
    reg_svc, reg_namespace, _, __ = utils.decompose_job_id(
        read_registration_for_service_instance(service=service,
                                               instance=instance))
    # We only actually care about the replication of where we're registering
    service, namespace = reg_svc, reg_namespace

    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    if not smartstack_replication_info:
        # With zero locations the division below would raise
        # ZeroDivisionError; report low replication instead, the conservative
        # answer when nothing can be verified.
        log.warning(
            'Service %s.%s has no Smartstack replication info; '
            'treating replication as low', service, namespace)
        return True

    expected_count = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace)
    expected_count_per_location = int(expected_count /
                                      len(smartstack_replication_info))

    synapse_name = utils.compose_job_id(service, namespace)
    local_replication = get_replication_for_services(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=(
            system_paasta_config.get_synapse_haproxy_url_format()),
        services=[synapse_name],
    )
    # A service missing from the synapse response counts as zero backends.
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, _ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log.info('Service %s.%s has %d out of %d expected instances',
             service, instance, num_available, expected_count_per_location)
    return under_replicated
Ejemplo n.º 4
0
def synapse_replication_is_low(service, instance, system_paasta_config,
                               local_backends):
    """Report whether the local synapse sees too few backends for a service instance.

    The namespace of the instance is looked up, the expected instance count for
    that namespace is split evenly across the locations returned by
    load_smartstack_info_for_service, and the result is compared with the
    number of backends the default synapse host reports.

    :param service: service name
    :param instance: instance name whose namespace is looked up
    :param system_paasta_config: system configuration; provides the synapse host,
        port and haproxy URL format
    :param local_backends: not used by this function; kept so existing callers
        keep working
    :returns: True when replication is considered low, False otherwise
    """
    # Threshold handed to utils.is_under_replicated (presumably a percentage).
    crit_threshold = 80
    namespace = read_namespace_for_service_instance(service=service,
                                                    instance=instance)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    if not smartstack_replication_info:
        # With zero locations the division below would raise
        # ZeroDivisionError; report low replication instead, the conservative
        # answer when nothing can be verified.
        log.warning(
            'Service %s.%s has no Smartstack replication info; '
            'treating replication as low', service, namespace)
        return True

    expected_count = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace)
    expected_count_per_location = int(expected_count /
                                      len(smartstack_replication_info))
    synapse_name = "%s.%s" % (service, namespace)
    local_replication = get_replication_for_services(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=(
            system_paasta_config.get_synapse_haproxy_url_format()),
        services=[synapse_name],
    )
    # A service missing from the synapse response counts as zero backends.
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, _ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log.info('Service %s.%s has %d out of %d expected instances',
             service, instance, num_available, expected_count_per_location)
    return under_replicated
Ejemplo n.º 5
0
def test_load_smartstack_info_for_service():
    """Smoke test: load_smartstack_info_for_service runs with its lookups patched out."""
    # A multi-manager ``with`` replaces contextlib.nested, which is deprecated
    # in Python 2.7 and absent from Python 3. The mocks themselves are never
    # inspected, so they are not bound to names.
    with mock.patch(
        'paasta_tools.smartstack_tools.marathon_tools.load_service_namespace_config',
        autospec=True,
    ), mock.patch(
        'paasta_tools.smartstack_tools.get_smartstack_replication_for_attribute',
        autospec=True,
    ):
        # just a smoke test for now.
        smartstack_tools.load_smartstack_info_for_service(
            service='service',
            namespace='namespace',
            soa_dir='fake',
            blacklist=[],
            system_paasta_config=SystemPaastaConfig({}, '/fake/config'),
        )
Ejemplo n.º 6
0
def test_load_smartstack_info_for_service():
    """Smoke test: load_smartstack_info_for_service runs with its lookups patched out."""
    # contextlib.nested is deprecated in Python 2.7 and absent from Python 3;
    # the multi-manager ``with`` form enters and exits the same two patches.
    # The mocks are never inspected, so no ``as`` bindings are needed.
    with mock.patch(
            'paasta_tools.smartstack_tools.marathon_tools.load_service_namespace_config',
            autospec=True,
    ), mock.patch(
            'paasta_tools.smartstack_tools.get_smartstack_replication_for_attribute',
            autospec=True,
    ):
        # just a smoke test for now.
        smartstack_tools.load_smartstack_info_for_service(
            service='service',
            namespace='namespace',
            soa_dir='fake',
            blacklist=[],
            system_paasta_config=SystemPaastaConfig({}, '/fake/config'),
        )
Ejemplo n.º 7
0
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    Nothing is checked and no event is sent when the instance's primary
    registration differs from the job id composed from service and instance.

    :param service: A string like example_service
    :param instance: A PaaSTA instance, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param expected_count: Total number of expected instances; it is split evenly
        (truncated with int()) across the locations SmartStack reports
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    full_name = compose_job_id(service, instance)

    primary_registration = marathon_tools.read_registration_for_service_instance(
        service, instance, soa_dir=soa_dir)

    if primary_registration != full_name:
        log.debug('%s is announced under: %s. '
                  'Not checking replication for it',
                  full_name, primary_registration)
        return

    # NOTE(review): soa_dir is not forwarded to this lookup, unlike the other
    # lookups in this function -- confirm that is intended.
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=instance,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s',
              full_name, smartstack_replication_info)

    if not smartstack_replication_info:
        # No locations at all: nothing to divide the expected count by.
        status = pysensu_yelp.Status.CRITICAL
        output = (
            'Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
            'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count /
                                          len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        # .items() works on both Python 2 and 3; sorting keeps the report
        # ordered by location.
        for location, available_backends in sorted(
                smartstack_replication_info.items()):
            # A location that does not list this job counts as zero backends.
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location,
                crit_threshold)
            # The two report lines differ only in the severity label.
            severity = 'CRITICAL' if under_replicated else 'OK'
            output += '- Service %s has %d out of %d expected instances in %s (%s: %d%%)\n' % (
                full_name, num_available_in_location,
                expected_count_per_location, location, severity, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            # One under-replicated location is enough to make the check critical.
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * You can view the logs for the job with:\n"
                "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n") % {
                    'service': service,
                    'instance': instance,
                    'cluster': cluster,
                }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service,
               namespace=instance,
               cluster=cluster,
               soa_dir=soa_dir,
               status=status,
               output=output)
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    Returns early, without sending an event, when the instance's primary
    registration differs from the job id composed from service and instance.

    :param service: A string like example_service
    :param instance: A PaaSTA instance, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param expected_count: Total number of expected instances; it is split evenly
        (truncated with int()) across the locations SmartStack reports
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    full_name = compose_job_id(service, instance)

    primary_registration = marathon_tools.read_registration_for_service_instance(
        service, instance, soa_dir=soa_dir
    )

    if primary_registration != full_name:
        log.debug(
            '%s is announced under: %s. '
            'Not checking replication for it', full_name, primary_registration
        )
        return

    # NOTE(review): soa_dir is not forwarded to this lookup, unlike the other
    # lookups in this function -- confirm that is intended.
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=instance,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s', full_name, smartstack_replication_info)

    if not smartstack_replication_info:
        # No locations at all: nothing to divide the expected count by.
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        report_lines = []
        under_replication_per_location = []

        # .items() works on both Python 2 and 3; sorting keeps the report
        # ordered by location.
        for location, available_backends in sorted(smartstack_replication_info.items()):
            # A location that does not list this job counts as zero backends.
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            # The two report lines differ only in the severity label.
            severity = 'CRITICAL' if under_replicated else 'OK'
            report_lines.append(
                '- Service %s has %d out of %d expected instances in %s (%s: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location,
                    severity, ratio)
            )
            under_replication_per_location.append(under_replicated)

        output = ''.join(report_lines)

        if any(under_replication_per_location):
            # One under-replicated location is enough to make the check critical.
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * You can view the logs for the job with:\n"
                "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            ) % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)