Exemple #1
0
def synapse_replication_is_low(service, instance, system_paasta_config, local_backends):
    crit_threshold = 80
    reg_svc, reg_namespace, _, __ = utils.decompose_job_id(
        read_registration_for_service_instance(
            service=service, instance=instance
        )
    )
    # We only actually care about the replication of where we're registering
    service, namespace = reg_svc, reg_namespace

    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    expected_count = get_expected_instance_count_for_namespace(service=service, namespace=namespace)
    expected_count_per_location = int(expected_count / len(smartstack_replication_info))

    synapse_name = utils.compose_job_id(service, namespace)
    local_replication = get_replication_for_services(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(),
        services=[synapse_name],
    )
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log.info('Service %s.%s has %d out of %d expected instances' % (
        service, instance, num_available, expected_count_per_location))
    return under_replicated
Exemple #2
0
def synapse_replication_is_low(service, instance, system_paasta_config,
                               local_backends):
    crit_threshold = 80
    reg_svc, reg_namespace, _, __ = utils.decompose_job_id(
        read_registration_for_service_instance(service=service,
                                               instance=instance))
    # We only actually care about the replication of where we're registering
    service, namespace = reg_svc, reg_namespace

    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    expected_count = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace)
    expected_count_per_location = int(expected_count /
                                      len(smartstack_replication_info))

    synapse_name = utils.compose_job_id(service, namespace)
    local_replication = get_replication_for_services(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=system_paasta_config.
        get_synapse_haproxy_url_format(),
        services=[synapse_name],
    )
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log.info('Service %s.%s has %d out of %d expected instances' %
             (service, instance, num_available, expected_count_per_location))
    return under_replicated
def status_smartstack_backends(service, instance, job_config, cluster, tasks, expected_count, soa_dir, verbose,
                               synapse_port, synapse_haproxy_url_format):
    """Returns detailed information about smartstack backends for a service
    and instance.
    return: A newline separated string of the smarststack backend status
    """
    output = []
    service_instance = marathon_tools.read_registration_for_service_instance(
        service, instance, cluster
    )

    service_namespace_config = marathon_tools.load_service_namespace_config(
        service=service, namespace=instance, soa_dir=soa_dir)
    discover_location_type = service_namespace_config.get_discover()
    monitoring_blacklist = job_config.get_monitoring_blacklist()

    filtered_slaves = get_all_slaves_for_blacklist_whitelist(
        blacklist=monitoring_blacklist,
        whitelist=[]
    )

    grouped_slaves = get_mesos_slaves_grouped_by_attribute(
        slaves=filtered_slaves,
        attribute=discover_location_type,
    )

    # rebuild the dict, replacing the slave object
    # with just their hostname
    grouped_slave_hostname = {
        attribute_value: [slave['hostname'] for slave in slaves]
        for attribute_value, slaves in grouped_slaves.items()
    }

    if len(grouped_slave_hostname) == 0:
        output.append("Smartstack: ERROR - %s is NOT in smartstack at all!" % service_instance)
    else:
        output.append("Smartstack:")
        if verbose:
            output.append("  Haproxy Service Name: %s" % service_instance)
            output.append("  Backends:")

        output.extend(pretty_print_smartstack_backends_for_locations(
            service_instance=service_instance,
            tasks=tasks,
            locations=grouped_slave_hostname,
            expected_count=expected_count,
            verbose=verbose,
            synapse_port=synapse_port,
            synapse_haproxy_url_format=synapse_haproxy_url_format,
        ))
    return "\n".join(output)
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    smartstack_replication_checker,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param service: A string like example_service
    :param instance: A PaaSTA instance, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param smartstack_replication_checker: an instance of SmartstackReplicationChecker
    """
    full_name = compose_job_id(service, instance)

    primary_registration = marathon_tools.read_registration_for_service_instance(
        service,
        instance,
        soa_dir=soa_dir,
        cluster=cluster,
    )

    if primary_registration != full_name:
        log.debug(
            '%s is announced under: %s. '
            'Not checking replication for it' %
            (full_name, primary_registration), )
        return

    job_config = marathon_tools.load_marathon_service_config(
        service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()

    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = smartstack_replication_checker.get_replication_for_instance(
        job_config)

    log.debug('Got smartstack replication info for %s: %s' %
              (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = (
            'Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
            'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count /
                                          len(smartstack_replication_info))
        output = ''
        output_critical = ''
        output_ok = ''
        under_replication_per_location = []

        for location, available_backends in sorted(
                smartstack_replication_info.items()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location,
                expected_count_per_location,
                crit_threshold,
            )
            if under_replicated:
                output_critical += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name,
                    num_available_in_location,
                    expected_count_per_location,
                    location,
                    ratio,
                )
            else:
                output_ok += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name,
                    num_available_in_location,
                    expected_count_per_location,
                    location,
                    ratio,
                )
            under_replication_per_location.append(under_replicated)

        output += output_critical
        if output_critical and output_ok:
            output += '\n\n'
            output += 'The following locations are OK:\n'
        output += output_ok

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * You can view the logs for the job with:\n"
                "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n") % {
                    'service': service,
                    'instance': instance,
                    'cluster': cluster,
                }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service,
               namespace=instance,
               cluster=cluster,
               soa_dir=soa_dir,
               status=status,
               output=output)
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param service: A string like example_service
    :param instance: A PaaSTA instance, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    full_name = compose_job_id(service, instance)

    primary_registration = marathon_tools.read_registration_for_service_instance(
        service, instance, soa_dir=soa_dir
    )

    if primary_registration != full_name:
        log.debug(
            '%s is announced under: %s. '
            'Not checking replication for it' % (full_name, primary_registration)
        )
        return

    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=instance,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        for location, available_backends in sorted(smartstack_replication_info.iteritems()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * You can view the logs for the job with:\n"
                "      paasta logs -s %(service)s -i %(instance)s -c %(cluster)s\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            ) % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)