예제 #1
0
    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :return: an empty set when the probe yields no groups; every given host when the probe
                 yields more than one group; otherwise the host set reported by
                 format_sla_results(..., unsafe_only=True)
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(percentage, duration.as_(Time.SECONDS), grouping_function)

        # Nothing came back from the probe, so there is nothing to flag.
        if not host_groups:
            return set()

        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if len(host_groups) > 1:
            # Wrap the argument in a 1-tuple so that a tuple of hostnames is rendered as a single
            # value instead of being unpacked as multiple format arguments (TypeError).
            log.error(
                "Illegal multiple groups detected in SLA results. Skipping hosts: %s" % (hostnames,)
            )
            return set(hostnames)

        results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
        # Only print when there is something to report; the host set is returned either way.
        if results:
            print_results(results)
        return unsafe_hostnames
예제 #2
0
    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :return: an empty set when the probe yields no groups; every given host when the probe
                 yields more than one group; otherwise the host set reported by
                 format_sla_results(..., unsafe_only=True)
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(
            self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(percentage,
                                         duration.as_(Time.SECONDS),
                                         grouping_function)

        # An empty probe result means no host needs to be flagged.
        if not host_groups:
            return set()

        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if len(host_groups) > 1:
            # The explicit 1-tuple keeps the formatting correct even when hostnames is itself
            # a tuple (a bare tuple operand would be unpacked as several format arguments).
            log.error(
                'Illegal multiple groups detected in SLA results. Skipping hosts: %s'
                % (hostnames,))
            return set(hostnames)

        results, unsafe_hostnames = format_sla_results(host_groups,
                                                       unsafe_only=True)
        # Printing is conditional; the returned host set is the same in both cases.
        if results:
            print_results(results)
        return unsafe_hostnames
예제 #3
0
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
    opts = app.get_options()

    # Turn the raw positional arguments and host-selection options into usable values.
    min_uptime_pct = parse_sla_percentage(percentage)
    uptime_window = parse_time(duration)
    target_hosts = parse_hostnames(opts.filename, opts.hosts)
    # The return value is not used here; presumably this aborts on an unknown grouping name.
    get_grouping_or_die(opts.grouping)

    # Build the API client for the requested cluster, then fetch the domain vector for the hosts.
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    domain_vector = api.sla_get_safe_domain_vector(opts.min_instance_count, target_hosts)
    probed_groups = domain_vector.probe_hosts(
        min_uptime_pct, uptime_window.as_(Time.SECONDS), opts.grouping
    )

    # Only the formatted report is needed; the second element of the result is ignored.
    report, _ = format_sla_results(probed_groups)
    print_results(report)
예제 #4
0
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  opts = app.get_options()

  # Convert the command-line inputs before talking to the cluster.
  threshold_pct = parse_sla_percentage(percentage)
  window = parse_time(duration)
  host_list = parse_hostnames(opts.filename, opts.hosts)
  # Result is discarded; presumably the call exits when the grouping name is not recognized.
  get_grouping_or_die(opts.grouping)

  # Client for the selected cluster, then the domain vector restricted to the chosen hosts.
  client = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
  safe_domain = client.sla_get_safe_domain_vector(opts.min_instance_count, host_list)
  probe_result = safe_domain.probe_hosts(threshold_pct, window.as_(Time.SECONDS), opts.grouping)

  # format_sla_results returns a pair; only the first element is printed.
  formatted, _ = format_sla_results(probe_result)
  print_results(formatted)