def _check_sla(self, hostnames, grouping_function, percentage, duration):
  """Check if the provided list of hosts passes the job uptime SLA check.

  This is an all-or-nothing check, meaning that all provided hosts must pass
  their job SLA check for the maintenance to proceed.

  :param hostnames: list of host names to check SLA for
  :type hostnames: list of strings
  :param grouping_function: grouping function to apply to the given hosts
  :type grouping_function: function
  :param percentage: SLA uptime percentage override
  :type percentage: float
  :param duration: SLA uptime duration override
  :type duration: twitter.common.quantity.Amount
  :rtype: set of unsafe hosts
  """
  domain_vector = self._client.sla_get_safe_domain_vector(
      self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
  probed_groups = domain_vector.probe_hosts(
      percentage, duration.as_(Time.SECONDS), grouping_function)

  # No probe results at all: nothing is unsafe.
  if not probed_groups:
    return set()

  # Maintenance is performed one group at a time, so a probe result spanning
  # more than one group is a batch-level failure: skip every host.
  if len(probed_groups) > 1:
    log.error("Illegal multiple groups detected in SLA results. Skipping hosts: %s" % hostnames)
    return set(hostnames)

  formatted, unsafe = format_sla_results(probed_groups, unsafe_only=True)
  if formatted:
    print_results(formatted)
  return unsafe
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).get_locks()
  check_and_log_response(resp)

  printer = pprint.PrettyPrinter(indent=2)

  def render_lock(lock):
    # Dump every attribute of the lock struct for human inspection.
    return printer.pformat(vars(lock))

  rendered = [render_lock(lock) for lock in resp.result.getLocksResult.locks]
  print_results([',\n'.join(rendered)])
def _check_sla(self, hostnames, grouping_function, percentage, duration):
  """Check if the provided list of hosts passes the job uptime SLA check.

  This is an all-or-nothing check, meaning that all provided hosts must pass
  their job SLA check for the maintenance to proceed.

  :param hostnames: list of host names to check SLA for
  :type hostnames: list of strings
  :param grouping_function: grouping function to apply to the given hosts
  :type grouping_function: function
  :param percentage: SLA uptime percentage override
  :type percentage: float
  :param duration: SLA uptime duration override
  :type duration: twitter.common.quantity.Amount
  :rtype: set of unsafe hosts
  """
  safe_vector = self._client.sla_get_safe_domain_vector(
      self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
  groups = safe_vector.probe_hosts(percentage, duration.as_(Time.SECONDS), grouping_function)

  if not groups:
    # Nothing probed means nothing to report as unsafe.
    return set()

  if len(groups) > 1:
    # One group at a time is the maintenance invariant; anything longer than
    # one group must be treated as a batch failure.
    log.error(
        'Illegal multiple groups detected in SLA results. Skipping hosts: %s' % hostnames)
    return set(hostnames)

  rows, unsafe_hosts = format_sla_results(groups, unsafe_only=True)
  if rows:
    print_results(rows)
  return unsafe_hosts
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA. Specifically, given a
  host, outputs all affected jobs with their projected SLAs if the host goes
  down. In addition, if a job's projected SLA does not clear the specified
  limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is
                  shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach
                      requested SLA threshold.
  """
  options = app.get_options()
  pct = parse_sla_percentage(percentage)
  window = parse_time(duration)
  probe_hosts_list = parse_hostnames(options.filename, options.hosts)
  # Validates the grouping name up front; dies on an unknown grouping.
  get_grouping_or_die(options.grouping)

  api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  vector = api.sla_get_safe_domain_vector(options.min_instance_count, probe_hosts_list)
  probed = vector.probe_hosts(pct, window.as_(Time.SECONDS), options.grouping)

  rows, _ = format_sla_results(probed)
  print_results(rows)
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA. Specifically, given a
  host, outputs all affected jobs with their projected SLAs if the host goes
  down. In addition, if a job's projected SLA does not clear the specified
  limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is
                  shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach
                      requested SLA threshold.
  """
  options = app.get_options()
  uptime_pct = parse_sla_percentage(percentage)
  uptime_window = parse_time(duration)
  target_hosts = parse_hostnames(options.filename, options.hosts)
  # Fail fast on an unknown grouping name before touching the scheduler.
  get_grouping_or_die(options.grouping)

  client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  domain_vector = client.sla_get_safe_domain_vector(options.min_instance_count, target_hosts)
  host_groups = domain_vector.probe_hosts(
      uptime_pct, uptime_window.as_(Time.SECONDS), options.grouping)

  formatted, _ = format_sla_results(host_groups)
  print_results(formatted)
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill tasks
  without violating their job SLA. The SLA is defined as a pair of percentage
  and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
  default (by_host) the results will be processed and filtered based on the
  grouping function on a all-or-nothing basis. In other words, the group is
  'safe' IFF it is safe to kill tasks on all hosts in the group at the same
  time.
  """
  def parse_jobs_file(filename):
    # Each non-blank line holds: <job_key_path> <percentage> <duration>.
    limits = {}
    with open(filename, "r") as override_file:
      for line in override_file:
        if not line.strip():
          continue
        fields = line.split()
        if len(fields) != 3:
          die("Invalid line in %s:%s" % (filename, line))
        job_key = AuroraJobKey.from_path(fields[0])
        limits[job_key] = JobUpTimeLimit(
            job=job_key,
            percentage=parse_sla_percentage(fields[1]),
            duration_secs=parse_time(fields[2]).as_(Time.SECONDS))
    return limits

  options = app.get_options()
  sla_pct = parse_sla_percentage(percentage)
  sla_window = parse_time(duration)
  excluded = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  included = parse_hostnames_optional(options.include_hosts, options.include_filename)
  overrides = parse_jobs_file(options.override_filename) if options.override_filename else {}
  # Validates the grouping name; dies on an unknown grouping.
  get_grouping_or_die(options.grouping)

  vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(
      options.min_instance_count, included)
  safe_groups = vector.get_safe_hosts(
      sla_pct, sla_window.as_(Time.SECONDS), overrides, options.grouping)

  output = []
  for group in safe_groups:
    for host in sorted(group.keys()):
      if excluded and host in excluded:
        continue
      if options.list_jobs:
        job_lines = ["%s\t%s\t%.2f\t%d" % (host, d.job.to_path(), d.percentage, d.duration_secs)
                     for d in sorted(group[host])]
        output.append("\n".join(job_lines))
      else:
        output.append("%s" % host)
  print_results(output)
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill tasks
  without violating their job SLA. The SLA is defined as a pair of percentage
  and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
  default (by_host) the results will be processed and filtered based on the
  grouping function on a all-or-nothing basis. In other words, the group is
  'safe' IFF it is safe to kill tasks on all hosts in the group at the same
  time.
  """
  def parse_jobs_file(filename):
    # Expected per-line format: <job_key_path> <percentage> <duration>.
    parsed = {}
    with open(filename, 'r') as fp:
      for raw_line in fp:
        if not raw_line.strip():
          continue
        parts = raw_line.split()
        if len(parts) != 3:
          die('Invalid line in %s:%s' % (filename, raw_line))
        key = AuroraJobKey.from_path(parts[0])
        parsed[key] = JobUpTimeLimit(
            job=key,
            percentage=parse_sla_percentage(parts[1]),
            duration_secs=parse_time(parts[2]).as_(Time.SECONDS))
    return parsed

  options = app.get_options()
  required_pct = parse_sla_percentage(percentage)
  required_window = parse_time(duration)
  exclude_set = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  include_set = parse_hostnames_optional(options.include_hosts, options.include_filename)
  job_overrides = parse_jobs_file(options.override_filename) if options.override_filename else {}
  # Fail fast on an unknown grouping name before querying the scheduler.
  get_grouping_or_die(options.grouping)

  client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  domain_vector = client.sla_get_safe_domain_vector(options.min_instance_count, include_set)
  host_groups = domain_vector.get_safe_hosts(
      required_pct, required_window.as_(Time.SECONDS), job_overrides, options.grouping)

  lines = []
  for host_group in host_groups:
    for hostname in sorted(host_group.keys()):
      if exclude_set and hostname in exclude_set:
        continue
      if options.list_jobs:
        per_job = ['%s\t%s\t%.2f\t%d' % (hostname, d.job.to_path(), d.percentage, d.duration_secs)
                   for d in sorted(host_group[hostname])]
        lines.append('\n'.join(per_job))
      else:
        lines.append('%s' % hostname)
  print_results(lines)