def perform_maintenance_hosts(cluster):
  """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
                                      [--post_drain_script=path]
                                      [--grouping=function]
                                      [--override_percentage=percentage]
                                      [--override_duration=duration]
                                      [--override_reason=reason]
                                      [--unsafe_hosts_file=unsafe_hosts_filename]
                                      cluster

  Asks the scheduler to remove any running tasks from the machine and remove it
  from service temporarily, perform some action on them, then return the
  machines to service.
  """
  options = app.get_options()
  hostnames = parse_hostnames(options.filename, options.hosts)
  get_grouping_or_die(options.grouping)

  # SLA overrides are all-or-nothing: either every --override_* flag is given
  # or none of them is.
  override_values = (options.percentage, options.duration, options.reason)
  if any(override_values) and not all(override_values):
    die("All --override_* options are required when attempting to override default SLA values.")

  percentage = None
  if options.percentage:
    percentage = parse_sla_percentage(options.percentage)
  duration = None
  if options.duration:
    duration = parse_time(options.duration)

  if options.reason:
    # Overriding SLA defaults is audit-worthy; record it at WARNING level.
    log_admin_message(
        logging.WARNING,
        "Default SLA values (percentage: %s, duration: %s) are overridden for the following "
        "hosts: %s. New percentage: %s, duration: %s, override reason: %s" % (
            HostMaintenance.SLA_UPTIME_PERCENTAGE_LIMIT,
            HostMaintenance.SLA_UPTIME_DURATION_LIMIT,
            hostnames,
            percentage,
            duration,
            options.reason))

  maintenance = HostMaintenance(CLUSTERS[cluster], options.verbosity)
  maintenance.perform_maintenance(
      hostnames,
      grouping_function=options.grouping,
      callback=parse_script(options.post_drain_script),
      percentage=percentage,
      duration=duration,
      output_file=options.unsafe_hosts_filename)
def parse_jobs_file(filename):
  """Parses a job SLA override file.

  Each non-blank line must hold exactly three whitespace-separated tokens:
  a job path, an SLA percentage, and a duration. Invalid lines abort via die().
  Returns a dict mapping AuroraJobKey to its JobUpTimeLimit.
  """
  limits = {}
  with open(filename, "r") as source:
    for raw_line in source:
      if not raw_line.strip():
        continue  # skip blank lines
      fields = raw_line.split()
      if len(fields) != 3:
        die("Invalid line in %s:%s" % (filename, raw_line))
      path, sla_percentage, sla_duration = fields
      key = AuroraJobKey.from_path(path)
      limits[key] = JobUpTimeLimit(
          job=key,
          percentage=parse_sla_percentage(sla_percentage),
          duration_secs=parse_time(sla_duration).as_(Time.SECONDS),
      )
  return limits
def parse_jobs_file(filename):
  """Reads per-job SLA overrides from the given file.

  Expects three whitespace-separated tokens per non-blank line
  (job path, SLA percentage, duration); dies on any malformed line.
  Returns {AuroraJobKey: JobUpTimeLimit}.
  """
  def _parse_line(line):
    # Converts one override line into an (AuroraJobKey, JobUpTimeLimit) pair.
    tokens = line.split()
    if len(tokens) != 3:
      die('Invalid line in %s:%s' % (filename, line))
    job_key = AuroraJobKey.from_path(tokens[0])
    return job_key, JobUpTimeLimit(
        job=job_key,
        percentage=parse_sla_percentage(tokens[1]),
        duration_secs=parse_time(tokens[2]).as_(Time.SECONDS))

  with open(filename, 'r') as overrides:
    return dict(_parse_line(line) for line in overrides if line.strip())
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  options = app.get_options()
  # Validation order matches argument order: percentage, duration, hosts, grouping.
  sla_threshold = parse_sla_percentage(percentage)
  sla_window = parse_time(duration)
  probed_hosts = parse_hostnames(options.filename, options.hosts)
  get_grouping_or_die(options.grouping)

  api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  vector = api.sla_get_safe_domain_vector(options.min_instance_count, probed_hosts)
  probe_results = vector.probe_hosts(
      sla_threshold, sla_window.as_(Time.SECONDS), options.grouping)

  formatted, _ = format_sla_results(probe_results)
  print_results(formatted)
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  options = app.get_options()
  required_percentage = parse_sla_percentage(percentage)
  required_duration = parse_time(duration)
  target_hosts = parse_hostnames(options.filename, options.hosts)
  get_grouping_or_die(options.grouping)

  domain_vector = AuroraClientAPI(
      CLUSTERS[cluster],
      options.verbosity).sla_get_safe_domain_vector(options.min_instance_count, target_hosts)
  host_groups = domain_vector.probe_hosts(
      required_percentage,
      required_duration.as_(Time.SECONDS),
      options.grouping)

  # Discard the second element (unsafe results) — only the formatted rows are printed.
  rows, _ = format_sla_results(host_groups)
  print_results(rows)
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on a all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """
  def parse_jobs_file(filename):
    # Reads per-job SLA overrides: one "job_path percentage duration" per line.
    limits = {}
    with open(filename, "r") as source:
      for raw_line in source:
        if not raw_line.strip():
          continue
        fields = raw_line.split()
        if len(fields) != 3:
          die("Invalid line in %s:%s" % (filename, raw_line))
        key = AuroraJobKey.from_path(fields[0])
        limits[key] = JobUpTimeLimit(
            job=key,
            percentage=parse_sla_percentage(fields[1]),
            duration_secs=parse_time(fields[2]).as_(Time.SECONDS),
        )
    return limits

  options = app.get_options()
  sla_percentage = parse_sla_percentage(percentage)
  sla_duration = parse_time(duration)

  exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
  override_jobs = {}
  if options.override_filename:
    override_jobs = parse_jobs_file(options.override_filename)
  get_grouping_or_die(options.grouping)

  api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  vector = api.sla_get_safe_domain_vector(options.min_instance_count, include_hosts)
  safe_groups = vector.get_safe_hosts(
      sla_percentage, sla_duration.as_(Time.SECONDS), override_jobs, options.grouping)

  results = []
  for group in safe_groups:
    for host in sorted(group):
      if exclude_hosts and host in exclude_hosts:
        continue  # explicitly excluded hosts are dropped from the report
      if options.list_jobs:
        job_rows = [
            "%s\t%s\t%.2f\t%d" % (host, limit.job.to_path(), limit.percentage, limit.duration_secs)
            for limit in sorted(group[host])
        ]
        results.append("\n".join(job_rows))
      else:
        results.append("%s" % host)

  print_results(results)
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on a all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """
  def parse_jobs_file(filename):
    # One override per line: "job_path percentage duration"; dies on bad lines.
    def to_entry(line):
      tokens = line.split()
      if len(tokens) != 3:
        die('Invalid line in %s:%s' % (filename, line))
      job_key = AuroraJobKey.from_path(tokens[0])
      return job_key, JobUpTimeLimit(
          job=job_key,
          percentage=parse_sla_percentage(tokens[1]),
          duration_secs=parse_time(tokens[2]).as_(Time.SECONDS))

    with open(filename, 'r') as overrides:
      return dict(to_entry(line) for line in overrides if line.strip())

  options = app.get_options()
  sla_percentage = parse_sla_percentage(percentage)
  sla_duration = parse_time(duration)
  exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
  override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}
  get_grouping_or_die(options.grouping)

  vector = AuroraClientAPI(
      CLUSTERS[cluster],
      options.verbosity).sla_get_safe_domain_vector(options.min_instance_count, include_hosts)
  groups = vector.get_safe_hosts(
      sla_percentage,
      sla_duration.as_(Time.SECONDS),
      override_jobs,
      options.grouping)

  results = []
  for group in groups:
    for host in sorted(group.keys()):
      if exclude_hosts and host in exclude_hosts:
        continue
      if not options.list_jobs:
        results.append('%s' % host)
        continue
      # One tab-separated row per job limit on this host, joined by newlines.
      results.append('\n'.join(
          '%s\t%s\t%.2f\t%d' % (host, d.job.to_path(), d.percentage, d.duration_secs)
          for d in sorted(group[host])))
  print_results(results)