def end_maintenance_hosts(cluster):
    """usage: end_maintenance_hosts {--filename=filename | --hosts=hosts} cluster

    Calls HostMaintenance.end_maintenance for the hosts selected through
    --filename or --hosts on the named cluster.
    """
    opts = app.get_options()
    # Build the maintenance client before parsing hosts, so a bad cluster
    # name surfaces before any host-list parsing happens.
    finish_maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity).end_maintenance
    finish_maintenance(parse_hosts(opts.filename, opts.hosts))
def perform_maintenance_hosts(cluster):
    """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
                                     [--groups_per_batch=num]
                                     [--post_drain_script=path]
                                     [--grouping=function]
                                     cluster

    Asks the scheduler to remove any running tasks from the machines and take
    them out of service temporarily, performs some action on them, then
    returns the machines to service.

    When --post_drain_script is given, the script is launched with the host
    name as its only argument and passed to perform_maintenance as the
    callback.
    """
    opts = app.get_options()
    hosts = parse_hosts(opts.filename, opts.hosts)

    script = opts.post_drain_script
    if not script:
        on_drained = None
    else:
        if not os.path.exists(script):
            die("No such file: %s" % script)
        script_path = os.path.abspath(script)

        def on_drained(host):
            # The script is started but not waited on; the Popen handle is
            # returned to the caller of the callback.
            return subprocess.Popen([script_path, host])

    maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
    maintenance.perform_maintenance(
        hosts,
        groups_per_batch=int(opts.groups_per_batch),
        callback=on_drained,
        grouping_function=opts.grouping,
    )
def host_maintenance_status(cluster):
    """usage: host_maintenance_status {--filename=filename | --hosts=hosts} cluster

    Check on the scheduler's maintenance status for a list of hosts in the
    cluster. One log line is emitted per entry returned by check_status.
    """
    opts = app.get_options()
    hosts = parse_hosts(opts.filename, opts.hosts)
    checker = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
    for host_and_state in checker.check_status(hosts):
        # Each entry is interpolated directly into the two-slot message.
        log.info("%s is in state: %s" % host_and_state)
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts [--filename=filename] [--hosts=hosts]
                           cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected
    SLAs if the host goes down. In addition, if a job's projected SLA does not
    clear the specified limits, suggests the approximate time when that job
    reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is
                    shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach
                        requested SLA threshold.
    """

    def format_row(host, detail):
        # One tab-separated line per (host, job) pair.
        job_path = detail.job.to_path()
        predicted = detail.predicted_percentage
        is_safe = detail.safe
        wait_secs = detail.safe_in_secs
        if wait_secs is None:
            wait_secs = "n/a"
        return "%s\t%s\t%.2f\t%s\t%s" % (host, job_path, predicted, is_safe, wait_secs)

    def format_host(host, details):
        # All rows for a single host, joined into one newline-separated entry.
        return "\n".join([format_row(host, detail) for detail in sorted(details)])

    opts = app.get_options()
    min_percentage = parse_sla_percentage(percentage)
    window = parse_time(duration)
    hosts = parse_hosts(opts.filename, opts.hosts)

    client = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    domain_vector = client.sla_get_safe_domain_vector(hosts)
    probed = domain_vector.probe_hosts(min_percentage, window.as_(Time.SECONDS), hosts)

    print_results([format_host(host, details) for host, details in sorted(probed.items())])