def end_maintenance_hosts(cluster):
    """usage: end_maintenance_hosts {--filename=filename | --hosts=hosts}
                                    cluster
    """
    options = app.get_options()
    hostnames = parse_hostnames(options.filename, options.hosts)
    # Take the supplied hosts out of maintenance mode on the scheduler.
    maintenance = HostMaintenance(CLUSTERS[cluster], options.verbosity)
    maintenance.end_maintenance(hostnames)
def host_status(cluster):
    """usage: host_status {--filename=filename | --hosts=hosts}
                          cluster

    Print the drain status of each supplied host.
    """
    options = app.get_options()
    hosts_to_check = parse_hostnames(options.filename, options.hosts)
    maintenance = HostMaintenance(CLUSTERS[cluster], options.verbosity)
    # check_status yields (hostname, state) pairs; log one line per host.
    for status_pair in maintenance.check_status(hosts_to_check):
        log.info("%s is in state: %s" % status_pair)
def host_maintenance_status(cluster):
    """usage: host_maintenance_status {--filename=filename | --hosts=hosts}
                                      cluster

    Check on the schedulers maintenance status for a list of hosts in the cluster.
    """
    options = app.get_options()
    queried_hosts = parse_hostnames(options.filename, options.hosts)
    # Ask the scheduler for the maintenance state of every requested host.
    results = HostMaintenance(CLUSTERS[cluster], options.verbosity).check_status(queried_hosts)
    for host_and_state in results:
        log.info("%s is in state: %s" % host_and_state)
def host_activate(cluster):
    """usage: host_activate {--filename=filename | --hosts=hosts}
                            cluster

    Removes maintenance mode from hosts.

    The list of hosts is marked as not in a drained state anymore. This will
    allow normal scheduling to resume on the given list of hosts.
    """
    options = app.get_options()
    hostnames = parse_hostnames(options.filename, options.hosts)
    # end_maintenance clears the drained state so scheduling can resume.
    HostMaintenance(CLUSTERS[cluster], options.verbosity).end_maintenance(hostnames)
def perform_maintenance_hosts(cluster):
    """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
                                        [--post_drain_script=path]
                                        [--grouping=function]
                                        [--override_percentage=percentage]
                                        [--override_duration=duration]
                                        [--override_reason=reason]
                                        [--unsafe_hosts_file=unsafe_hosts_filename]
                                        cluster

    Asks the scheduler to remove any running tasks from the machine and remove it
    from service temporarily, perform some action on them, then return the machines
    to service.
    """
    options = app.get_options()
    drainable_hosts = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)

    # Consistency fix: delegate the all-or-none --override_* validation, the
    # admin-override warning log, and percentage/duration parsing to the shared
    # helper that host_drain already uses, instead of duplicating that logic
    # inline here. Keeping a single copy prevents the two commands' SLA
    # semantics from drifting apart.
    percentage, duration = parse_and_validate_sla_overrides(options, drainable_hosts)

    # Optional script to run against each host after it is drained.
    drained_callback = parse_script(options.post_drain_script)

    HostMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
        drainable_hosts,
        grouping_function=options.grouping,
        callback=drained_callback,
        percentage=percentage,
        duration=duration,
        output_file=options.unsafe_hosts_filename)
def host_deactivate(cluster):
    """usage: host_deactivate {--filename=filename | --hosts=hosts}
                              cluster

    Puts hosts into maintenance mode.

    The list of hosts is marked for maintenance, and will be de-prioritized
    from consideration for scheduling.  Note, they are not removed from
    consideration, and may still schedule tasks if resources are very scarce.
    Usually you would mark a larger set of machines for drain, and then do them
    in batches within the larger set, to help drained tasks not land on future
    hosts that will be drained shortly in subsequent batches.
    """
    options = app.get_options()
    hostnames = parse_hostnames(options.filename, options.hosts)
    # start_maintenance only marks the hosts; it does not drain running tasks.
    maintenance = HostMaintenance(CLUSTERS[cluster], options.verbosity)
    maintenance.start_maintenance(hostnames)
def host_drain(cluster):
    """usage: host_drain {--filename=filename | --hosts=hosts}
                         [--post_drain_script=path]
                         [--grouping=function]
                         [--override_percentage=percentage]
                         [--override_duration=duration]
                         [--override_reason=reason]
                         [--unsafe_hosts_file=unsafe_hosts_filename]
                         cluster

    Asks the scheduler to start maintenance on the list of provided hosts (see
    host_deactivate for more details) and drains any active tasks on them.

    The list of hosts is drained and marked in a drained state.  This will kill
    off any tasks currently running on these hosts, as well as prevent future
    tasks from scheduling on these hosts while they are drained.

    The hosts are left in maintenance mode upon completion. Use host_activate to
    return hosts back to service and allow scheduling tasks on them.
    """
    options = app.get_options()
    hosts_to_drain = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)
    # Validates the --override_* option set (all-or-none) and parses the values.
    override_percentage, override_duration = parse_and_validate_sla_overrides(
        options, hosts_to_drain)
    callback = parse_script(options.post_drain_script)
    maintenance = HostMaintenance(CLUSTERS[cluster], options.verbosity)
    drained = maintenance.perform_maintenance(
        hosts_to_drain,
        grouping_function=options.grouping,
        percentage=override_percentage,
        duration=override_duration,
        output_file=options.unsafe_hosts_filename)
    # Run the optional post-drain script once per successfully drained host.
    if callback:
        for host in drained:
            callback(host)
# NOTE(review): this is a second definition of host_drain — if it lives in the
# same module as the one above, it shadows the earlier definition at import
# time. Likely these are two revisions pasted together; confirm and remove one.
def host_drain(cluster):
    """usage: host_drain {--filename=filename | --hosts=hosts}
                         [--post_drain_script=path]
                         [--grouping=function]
                         [--override_percentage=percentage]
                         [--override_duration=duration]
                         [--override_reason=reason]
                         [--unsafe_hosts_file=unsafe_hosts_filename]
                         cluster

    Asks the scheduler to start maintenance on the list of provided hosts (see
    host_deactivate for more details) and drains any active tasks on them.

    The list of hosts is drained and marked in a drained state.  This will kill
    off any tasks currently running on these hosts, as well as prevent future
    tasks from scheduling on these hosts while they are drained.

    The hosts are left in maintenance mode upon completion. Use host_activate to
    return hosts back to service and allow scheduling tasks on them.
    """
    options = app.get_options()
    drainable_hosts = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)
    # All --override_* flags must be supplied together; helper enforces this.
    override_percentage, override_duration = parse_and_validate_sla_overrides(
        options, drainable_hosts)
    post_drain_callback = parse_script(options.post_drain_script)
    drained_hostnames = HostMaintenance(
        CLUSTERS[cluster], options.verbosity).perform_maintenance(
            drainable_hosts,
            grouping_function=options.grouping,
            percentage=override_percentage,
            duration=override_duration,
            output_file=options.unsafe_hosts_filename)
    if post_drain_callback:
        for hostname in drained_hostnames:
            post_drain_callback(hostname)
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
              [--filename=FILENAME]
              [--grouping=GROUPING]
              [--hosts=HOSTS]
              [--min_job_instance_count=COUNT]
              cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs
    if the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits suggests the approximate time when that job reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    options = app.get_options()
    # Positional args carry the SLA limits; parse/validate them up front.
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    probe_hostnames = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    vector = api.sla_get_safe_domain_vector(options.min_instance_count, probe_hostnames)
    probed_groups = vector.probe_hosts(
        sla_percentage, sla_duration.as_(Time.SECONDS), options.grouping)
    output, _ = format_sla_results(probed_groups)
    print_results(output)
# NOTE(review): second definition of sla_probe_hosts — if both live in the same
# module, this one shadows the earlier definition. Looks like two revisions of
# the same command pasted together; confirm and keep only one.
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
              [--filename=FILENAME]
              [--grouping=GROUPING]
              [--hosts=HOSTS]
              [--min_job_instance_count=COUNT]
              cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs
    if the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits suggests the approximate time when that job reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    options = app.get_options()
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)
    # Build the safe-domain vector, then probe each host against the SLA limits.
    vector = AuroraClientAPI(
        CLUSTERS[cluster],
        options.verbosity).sla_get_safe_domain_vector(options.min_instance_count, hosts)
    groups = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), options.grouping)
    output, _ = format_sla_results(groups)
    print_results(output)