def end_maintenance_hosts(cluster):
    """usage: end_maintenance_hosts {--filename=filename | --hosts=hosts}
                                    cluster

    Ends maintenance for the hosts given via --filename or --hosts on the
    named cluster.
    """
    opts = app.get_options()
    # Build the maintenance client before resolving hostnames, so failures
    # surface in the same order as a single chained call would produce.
    maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
    hostnames = parse_hostnames(opts.filename, opts.hosts)
    maintenance.end_maintenance(hostnames)
Example #2
0
def host_status(cluster):
    """usage: host_status {--filename=filename | --hosts=hosts}
                          cluster

    Logs the drain status of each supplied host, one log entry per status
    returned by the maintenance check.
    """
    opts = app.get_options()
    hostnames = parse_hostnames(opts.filename, opts.hosts)
    maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
    for entry in maintenance.check_status(hostnames):
        # Each entry supplies both values consumed by the message template.
        log.info("%s is in state: %s" % entry)
def host_maintenance_status(cluster):
    """usage: host_maintenance_status {--filename=filename | --hosts=hosts}
                                      cluster

    Checks the scheduler's maintenance status for a list of hosts in the
    cluster and writes every returned status to the log.
    """
    opts = app.get_options()
    targets = parse_hostnames(opts.filename, opts.hosts)
    checker = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
    for host_and_state in checker.check_status(targets):
        log.info("%s is in state: %s" % host_and_state)
def host_activate(cluster):
  """usage: host_activate {--filename=filename | --hosts=hosts}
                          cluster

  Takes hosts out of maintenance mode.

  The given hosts are no longer marked as drained, which lets normal
  scheduling resume on them.
  """
  opts = app.get_options()
  # The maintenance client is created first, then the host list is resolved,
  # matching the evaluation order of a single chained call.
  maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
  hostnames = parse_hostnames(opts.filename, opts.hosts)
  maintenance.end_maintenance(hostnames)
Example #5
0
def host_activate(cluster):
    """usage: host_activate {--filename=filename | --hosts=hosts}
                            cluster

    Removes maintenance mode from hosts.

    Once this runs, the listed hosts are no longer considered drained and
    normal scheduling may resume on them.
    """
    cli_options = app.get_options()
    client = HostMaintenance(CLUSTERS[cluster], cli_options.verbosity)
    hosts_to_activate = parse_hostnames(cli_options.filename, cli_options.hosts)
    client.end_maintenance(hosts_to_activate)
Example #6
0
def host_status(cluster):
    """usage: host_status {--filename=filename | --hosts=hosts}
                          cluster

    Prints the drain status of each supplied host by logging one message per
    status returned for the requested hosts.
    """
    cli_options = app.get_options()
    requested_hosts = parse_hostnames(cli_options.filename, cli_options.hosts)
    client = HostMaintenance(CLUSTERS[cluster], cli_options.verbosity)
    host_states = client.check_status(requested_hosts)
    for host_state in host_states:
        # host_state fills both placeholders of the template.
        message = "%s is in state: %s" % host_state
        log.info(message)
def perform_maintenance_hosts(cluster):
    """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
                                        [--post_drain_script=path]
                                        [--grouping=function]
                                        [--override_percentage=percentage]
                                        [--override_duration=duration]
                                        [--override_reason=reason]
                                        [--unsafe_hosts_file=unsafe_hosts_filename]
                                        cluster

    Asks the scheduler to remove any running tasks from the given machines and
    take them out of service temporarily, performs some action on them, then
    returns the machines to service.

    The --override_* options are all-or-nothing: supplying only some of them is
    rejected via die(). When a full override is given, an admin warning
    recording the default and the new SLA values is logged.
    """
    opts = app.get_options()
    hosts = parse_hostnames(opts.filename, opts.hosts)
    # Called for validation only; the return value is not used here.
    get_grouping_or_die(opts.grouping)

    # Either every override option is set or none of them is.
    override_flags = [bool(opts.percentage), bool(opts.duration), bool(opts.reason)]
    if any(override_flags) and not all(override_flags):
        die("All --override_* options are required when attempting to override default SLA values.")

    percentage = None
    if opts.percentage:
        percentage = parse_sla_percentage(opts.percentage)

    duration = None
    if opts.duration:
        duration = parse_time(opts.duration)

    if opts.reason:
        override_notice = (
            "Default SLA values (percentage: %s, duration: %s) are overridden for the following "
            "hosts: %s. New percentage: %s, duration: %s, override reason: %s"
        ) % (
            HostMaintenance.SLA_UPTIME_PERCENTAGE_LIMIT,
            HostMaintenance.SLA_UPTIME_DURATION_LIMIT,
            hosts,
            percentage,
            duration,
            opts.reason,
        )
        log_admin_message(logging.WARNING, override_notice)

    post_drain = parse_script(opts.post_drain_script)

    maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
    maintenance.perform_maintenance(
        hosts,
        grouping_function=opts.grouping,
        callback=post_drain,
        percentage=percentage,
        duration=duration,
        output_file=opts.unsafe_hosts_filename,
    )
def host_deactivate(cluster):
  """usage: host_deactivate {--filename=filename | --hosts=hosts}
                            cluster

  Places hosts into maintenance mode.

  The given hosts are marked for maintenance and are de-prioritized when
  scheduling.  They are not excluded outright, so tasks may still land on
  them if resources are very scarce.  A common approach is to mark a larger
  set of machines for drain and then work through it in batches, which helps
  keep drained tasks from landing on hosts that a later batch will drain
  shortly afterwards.
  """
  opts = app.get_options()
  # Client first, host list second: same order as the chained call form.
  maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
  hostnames = parse_hostnames(opts.filename, opts.hosts)
  maintenance.start_maintenance(hostnames)
Example #9
0
def host_deactivate(cluster):
    """usage: host_deactivate {--filename=filename | --hosts=hosts}
                              cluster

    Puts hosts into maintenance mode.

    Hosts in the list are marked for maintenance and de-prioritized from
    consideration for scheduling.  Note that they are not removed from
    consideration and may still receive tasks if resources are very scarce.
    Usually a larger set of machines is marked for drain and then handled in
    batches within that set, so that drained tasks do not land on hosts that
    will themselves be drained in a subsequent batch.
    """
    cli_options = app.get_options()
    client = HostMaintenance(CLUSTERS[cluster], cli_options.verbosity)
    hosts_to_deactivate = parse_hostnames(cli_options.filename, cli_options.hosts)
    client.start_maintenance(hosts_to_deactivate)
def host_drain(cluster):
  """usage: host_drain {--filename=filename | --hosts=hosts}
                       [--post_drain_script=path]
                       [--grouping=function]
                       [--override_percentage=percentage]
                       [--override_duration=duration]
                       [--override_reason=reason]
                       [--unsafe_hosts_file=unsafe_hosts_filename]
                       cluster

  Asks the scheduler to start maintenance on the provided hosts (see
  host_deactivate for more details) and drains any active tasks on them.

  The hosts are drained and marked as drained.  Tasks currently running on
  them are killed off, and no future tasks are scheduled on them while they
  remain drained.

  On completion the hosts stay in maintenance mode. Use host_activate to
  return them to service and allow tasks to be scheduled on them again.

  When a post-drain script is configured, it is invoked once for every
  hostname returned by the maintenance run.
  """
  opts = app.get_options()
  hosts = parse_hostnames(opts.filename, opts.hosts)
  # Called for validation only; the return value is not used here.
  get_grouping_or_die(opts.grouping)

  sla_percentage, sla_duration = parse_and_validate_sla_overrides(opts, hosts)
  run_post_drain = parse_script(opts.post_drain_script)

  maintenance = HostMaintenance(CLUSTERS[cluster], opts.verbosity)
  drained = maintenance.perform_maintenance(
      hosts,
      grouping_function=opts.grouping,
      percentage=sla_percentage,
      duration=sla_duration,
      output_file=opts.unsafe_hosts_filename)

  if not run_post_drain:
    return
  for drained_host in drained:
    run_post_drain(drained_host)
Example #11
0
def host_drain(cluster):
    """usage: host_drain {--filename=filename | --hosts=hosts}
                         [--post_drain_script=path]
                         [--grouping=function]
                         [--override_percentage=percentage]
                         [--override_duration=duration]
                         [--override_reason=reason]
                         [--unsafe_hosts_file=unsafe_hosts_filename]
                         cluster

    Asks the scheduler to start maintenance on the list of provided hosts
    (see host_deactivate for more details) and drains any active tasks on
    them.

    The listed hosts are drained and left marked as drained.  This kills off
    any tasks currently running on them and prevents future tasks from being
    scheduled there while they are drained.

    The hosts remain in maintenance mode afterwards. Use host_activate to put
    them back in service and allow scheduling on them.

    If a post-drain script was supplied, it is called with each hostname
    returned by the maintenance run.
    """
    cli_options = app.get_options()
    requested_hosts = parse_hostnames(cli_options.filename, cli_options.hosts)
    # Validation only: the result of the grouping lookup is discarded.
    get_grouping_or_die(cli_options.grouping)

    overrides = parse_and_validate_sla_overrides(cli_options, requested_hosts)
    override_percentage, override_duration = overrides

    on_drained = parse_script(cli_options.post_drain_script)

    client = HostMaintenance(CLUSTERS[cluster], cli_options.verbosity)
    drained_hostnames = client.perform_maintenance(
        requested_hosts,
        grouping_function=cli_options.grouping,
        percentage=override_percentage,
        duration=override_duration,
        output_file=cli_options.unsafe_hosts_filename,
    )

    if not on_drained:
        return
    for name in drained_hostnames:
        on_drained(name)
Example #12
0
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
              [--filename=FILENAME]
              [--grouping=GROUPING]
              [--hosts=HOSTS]
              [--min_job_instance_count=COUNT]
              cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    For a given host, outputs every affected job together with its projected
    SLA if the host goes down. In addition, when a job's projected SLA does
    not clear the specified limits, suggests the approximate time at which
    that job reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    opts = app.get_options()

    target_percentage = parse_sla_percentage(percentage)
    target_duration = parse_time(duration)
    hostnames = parse_hostnames(opts.filename, opts.hosts)
    # Called for validation only; the return value is not used here.
    get_grouping_or_die(opts.grouping)

    client = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    domain_vector = client.sla_get_safe_domain_vector(opts.min_instance_count, hostnames)

    # The probe takes the duration converted to seconds.
    duration_secs = target_duration.as_(Time.SECONDS)
    probed_groups = domain_vector.probe_hosts(target_percentage, duration_secs, opts.grouping)

    # Only the first element of the formatted pair is printed.
    report, _ = format_sla_results(probed_groups)
    print_results(report)
Example #13
0
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Given a host, lists all affected jobs with the SLA each is projected to
  have if that host goes down. If a job's projected SLA does not clear the
  specified limits, also suggests the approximate time when the job reaches
  its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  cli_options = app.get_options()

  required_percentage = parse_sla_percentage(percentage)
  required_duration = parse_time(duration)
  probe_targets = parse_hostnames(cli_options.filename, cli_options.hosts)
  # Validation only: the result of the grouping lookup is discarded.
  get_grouping_or_die(cli_options.grouping)

  api = AuroraClientAPI(CLUSTERS[cluster], cli_options.verbosity)
  safe_domain = api.sla_get_safe_domain_vector(cli_options.min_instance_count, probe_targets)

  seconds = required_duration.as_(Time.SECONDS)
  results_by_group = safe_domain.probe_hosts(required_percentage, seconds, cli_options.grouping)

  # Only the first element of the formatted pair is printed.
  formatted, _ = format_sla_results(results_by_group)
  print_results(formatted)