Code Example #1
File: test_api.py  Project: isabella232/client-3
 def mock_api(cls):
     api = AuroraClientAPI(Cluster(name="foo"), 'test-client')
     mock_proxy = create_autospec(spec=SchedulerProxyApiSpec,
                                  spec_set=True,
                                  instance=True)
     api._scheduler_proxy = mock_proxy
     return api, mock_proxy
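A minimal sketch of how a test might consume the mock pair above, assuming get_quota simply forwards to the proxy's getQuota thrift call; the test name and the canned response are hypothetical, not taken from the Aurora test suite.

 def test_get_quota_calls_proxy(cls):
     api, mock_proxy = cls.mock_api()
     # Hypothetical canned response; the real tests build proper thrift Response objects.
     mock_proxy.getQuota.return_value = 'canned-response'
     # The client API delegates to the mocked scheduler proxy, so no network traffic occurs.
     api.get_quota('some-role')
     assert mock_proxy.getQuota.call_count == 1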
Code Example #2
File: admin.py  Project: mkacik/incubator-aurora
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
    cpu = float(cpu_str)
    ram = parse_data(ram_str)
    disk = parse_data(disk_str)

    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")
    resp = client.get_quota(role)
    quota = resp.result.getQuotaResult.quota
    log.info(
        "Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, quota.numCpus, quota.ramMb, quota.diskMb)
    )

    new_cpu = float(cpu + quota.numCpus)
    new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
    new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))

    log.info(
        "Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, new_cpu, new_ram, new_disk)
    )

    resp = client.set_quota(role, new_cpu, new_ram, new_disk)
    check_and_log_response(resp)
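The RAM and disk arithmetic above relies on twitter.common.quantity unit conversion; a minimal sketch of the same computation, with illustrative literals standing in for the parsed command-line value and the scheduler-reported quota:

from twitter.common.quantity import Amount, Data

increment = Amount(1, Data.GB)             # stands in for parse_data(ram_str)
current = Amount(512, Data.MB)             # stands in for quota.ramMb reported by the scheduler
new_ram_mb = int((increment + current).as_(Data.MB))  # 1536, the value handed to set_quota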
Code Example #3
File: admin.py  Project: bhuvan/incubator-aurora
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str)
  disk = parse_data(disk_str)

  options = app.get_options()
  client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  log.info('Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, quota.numCpus, quota.ramMb, quota.diskMb))

  new_cpu = float(cpu + quota.numCpus)
  new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
  new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))

  log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
Code Example #4
 def __init__(self,
              cluster,
              role,
              env,
              jobs,
              ssh_user=None,
              log_fn=log.log):
     self._cluster = cluster
     self._api = AuroraClientAPI(cluster=cluster)
     self._role = role
     self._env = env
     self._jobs = jobs
     self._ssh_user = ssh_user if ssh_user else self._role
     self._log = log_fn
Code Example #5
File: admin.py  Project: retrack/incubator-aurora
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
            [--filename=filename]
            [--hosts=hosts]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits, suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
    options = app.get_options()

    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hosts(options.filename, options.hosts)

    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(hosts)
    probed_hosts = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), hosts)

    results = []
    for host, job_details in sorted(probed_hosts.items()):
        results.append(
            "\n".join(
                [
                    "%s\t%s\t%.2f\t%s\t%s"
                    % (
                        host,
                        d.job.to_path(),
                        d.predicted_percentage,
                        d.safe,
                        "n/a" if d.safe_in_secs is None else d.safe_in_secs,
                    )
                    for d in sorted(job_details)
                ]
            )
        )

    print_results(results)
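A hypothetical line of the printed output, following the tab-separated "%s\t%s\t%.2f\t%s\t%s" format built above; the host, job path, and timing values are made up:

host1.example.com	devcluster/www-data/prod/hello	93.33	False	1800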
Code Example #6
 def __init__(self,
              cluster,
              role,
              env,
              jobs,
              ssh_user=None,
              log_fn=log.log):
     self._cluster = cluster
     self._api = AuroraClientAPI(cluster=cluster,
                                 user_agent=AURORA_V2_USER_AGENT_NAME)
     self._role = role
     self._env = env
     self._jobs = jobs
     self._ssh_user = ssh_user if ssh_user else self._role
     self._log = log_fn
Code Example #7
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_backup_now(cluster):
  """usage: scheduler_backup_now cluster

  Immediately initiates a full storage backup.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).perform_backup())
Code Example #8
 def __init__(self, cluster, role, env, jobs, ssh_user=None):
     self._cluster = cluster
     self._api = AuroraClientAPI(cluster=cluster)
     self._role = role
     self._env = env
     self._jobs = jobs
     self._ssh_user = ssh_user if ssh_user else self._role
Code Example #9
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_snapshot(cluster):
  """usage: scheduler_snapshot cluster

  Request that the scheduler perform a storage snapshot and block until complete.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).snapshot())
Code Example #10
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_unload_recovery(cluster):
  """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
      .unload_recovery())
Code Example #11
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_stage_recovery(cluster, backup_id):
  """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
  options = app.get_options()
  check_and_log_response(
      AuroraClientAPI(CLUSTERS[cluster], options.verbosity).stage_recovery(backup_id))
Code Example #12
File: test_context.py  Project: bmhatfield/aurora
def test_handles_api_auth_error():
    context = AuroraCommandContext()

    mock_scheduler_proxy = mock.create_autospec(spec=SchedulerProxyApiSpec, instance=True)
    mock_scheduler_proxy.killTasks.side_effect = SchedulerProxy.AuthError()

    mock_api = AuroraClientAPI(TEST_CLUSTER, "user-agent")
    mock_api._scheduler_proxy = mock_scheduler_proxy

    context.apis = {TEST_CLUSTER.name: mock_api}
    api = context.get_api(TEST_CLUSTER.name, clusters={TEST_CLUSTER.name: TEST_CLUSTER})

    with pytest.raises(Context.CommandError) as e:
        api.kill_job(AuroraJobKey(TEST_CLUSTER.name, "role", "env", "job"))

    assert e.value.code == EXIT_AUTH_ERROR
    assert mock_scheduler_proxy.killTasks.call_count == 1
Code Example #13
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_delete_recovery_tasks(cluster, task_ids):
  """usage: scheduler_delete_recovery_tasks cluster task_ids

  Deletes a comma-separated list of task IDs from a staged recovery.
  """
  ids = set(task_ids.split(','))
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
      .delete_recovery_tasks(TaskQuery(taskIds=ids)))
Code Example #14
def make_admin_client(cluster):
    if cluster not in CLUSTERS:
        die('Unknown cluster: %s. Known clusters: %s' %
            (cluster, ", ".join(CLUSTERS.keys())))

    verbose = getattr(app.get_options(), 'verbosity', 'normal') == 'verbose'
    return AuroraClientAPI(CLUSTERS[cluster],
                           AURORA_ADMIN_USER_AGENT_NAME,
                           verbose=verbose)
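A hypothetical call site for the helper above; the cluster name, role, and the follow-up call are illustrative:

client = make_admin_client('devcluster')
resp = client.get_quota('www-data')        # any AuroraClientAPI method can be used from here
check_and_log_response(resp)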
Code Example #15
File: command_runner.py  Project: rowoot/aurora
 def __init__(self, cluster, role, env, jobs, ssh_user=None, ssh_options=None, log_fn=log.log):
     self._cluster = cluster
     self._api = AuroraClientAPI(cluster=cluster, user_agent=AURORA_V2_USER_AGENT_NAME)
     self._role = role
     self._env = env
     self._jobs = jobs
     self._ssh_user = ssh_user if ssh_user else self._role
     self._ssh_options = ssh_options if ssh_options else []
     self._log = log_fn
Code Example #16
File: admin.py  Project: bhuvan/incubator-aurora
def get_scheduler(cluster):
  """usage: get_scheduler CLUSTER

  Dumps the leading scheduler endpoint URL.
  """
  options = app.get_options()
  print("Found leading scheduler at: %s" % AuroraClientAPI(
      CLUSTERS[cluster],
      options.verbosity).scheduler_proxy.scheduler_client().raw_url)
Code Example #17
def test_handles_api_auth_error():
    context = AuroraCommandContext()

    mock_scheduler_proxy = mock.create_autospec(spec=SchedulerProxyApiSpec,
                                                instance=True)
    mock_scheduler_proxy.killTasks.side_effect = SchedulerProxy.AuthError()

    mock_api = AuroraClientAPI(TEST_CLUSTER, 'user-agent')
    mock_api._scheduler_proxy = mock_scheduler_proxy

    context.apis = {TEST_CLUSTER.name: mock_api}
    api = context.get_api(TEST_CLUSTER.name,
                          clusters={TEST_CLUSTER.name: TEST_CLUSTER})

    with pytest.raises(Context.CommandError) as e:
        api.kill_job(AuroraJobKey(TEST_CLUSTER.name, 'role', 'env', 'job'))

    assert e.value.code == EXIT_AUTH_ERROR
    assert mock_scheduler_proxy.killTasks.call_count == 1
Code Example #18
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_list_backups(cluster):
  """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).list_backups()
  check_and_log_response(resp)
  backups = resp.result.listBackupsResult.backups
  print('%s available backups:' % len(backups))
  for backup in backups:
    print(backup)
Code Example #19
File: admin.py  Project: bhuvan/incubator-aurora
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).get_locks()
  check_and_log_response(resp)

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_lock(lock):
    return pp.pformat(vars(lock))

  print_results([',\n'.join(pretty_print_lock(t) for t in resp.result.getLocksResult.locks)])
Code Example #20
File: admin.py  Project: bhuvan/incubator-aurora
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits, suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  options = app.get_options()

  sla_percentage = parse_sla_percentage(percentage)
  sla_duration = parse_time(duration)
  hosts = parse_hostnames(options.filename, options.hosts)
  get_grouping_or_die(options.grouping)

  vector = AuroraClientAPI(
      CLUSTERS[cluster],
      options.verbosity).sla_get_safe_domain_vector(options.min_instance_count, hosts)
  groups = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), options.grouping)

  output, _ = format_sla_results(groups)
  print_results(output)
Code Example #21
File: admin.py  Project: mkacik/incubator-aurora
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits, suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
    options = app.get_options()

    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)

    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(
        options.min_instance_count, hosts
    )
    groups = vector.probe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), options.grouping)

    output, _ = format_sla_results(groups)
    print_results(output)
Code Example #22
File: admin.py  Project: bhuvan/incubator-aurora
def scheduler_print_recovery_tasks(cluster):
  """usage: scheduler_print_recovery_tasks cluster

  Prints all active tasks in a staged recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).query_recovery(
      TaskQuery(statuses=ACTIVE_STATES))
  check_and_log_response(resp)
  log.info('Role\tJob\tShard\tStatus\tTask ID')
  for task in resp.result.queryRecoveryResult.tasks:
    assigned = task.assignedTask
    conf = assigned.task
    log.info('\t'.join((conf.owner.role,
                        conf.jobName,
                        str(assigned.instanceId),
                        ScheduleStatus._VALUES_TO_NAMES[task.status],
                        assigned.taskId)))
Code Example #23
File: admin.py  Project: bhuvan/incubator-aurora
def set_quota(cluster, role, cpu_str, ram, disk):
  """usage: set_quota cluster role cpu ram[MGT] disk[MGT]

  Alters the amount of production quota allocated to a user.
  """
  try:
    ram_size = parse_data(ram).as_(Data.MB)
    disk_size = parse_data(disk).as_(Data.MB)
  except ValueError as e:
    die(str(e))

  try:
    cpu = float(cpu_str)
    ram_mb = int(ram_size)
    disk_mb = int(disk_size)
  except ValueError as e:
    die(str(e))

  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).set_quota(role, cpu, ram_mb, disk_mb)
  check_and_log_response(resp)
Code Example #24
def make_admin_client(cluster, verbose=False, bypass_leader_redirect=False):
    """Creates an API client with the specified options for use in admin commands.

  :param cluster: The cluster to connect with.
  :type cluster: Either a string cluster name or a Cluster object.
  :param verbose: Should the client emit verbose output.
  :type verbose: bool
  :param bypass_leader_redirect: Should the client bypass the scheduler's leader redirect filter.
  :type bypass_leader_redirect: bool
  :rtype: an AuroraClientAPI instance.
  """

    is_cluster_object = isinstance(cluster, Cluster)

    if not is_cluster_object and cluster not in CLUSTERS:
        die('Unknown cluster: %s. Known clusters: %s' %
            (cluster, ", ".join(CLUSTERS.keys())))

    return AuroraClientAPI(cluster if is_cluster_object else CLUSTERS[cluster],
                           AURORA_ADMIN_USER_AGENT_NAME,
                           verbose=verbose,
                           bypass_leader_redirect=bypass_leader_redirect)
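This richer variant also accepts a Cluster object directly; a minimal, hypothetical invocation (the cluster definition is illustrative and the keyword arguments mirror the parameters above):

client = make_admin_client(Cluster(name='devcluster'),
                           verbose=True,
                           bypass_leader_redirect=True)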
Code Example #25
 def __init__(self, cluster, verbosity, wait_event=None):
     self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
     self._wait_event = wait_event or Event()
Code Example #26
class HostMaintenance(object):
    """Submit requests to the scheduler to put hosts into and out of maintenance
  mode so they can be operated upon without causing LOST tasks.

  Aurora provides a two-tiered concept of Maintenance. The first step is to initiate maintenance,
  which will ask the Aurora scheduler to de-prioritize scheduling on a large set of hosts (the ones
  that will be operated upon during this maintenance window).  Once all hosts have been tagged in
  this manner, the operator can begin draining individual machines, which will have all user-tasks
  killed and rescheduled.  When the tasks get placed onto a new machine, the scheduler will first
  look for hosts that do not have the maintenance tag, which will help decrease churn and prevent a
  task from being constantly killed as its hosts go down from underneath it.
  """

    SLA_MIN_JOB_INSTANCE_COUNT = 20
    STATUS_POLL_INTERVAL = Amount(5, Time.SECONDS)
    MAX_STATUS_WAIT = Amount(5, Time.MINUTES)

    @classmethod
    def iter_batches(cls, hostnames, grouping_function=DEFAULT_GROUPING):
        groups = group_hosts(hostnames, grouping_function)
        groups = sorted(groups.items(), key=lambda v: v[0])
        for group in groups:
            yield Hosts(group[1])

    def __init__(self, cluster, verbosity, wait_event=None):
        self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
        self._wait_event = wait_event or Event()

    def _drain_hosts(self, drainable_hosts):
        """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names failed to drain
    """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        drainable_hostnames = [
            hostname for hostname in drainable_hosts.hostNames
        ]

        total_wait = self.STATUS_POLL_INTERVAL
        not_drained_hostnames = set(drainable_hostnames)
        while not self._wait_event.is_set() and not_drained_hostnames:
            log.info('Waiting for hosts to be in DRAINED: %s' %
                     not_drained_hostnames)
            self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

            statuses = self.check_status(list(not_drained_hostnames))
            not_drained_hostnames = set(h[0] for h in statuses
                                        if h[1] != 'DRAINED')

            total_wait += self.STATUS_POLL_INTERVAL
            if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
                log.warning(
                    'Failed to move all hosts into DRAINED within %s:\n%s' %
                    (self.MAX_STATUS_WAIT, '\n'.join("\tHost:%s\tStatus:%s" % h
                                                     for h in sorted(statuses)
                                                     if h[1] != 'DRAINED')))
                break

        return not_drained_hostnames

    def _complete_maintenance(self, drained_hosts):
        """End the maintenance status for a given set of hosts.

    :param drained_hosts: Hosts that are drained and finished being operated upon
    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
    """
        check_and_log_response(self._client.end_maintenance(drained_hosts))
        resp = self._client.maintenance_status(drained_hosts)
        for host_status in resp.result.maintenanceStatusResult.statuses:
            if host_status.mode != MaintenanceMode.NONE:
                log.warning('%s is DRAINING or in DRAINED' % host_status.host)

    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

    This is an all-or-nothing check, meaning that all provided hosts must pass their job
    SLA check for the maintenance to proceed.

    :param hostnames: list of host names to check SLA for
    :type hostnames: list of strings
    :param grouping_function: grouping function to apply to the given hosts
    :type grouping_function: function
    :param percentage: SLA uptime percentage override
    :type percentage: float
    :param duration: SLA uptime duration override
    :type duration: twitter.common.quantity.Amount
    :rtype: set of unsafe hosts
    """
        vector = self._client.sla_get_safe_domain_vector(
            self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(percentage,
                                         duration.as_(Time.SECONDS),
                                         grouping_function)

        unsafe_hostnames = set()
        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if host_groups:
            if len(host_groups) > 1:
                log.error(
                    'Illegal multiple groups detected in SLA results. Skipping hosts: %s'
                    % hostnames)
                return set(hostnames)

            results, unsafe_hostnames = format_sla_results(host_groups,
                                                           unsafe_only=True)
            if results:
                print_results(results)
                return unsafe_hostnames

        return unsafe_hostnames

    def end_maintenance(self, hostnames):
        """Pull a list of hostnames out of maintenance mode.

    :param hostnames: List of hosts to operate upon
    :type hostnames: list of strings
    """
        self._complete_maintenance(Hosts(set(hostnames)))

    def start_maintenance(self, hostnames):
        """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.

    This is part of two-phase draining- tasks will still be running on these hosts until
    drain_hosts is called upon them.

    :param hostnames: List of hosts to set for initial maintenance
    :type hostnames: list of strings
    :rtype: list of hostnames with the maintenance mode set
    """
        resp = self._client.start_maintenance(Hosts(set(hostnames)))
        check_and_log_response(resp)
        result = [
            host_status.host
            for host_status in resp.result.startMaintenanceResult.statuses
        ]
        if len(result) != len(hostnames):
            log.warning('Skipping maintenance for unknown hosts: %s' %
                        (set(hostnames) - set(result)))

        return result

    def _operate_on_hosts(self, drained_hosts, callback):
        """Perform a given operation on a list of hosts that are ready for maintenance.

    :param drained_hosts: Hosts that have been drained (via _drain_hosts)
    :type drained_hosts: list of strings
    :param callback: Function to call one hostname at a time
    :type callback: function
    """
        for hostname in drained_hosts:
            callback(hostname)

    def perform_maintenance(self,
                            hostnames,
                            grouping_function=DEFAULT_GROUPING,
                            percentage=None,
                            duration=None,
                            output_file=None,
                            callback=None):
        """Put hosts into maintenance mode and drain them.

    Walk through the process of putting hosts into maintenance and draining them of tasks. The hosts
    will remain in maintenance mode upon completion.


    :param hostnames: A list of hostnames to operate upon
    :type hostnames: list of strings
    :param grouping_function: How to split up the hostname into groups
    :type grouping_function: function
    :param percentage: SLA percentage to use
    :type percentage: float
    :param duration: SLA duration to use
    :type duration: twitter.common.quantity.Time
    :param output_file: file to write hosts that were not drained due to failed SLA check
    :type output_file: string
    :param callback: Function to call once hosts are drained
    :type callback: function
    :rtype: set of host names that were successfully drained
    """
        hostnames = self.start_maintenance(hostnames)
        not_drained_hostnames = set()

        for hosts in self.iter_batches(hostnames, grouping_function):
            log.info('Beginning SLA check for %s' % hosts.hostNames)
            unsafe_hostnames = self._check_sla(list(hosts.hostNames),
                                               grouping_function, percentage,
                                               duration)

            if unsafe_hostnames:
                log.warning(
                    'Some hosts did not pass SLA check and will not be drained! '
                    'Skipping hosts: %s' % unsafe_hostnames)
                not_drained_hostnames |= unsafe_hostnames
                drainable_hostnames = hosts.hostNames - unsafe_hostnames
                if not drainable_hostnames:
                    continue
                hosts = Hosts(drainable_hostnames)
            else:
                log.info('All hosts passed SLA check.')

            not_drained_hostnames |= self._drain_hosts(hosts)

            if callback:
                self._operate_on_hosts(hosts.hostNames - not_drained_hostnames,
                                       callback)

        if not_drained_hostnames:
            output = '\n'.join(list(not_drained_hostnames))
            log.info(
                'The following hosts WERE NOT DRAINED due to failed SLA check or external failures:'
            )
            print(output)
            if output_file:
                try:
                    with open(output_file, 'w') as fp:
                        fp.write(output)
                        fp.write('\n')
                    log.info('Written unsafe host names into: %s' %
                             output_file)
                except IOError as e:
                    log.error('Failed to write into the output file: %s' % e)

        return set(hostnames) - not_drained_hostnames

    def check_status(self, hostnames):
        """Query the scheduler to determine the maintenance status for a list of hostnames

    :param hostnames: Hosts to query for
    :type hostnames: list of strings
    :rtype: list of 2-tuples, hostname and MaintenanceMode
    """
        resp = self._client.maintenance_status(Hosts(set(hostnames)))
        check_and_log_response(resp)
        statuses = []
        for host_status in resp.result.maintenanceStatusResult.statuses:
            statuses.append(
                (host_status.host,
                 MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
        return statuses
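A hypothetical driver for the class above, showing how perform_maintenance, a per-host callback, and end_maintenance fit together; the cluster, hostnames, and the callback body are illustrative:

def reboot_host(hostname):
    # Invoked once per successfully drained host; the real operation is site-specific.
    print('Would reboot %s here' % hostname)

maintenance = HostMaintenance(CLUSTERS['devcluster'], 'normal')
drained = maintenance.perform_maintenance(['host1.example.com', 'host2.example.com'],
                                          callback=reboot_host)
# Hosts stay in maintenance mode on completion, so release them explicitly when done.
maintenance.end_maintenance(drained)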
Code Example #27
File: host_maintenance.py  Project: rosmo/aurora
 def __init__(self, cluster, verbosity, wait_event=None):
     self._client = AuroraClientAPI(cluster, verbosity == "verbose")
     self._wait_event = wait_event or Event()
Code Example #28
File: command_runner.py  Project: kevints/aurora
class DistributedCommandRunner(object):
    @classmethod
    def make_executor_path(cls, cluster, executor_name):
        parameters = cls.sandbox_args(cluster)
        parameters.update(executor_name=executor_name)
        return (
            posixpath.join(
                "%(slave_root)s", "slaves/*/frameworks/*/executors/%(executor_name)s/runs", "%(slave_run_directory)s"
            )
            % parameters
        )

    @classmethod
    def thermos_sandbox(cls, cluster, executor_sandbox=False):
        sandbox = cls.make_executor_path(cluster, "thermos-{{thermos.task_id}}")
        return sandbox if executor_sandbox else posixpath.join(sandbox, "sandbox")

    @classmethod
    def sandbox_args(cls, cluster):
        cluster = cluster.with_trait(CommandRunnerTrait)
        return {"slave_root": cluster.slave_root, "slave_run_directory": cluster.slave_run_directory}

    @classmethod
    def substitute_thermos(cls, command, task, cluster, **kw):
        prefix_command = "cd %s;" % cls.thermos_sandbox(cluster, **kw)
        thermos_namespace = ThermosContext(task_id=task.assignedTask.taskId, ports=task.assignedTask.assignedPorts)
        mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
        command = String(prefix_command + command) % Environment(thermos=thermos_namespace, mesos=mesos_namespace)
        return command.get()

    @classmethod
    def aurora_sandbox(cls, cluster, executor_sandbox=False):
        if executor_sandbox:
            return cls.make_executor_path(cluster, "twitter")
        else:
            return "/var/run/nexus/%task_id%/sandbox"

    @classmethod
    def substitute_aurora(cls, command, task, cluster, **kw):
        command = ("cd %s;" % cls.aurora_sandbox(cluster, **kw)) + command
        command = command.replace("%shard_id%", str(task.assignedTask.instanceId))
        command = command.replace("%task_id%", task.assignedTask.taskId)
        for name, port in task.assignedTask.assignedPorts.items():
            command = command.replace("%port:" + name + "%", str(port))
        return command

    @classmethod
    def substitute(cls, command, task, cluster, **kw):
        if task.assignedTask.task.executorConfig:
            return cls.substitute_thermos(command, task, cluster, **kw)
        else:
            return cls.substitute_aurora(command, task, cluster, **kw)

    @classmethod
    def query_from(cls, role, env, job):
        return TaskQuery(statuses=LIVE_STATES, jobKeys=[JobKey(role=role, environment=env, name=job)])

    def __init__(self, cluster, role, env, jobs, ssh_user=None, log_fn=log.log):
        self._cluster = cluster
        self._api = AuroraClientAPI(cluster=cluster)
        self._role = role
        self._env = env
        self._jobs = jobs
        self._ssh_user = ssh_user if ssh_user else self._role
        self._log = log_fn

    def execute(self, args):
        hostname, role, command = args
        ssh_command = ["ssh", "-n", "-q", "%s@%s" % (role, hostname), command]
        self._log(logging.DEBUG, "Running command: %s" % ssh_command)
        po = subprocess.Popen(ssh_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = po.communicate()
        return "\n".join("%s:  %s" % (hostname, line) for line in output[0].splitlines())

    def resolve(self):
        for job in self._jobs:
            resp = self._api.query(self.query_from(self._role, self._env, job))
            if resp.responseCode != ResponseCode.OK:
                self._log(logging.ERROR, "Failed to query job: %s" % job)
                continue
            for task in resp.result.scheduleStatusResult.tasks:
                yield task

    def process_arguments(self, command, **kw):
        for task in self.resolve():
            host = task.assignedTask.slaveHost
            yield (host, self._ssh_user, self.substitute(command, task, self._cluster, **kw))

    def run(self, command, parallelism=1, **kw):
        threadpool = ThreadPool(processes=parallelism)
        for result in threadpool.imap_unordered(self.execute, self.process_arguments(command, **kw)):
            print(result)
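A hypothetical way the runner above might be driven; the cluster object, job coordinates, and command are illustrative:

runner = DistributedCommandRunner(CLUSTERS['devcluster'], 'www-data', 'prod', ['hello'])
# Fans the command out over ssh to every live task of www-data/prod/hello and prints
# each line of remote output prefixed with the host name.
runner.run('ls', parallelism=4)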
Code Example #29
class HostMaintenance(object):
  """Submit requests to the scheduler to put hosts into and out of maintenance
  mode so they can be operated upon without causing LOST tasks.
  """

  DEFAULT_GROUPING = 'by_host'
  GROUPING_FUNCTIONS = {
    'by_host': group_by_host,
  }
  START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)

  @classmethod
  def group_hosts(cls, hostnames, grouping_function=DEFAULT_GROUPING):
    try:
      grouping_function = cls.GROUPING_FUNCTIONS[grouping_function]
    except KeyError:
      raise ValueError('Unknown grouping function %s!' % grouping_function)
    groups = defaultdict(set)
    for hostname in hostnames:
      groups[grouping_function(hostname)].add(hostname)
    return groups

  @classmethod
  def iter_batches(cls, hostnames, groups_per_batch, grouping_function=DEFAULT_GROUPING):
    if groups_per_batch <= 0:
      raise ValueError('Batch size must be > 0!')
    groups = cls.group_hosts(hostnames, grouping_function)
    groups = sorted(groups.items(), key=lambda v: v[0])
    for k in range(0, len(groups), groups_per_batch):
      yield Hosts(set.union(*(hostset for (key, hostset) in groups[k:k + groups_per_batch])))

  def __init__(self, cluster, verbosity):
    self._client = AuroraClientAPI(cluster, verbosity == 'verbose')

  def _drain_hosts(self, drainable_hosts, clock=time):
    """This will actively turn down tasks running on hosts."""
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    not_ready_hosts = [hostname for hostname in drainable_hosts.hostNames]
    while not_ready_hosts:
      log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
      clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
      resp = self._client.maintenance_status(Hosts(not_ready_hosts))
      if not resp.result.maintenanceStatusResult.statuses:
        not_ready_hosts = None
      for host_status in resp.result.maintenanceStatusResult.statuses:
        if host_status.mode != MaintenanceMode.DRAINED:
          log.warning('%s is currently in status %s' %
              (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
        else:
          not_ready_hosts.remove(host_status.host)

  def _complete_maintenance(self, drained_hosts):
    """End the maintenance status for a give set of hosts."""
    check_and_log_response(self._client.end_maintenance(drained_hosts))
    resp = self._client.maintenance_status(drained_hosts)
    for host_status in resp.result.maintenanceStatusResult.statuses:
      if host_status.mode != MaintenanceMode.NONE:
        log.warning('%s is DRAINING or in DRAINED' % host_status.host)

  def _operate_on_hosts(self, drained_hosts, callback):
    """Perform a given operation on a list of hosts that are ready for maintenance."""
    for host in drained_hosts.hostNames:
      callback(host)

  def end_maintenance(self, hosts):
    """Pull a list of hosts out of maintenance mode."""
    self._complete_maintenance(Hosts(set(hosts)))

  def start_maintenance(self, hosts):
    """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
    check_and_log_response(self._client.start_maintenance(Hosts(set(hosts))))

  def perform_maintenance(self, hosts, groups_per_batch=1, grouping_function=DEFAULT_GROUPING,
                          callback=None):
    """The wrap a callback in between sending hosts into maintenance mode and back.

    Walk through the process of putting hosts into maintenance, draining them of tasks,
    performing an action on them once drained, then removing them from maintenance mode
    so tasks can schedule.
    """
    self._complete_maintenance(Hosts(set(hosts)))
    self.start_maintenance(hosts)

    for hosts in self.iter_batches(hosts, groups_per_batch, grouping_function):
      self._drain_hosts(hosts)
      if callback:
        self._operate_on_hosts(hosts, callback)
      self._complete_maintenance(hosts)

  def check_status(self, hosts):
    resp = self._client.maintenance_status(Hosts(set(hosts)))
    check_and_log_response(resp)
    statuses = []
    for host_status in resp.result.maintenanceStatusResult.statuses:
      statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
    return statuses
Code Example #30
File: admin.py  Project: bhuvan/incubator-aurora
def query(args, options):
  """usage: query [--force]
                  [--listformat=FORMAT]
                  [--shards=N[,N,...]]
                  [--states=State[,State,...]]
                  cluster [role [job]]

  Query Mesos about jobs and tasks.
  """
  def _convert_fmt_string(fmtstr):
    import re
    def convert(match):
      return "%%(%s)s" % match.group(1)
    return re.sub(r'%(\w+)%', convert, fmtstr)

  def flatten_task(t, d={}):
    for key in t.__dict__.keys():
      val = getattr(t, key)
      try:
        val.__dict__.keys()
      except AttributeError:
        d[key] = val
      else:
        flatten_task(val, d)

    return d

  def map_values(d):
    default_value = lambda v: v
    mapping = {
      'status': lambda v: ScheduleStatus._VALUES_TO_NAMES[v],
    }
    return dict(
      (k, mapping.get(k, default_value)(v)) for (k, v) in d.items()
    )

  for state in options.states.split(','):
    if state not in ScheduleStatus._NAMES_TO_VALUES:
      msg = "Unknown state '%s' specified.  Valid states are:\n" % state
      msg += ','.join(ScheduleStatus._NAMES_TO_VALUES.keys())
      die(msg)

  # Role, Job, Instances, States, and the listformat
  if len(args) == 0:
    die('Must specify at least cluster.')

  cluster = args[0]
  role = args[1] if len(args) > 1 else None
  job = args[2] if len(args) > 2 else None
  instances = set(map(int, options.shards.split(','))) if options.shards else set()

  if options.states:
    states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(',')))
  else:
    states = ACTIVE_STATES | TERMINAL_STATES
  listformat = _convert_fmt_string(options.listformat)

  #  Figure out "expensive" queries here and bone if they do not have --force
  #  - Does not specify role
  if not role and not options.force:
    die('--force is required for expensive queries (no role specified)')

  #  - Does not specify job
  if not job and not options.force:
    die('--force is required for expensive queries (no job specified)')

  #  - Specifies status outside of ACTIVE_STATES
  if not (states <= ACTIVE_STATES) and not options.force:
    die('--force is required for expensive queries (states outside ACTIVE states)')

  api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))
  if query_info.responseCode != ResponseCode.OK:
    die('Failed to query scheduler: %s' % query_info.messageDEPRECATED)

  tasks = query_info.result.scheduleStatusResult.tasks
  if tasks is None:
    return

  try:
    for task in tasks:
      d = flatten_task(task)
      print(listformat % map_values(d))
  except KeyError:
    msg = "Unknown key in format string.  Valid keys are:\n"
    msg += ','.join(d.keys())
    die(msg)
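A minimal, self-contained sketch of the --listformat rewriting done by _convert_fmt_string above: user-facing %key% tokens become %(key)s mapping keys that are later filled from the flattened task dict (the keys shown are illustrative):

import re

def _convert_fmt_string(fmtstr):
    # Same rewrite as the nested helper above: %key% -> %(key)s
    return re.sub(r'%(\w+)%', lambda m: "%%(%s)s" % m.group(1), fmtstr)

listformat = _convert_fmt_string('%jobName% %status%')   # -> '%(jobName)s %(status)s'
print(listformat % {'jobName': 'hello_world', 'status': 'RUNNING'})   # -> hello_world RUNNING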
Code Example #31
File: test_api.py  Project: kevints/aurora
 def mock_api(cls):
     api = AuroraClientAPI(Cluster(name="foo"))
     mock_proxy = Mock()
     api._scheduler_proxy = mock_proxy
     return api, mock_proxy
Code Example #32
File: admin.py  Project: bhuvan/incubator-aurora
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on an all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """
  def parse_jobs_file(filename):
    result = {}
    with open(filename, 'r') as overrides:
      for line in overrides:
        if not line.strip():
          continue

        tokens = line.split()
        if len(tokens) != 3:
          die('Invalid line in %s:%s' % (filename, line))
        job_key = AuroraJobKey.from_path(tokens[0])
        result[job_key] = JobUpTimeLimit(
            job=job_key,
            percentage=parse_sla_percentage(tokens[1]),
            duration_secs=parse_time(tokens[2]).as_(Time.SECONDS)
        )
    return result

  options = app.get_options()

  sla_percentage = parse_sla_percentage(percentage)
  sla_duration = parse_time(duration)

  exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
  override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}
  get_grouping_or_die(options.grouping)

  vector = AuroraClientAPI(
      CLUSTERS[cluster],
      options.verbosity).sla_get_safe_domain_vector(options.min_instance_count, include_hosts)
  groups = vector.get_safe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS),
      override_jobs, options.grouping)

  results = []
  for group in groups:
    for host in sorted(group.keys()):
      if exclude_hosts and host in exclude_hosts:
        continue

      if options.list_jobs:
        results.append('\n'.join(['%s\t%s\t%.2f\t%d' %
            (host, d.job.to_path(), d.percentage, d.duration_secs) for d in sorted(group[host])]))
      else:
        results.append('%s' % host)

  print_results(results)
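A hypothetical --override_jobs file for the command above; each whitespace-separated line carries a job path, an SLA percentage, and a duration in the XdYhZmWs format expected by parse_jobs_file (the job keys are made up):

devcluster/www-data/prod/hello    95    30m
devcluster/www-data/prod/indexer  99.5  1d2h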
Code Example #33
class DistributedCommandRunner(object):
  @staticmethod
  def execute(args):
    hostname, role, command = args
    ssh_command = ['ssh', '-n', '-q', '%s@%s' % (role, hostname), command]
    po = subprocess.Popen(ssh_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = po.communicate()
    return '\n'.join('%s:  %s' % (hostname, line) for line in output[0].splitlines())

  @classmethod
  def make_executor_path(cls, cluster, executor_name):
    parameters = cls.sandbox_args(cluster)
    parameters.update(executor_name=executor_name)
    return posixpath.join(
        '%(slave_root)s',
        'slaves/*/frameworks/*/executors/%(executor_name)s/runs',
        '%(slave_run_directory)s'
    ) % parameters

  @classmethod
  def thermos_sandbox(cls, cluster, executor_sandbox=False):
    sandbox = cls.make_executor_path(cluster, 'thermos-{{thermos.task_id}}')
    return sandbox if executor_sandbox else posixpath.join(sandbox, 'sandbox')

  @classmethod
  def sandbox_args(cls, cluster):
    cluster = cluster.with_trait(CommandRunnerTrait)
    return {'slave_root': cluster.slave_root, 'slave_run_directory': cluster.slave_run_directory}

  @classmethod
  def substitute_thermos(cls, command, task, cluster, **kw):
    prefix_command = 'cd %s;' % cls.thermos_sandbox(cluster, **kw)
    thermos_namespace = ThermosContext(
        task_id=task.assignedTask.taskId,
        ports=task.assignedTask.assignedPorts)
    mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
    command = String(prefix_command + command) % Environment(
        thermos=thermos_namespace,
        mesos=mesos_namespace)
    return command.get()

  @classmethod
  def aurora_sandbox(cls, cluster, executor_sandbox=False):
    if executor_sandbox:
      return cls.make_executor_path(cluster, 'twitter')
    else:
      return '/var/run/nexus/%task_id%/sandbox'

  @classmethod
  def substitute_aurora(cls, command, task, cluster, **kw):
    command = ('cd %s;' % cls.aurora_sandbox(cluster, **kw)) + command
    command = command.replace('%shard_id%', str(task.assignedTask.instanceId))
    command = command.replace('%task_id%', task.assignedTask.taskId)
    for name, port in task.assignedTask.assignedPorts.items():
      command = command.replace('%port:' + name + '%', str(port))
    return command

  @classmethod
  def substitute(cls, command, task, cluster, **kw):
    if task.assignedTask.task.executorConfig:
      return cls.substitute_thermos(command, task, cluster, **kw)
    else:
      return cls.substitute_aurora(command, task, cluster, **kw)

  @classmethod
  def query_from(cls, role, env, job):
    return TaskQuery(statuses=LIVE_STATES, owner=Identity(role), jobName=job, environment=env)

  def __init__(self, cluster, role, env, jobs, ssh_user=None):
    self._cluster = cluster
    self._api = AuroraClientAPI(cluster=cluster)
    self._role = role
    self._env = env
    self._jobs = jobs
    self._ssh_user = ssh_user if ssh_user else self._role

  def resolve(self):
    for job in self._jobs:
      resp = self._api.query(self.query_from(self._role, self._env, job))
      if resp.responseCode != ResponseCode.OK:
        log.error('Failed to query job: %s' % job)
        continue
      for task in resp.result.scheduleStatusResult.tasks:
        yield task

  def process_arguments(self, command, **kw):
    for task in self.resolve():
      host = task.assignedTask.slaveHost
      role = task.assignedTask.task.owner.role
      yield (host, self._ssh_user, self.substitute(command, task, self._cluster, **kw))

  def run(self, command, parallelism=1, **kw):
    threadpool = ThreadPool(processes=parallelism)
    for result in threadpool.imap_unordered(self.execute, self.process_arguments(command, **kw)):
      print(result)
Code Example #34
File: admin.py  Project: mkacik/incubator-aurora
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on an all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """

    def parse_jobs_file(filename):
        result = {}
        with open(filename, "r") as overrides:
            for line in overrides:
                if not line.strip():
                    continue

                tokens = line.split()
                if len(tokens) != 3:
                    die("Invalid line in %s:%s" % (filename, line))
                job_key = AuroraJobKey.from_path(tokens[0])
                result[job_key] = JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(tokens[1]),
                    duration_secs=parse_time(tokens[2]).as_(Time.SECONDS),
                )
        return result

    options = app.get_options()

    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)

    exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
    include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
    override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}
    get_grouping_or_die(options.grouping)

    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(
        options.min_instance_count, include_hosts
    )
    groups = vector.get_safe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), override_jobs, options.grouping)

    results = []
    for group in groups:
        for host in sorted(group.keys()):
            if exclude_hosts and host in exclude_hosts:
                continue

            if options.list_jobs:
                results.append(
                    "\n".join(
                        [
                            "%s\t%s\t%.2f\t%d" % (host, d.job.to_path(), d.percentage, d.duration_secs)
                            for d in sorted(group[host])
                        ]
                    )
                )
            else:
                results.append("%s" % host)

    print_results(results)
Code Example #35
class HostMaintenance(object):
  """Submit requests to the scheduler to put hosts into and out of maintenance
  mode so they can be operated upon without causing LOST tasks.

  Aurora provides a two-tiered concept of Maintenance. The first step is to initiate maintenance,
  which will ask the Aurora scheduler to de-prioritize scheduling on a large set of hosts (the ones
  that will be operated upon during this maintenance window).  Once all hosts have been tagged in
  this manner, the operator can begin draining individual machines, which will have all user-tasks
  killed and rescheduled.  When the tasks get placed onto a new machine, the scheduler will first
  look for hosts that do not have the maintenance tag, which will help decrease churn and prevent a
  task from being constantly killed as its hosts go down from underneath it.
  """

  START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)

  SLA_MIN_JOB_INSTANCE_COUNT = 20


  @classmethod
  def iter_batches(cls, hostnames, grouping_function=DEFAULT_GROUPING):
    groups = group_hosts(hostnames, grouping_function)
    groups = sorted(groups.items(), key=lambda v: v[0])
    for group in groups:
      yield Hosts(group[1])

  def __init__(self, cluster, verbosity):
    self._client = AuroraClientAPI(cluster, verbosity == 'verbose')

  def _drain_hosts(self, drainable_hosts, clock=time):
    """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :param clock: time module for testing
    :type clock: time
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
    while not_ready_hostnames:
      log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
      clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
      resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
      if not resp.result.maintenanceStatusResult.statuses:
        not_ready_hostnames = None
      for host_status in resp.result.maintenanceStatusResult.statuses:
        if host_status.mode != MaintenanceMode.DRAINED:
          log.warning('%s is currently in status %s' %
              (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
        else:
          not_ready_hostnames.remove(host_status.host)

  def _complete_maintenance(self, drained_hosts):
    """End the maintenance status for a given set of hosts.

    :param drained_hosts: Hosts that are drained and finished being operated upon
    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
    """
    check_and_log_response(self._client.end_maintenance(drained_hosts))
    resp = self._client.maintenance_status(drained_hosts)
    for host_status in resp.result.maintenanceStatusResult.statuses:
      if host_status.mode != MaintenanceMode.NONE:
        log.warning('%s is DRAINING or in DRAINED' % host_status.host)

  def _check_sla(self, hostnames, grouping_function, percentage, duration):
    """Check if the provided list of hosts passes the job uptime SLA check.

    This is an all-or-nothing check, meaning that all provided hosts must pass their job
    SLA check for the maintenance to proceed.

    :param hostnames: list of host names to check SLA for
    :type hostnames: list of strings
    :param grouping_function: grouping function to apply to the given hosts
    :type grouping_function: function
    :param percentage: SLA uptime percentage override
    :type percentage: float
    :param duration: SLA uptime duration override
    :type duration: twitter.common.quantity.Amount
    :rtype: set of unsafe hosts
    """
    vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
    host_groups = vector.probe_hosts(
      percentage,
      duration.as_(Time.SECONDS),
      grouping_function)

    unsafe_hostnames = set()
    # Given that maintenance is performed 1 group at a time, any result longer than 1 group
    # should be considered a batch failure.
    if host_groups:
      if len(host_groups) > 1:
        log.error('Illegal multiple groups detected in SLA results. Skipping hosts: %s' % hostnames)
        return set(hostnames)

      results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
      if results:
        print_results(results)
        return unsafe_hostnames

    return unsafe_hostnames

  def end_maintenance(self, hostnames):
    """Pull a list of hostnames out of maintenance mode.

    :param hostnames: List of hosts to operate upon
    :type hostnames: list of strings
    """
    self._complete_maintenance(Hosts(set(hostnames)))

  def start_maintenance(self, hostnames):
    """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.

    This is part of two-phase draining- tasks will still be running on these hosts until
    drain_hosts is called upon them.

    :param hostnames: List of hosts to set for initial maintenance
    :type hostnames: list of strings
    :rtype: list of hostnames with the maintenance mode set
    """
    resp = self._client.start_maintenance(Hosts(set(hostnames)))
    check_and_log_response(resp)
    result = [host_status.host for host_status in resp.result.startMaintenanceResult.statuses]
    if len(result) != len(hostnames):
      log.warning('Skipping maintenance for unknown hosts: %s' % (set(hostnames) - set(result)))

    return result

  def perform_maintenance(self, hostnames, grouping_function=DEFAULT_GROUPING,
                          percentage=None, duration=None, output_file=None):
    """Put hosts into maintenance mode and drain them.

    Walk through the process of putting hosts into maintenance and draining them of tasks. The hosts
    will remain in maintenance mode upon completion.


    :param hostnames: A list of hostnames to operate upon
    :type hostnames: list of strings
    :param grouping_function: How to split up the hostname into groups
    :type grouping_function: function
    :param percentage: SLA percentage to use
    :type percentage: float
    :param duration: SLA duration to use
    :type duration: twitter.common.quantity.Time
    :param output_file: file to write hosts that were not drained due to failed SLA check
    :type output_file: string
    :rtype: set of host names that were successfully drained
    """
    hostnames = self.start_maintenance(hostnames)
    not_drained_hostnames = set()

    for hosts in self.iter_batches(hostnames, grouping_function):
      log.info('Beginning SLA check for %s' % hosts.hostNames)
      unsafe_hostnames = self._check_sla(
          list(hosts.hostNames),
          grouping_function,
          percentage,
          duration)

      if unsafe_hostnames:
        log.warning('Some hosts did not pass SLA check and will not be drained! '
                    'Skipping hosts: %s' % unsafe_hostnames)
        not_drained_hostnames |= unsafe_hostnames
        drainable_hostnames = hosts.hostNames - unsafe_hostnames
        if not drainable_hostnames:
          continue
        hosts = Hosts(drainable_hostnames)
      else:
        log.info('All hosts passed SLA check.')

      self._drain_hosts(hosts)

    if not_drained_hostnames:
      output = '\n'.join(list(not_drained_hostnames))
      log.info('The following hosts did not pass SLA check and were not drained:')
      print(output)
      if output_file:
        try:
          with open(output_file, 'w') as fp:
            fp.write(output)
            fp.write('\n')
          log.info('Wrote unsafe host names to: %s' % output_file)
        except IOError as e:
          log.error('Failed to write into the output file: %s' % e)

    return set(hostnames) - not_drained_hostnames

  def check_status(self, hostnames):
    """Query the scheduler to determine the maintenance status for a list of hostnames

    :param hostnames: Hosts to query for
    :type hostnames: list of strings
    :rtype: list of 2-tuples, hostname and MaintenanceMode
    """
    resp = self._client.maintenance_status(Hosts(set(hostnames)))
    check_and_log_response(resp)
    statuses = []
    for host_status in resp.result.maintenanceStatusResult.statuses:
      statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
    return statuses
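
Taken together, the methods above form a full drain workflow: start_maintenance marks hosts, perform_maintenance runs the SLA check and drains whatever passes it, check_status reports the resulting maintenance mode, and end_maintenance returns hosts to service. A minimal driver sketch follows; the HostMaintenance class name and its (cluster, verbosity) constructor are assumptions based on the __init__ snippets further down, and the cluster, host names, and output path are placeholders.

# Hypothetical driver for the maintenance workflow above; names are illustrative.
from twitter.common.quantity import Amount, Time

hosts = ['host-01.example.com', 'host-02.example.com']
maintainer = HostMaintenance(CLUSTERS['devcluster'], 'verbose')  # assumed constructor

drained = maintainer.perform_maintenance(
    hosts,
    percentage=95.0,                      # SLA uptime percentage override
    duration=Amount(30, Time.MINUTES),    # SLA uptime duration override
    output_file='/tmp/unsafe_hosts.txt')  # hosts failing the SLA check are written here

for host, mode in maintainer.check_status(hosts):
    print('%s: %s' % (host, mode))

# Once the drained hosts have been serviced, return them to the schedulable pool.
maintainer.end_maintenance(drained)
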
Code example #36
 def mock_api(cls):
     api = AuroraClientAPI(Cluster(name="foo"))
     mock_proxy = Mock()
     api._scheduler_proxy = mock_proxy
     return api, mock_proxy
Code example #37
 def __init__(self, cluster, verbosity):
     self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
Code example #38
File: test_api.py Project: apache/aurora
 def mock_api(cls):
   api = AuroraClientAPI(Cluster(name="foo"), 'test-client')
   mock_proxy = create_autospec(spec=SchedulerProxyApiSpec, spec_set=True, instance=True)
   api._scheduler_proxy = mock_proxy
   return api, mock_proxy
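
Both mock_api variants replace the scheduler proxy with a test double so the client never talks to a real scheduler; the create_autospec version additionally rejects any call that does not exist on SchedulerProxyApiSpec. A sketch of how such a fixture might be used in a test follows; the test method, the thrift import path, and the asserted forwarding are assumptions for illustration.

# Hypothetical test built on the mock_api() fixture above.
from gen.apache.aurora.api.ttypes import Response, ResponseCode  # import path assumed

def test_get_quota_uses_scheduler_proxy(self):
  api, mock_proxy = self.mock_api()
  mock_proxy.getQuota.return_value = Response(responseCode=ResponseCode.OK)

  resp = api.get_quota('www-data')

  # get_quota is assumed to forward to the proxy's getQuota thrift call.
  assert mock_proxy.getQuota.call_count == 1
  assert resp.responseCode == ResponseCode.OK
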
Code example #39
class DistributedCommandRunner(object):

  @classmethod
  def make_executor_path(cls, cluster, executor_name):
    parameters = cls.sandbox_args(cluster)
    parameters.update(executor_name=executor_name)
    return posixpath.join(
        '%(slave_root)s',
        'slaves/*/frameworks/*/executors/%(executor_name)s/runs',
        '%(slave_run_directory)s'
    ) % parameters

  @classmethod
  def thermos_sandbox(cls, cluster, executor_sandbox=False):
    sandbox = cls.make_executor_path(cluster, 'thermos-{{thermos.task_id}}')
    return sandbox if executor_sandbox else posixpath.join(sandbox, 'sandbox')

  @classmethod
  def sandbox_args(cls, cluster):
    cluster = cluster.with_trait(CommandRunnerTrait)
    return {'slave_root': cluster.slave_root, 'slave_run_directory': cluster.slave_run_directory}

  @classmethod
  def substitute(cls, command, task, cluster, **kw):
    prefix_command = 'cd %s;' % cls.thermos_sandbox(cluster, **kw)
    thermos_namespace = ThermosContext(
        task_id=task.assignedTask.taskId,
        ports=task.assignedTask.assignedPorts)
    mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
    command = String(prefix_command + command) % Environment(
        thermos=thermos_namespace,
        mesos=mesos_namespace)
    return command.get()

  @classmethod
  def query_from(cls, role, env, job):
    return TaskQuery(statuses=LIVE_STATES, jobKeys=[JobKey(role=role, environment=env, name=job)])

  def __init__(self, cluster, role, env, jobs, ssh_user=None,
      log_fn=log.log):
    self._cluster = cluster
    self._api = AuroraClientAPI(
        cluster=cluster,
        user_agent=AURORA_V2_USER_AGENT_NAME)
    self._role = role
    self._env = env
    self._jobs = jobs
    self._ssh_user = ssh_user if ssh_user else self._role
    self._log = log_fn

  def execute(self, args):
    hostname, role, command = args
    ssh_command = ['ssh', '-n', '-q', '%s@%s' % (role, hostname), command]
    self._log(logging.DEBUG, "Running command: %s" % ssh_command)
    po = subprocess.Popen(ssh_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = po.communicate()
    return '\n'.join('%s:  %s' % (hostname, line) for line in output[0].splitlines())

  def resolve(self):
    for job in self._jobs:
      resp = self._api.query(self.query_from(self._role, self._env, job))
      if resp.responseCode != ResponseCode.OK:
        self._log(logging.ERROR, 'Failed to query job: %s' % job)
        continue
      for task in resp.result.scheduleStatusResult.tasks:
        yield task

  def process_arguments(self, command, **kw):
    for task in self.resolve():
      host = task.assignedTask.slaveHost
      yield (host, self._ssh_user, self.substitute(command, task, self._cluster, **kw))

  def run(self, command, parallelism=1, **kw):
    threadpool = ThreadPool(processes=parallelism)
    for result in threadpool.imap_unordered(self.execute, self.process_arguments(command, **kw)):
      print(result)
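
The runner resolves live tasks for the requested jobs, rewrites the command so it runs inside each task's Thermos sandbox, and fans the SSH sessions out over a thread pool. A usage sketch with placeholder cluster, role, env, and job names (the CLUSTERS lookup is assumed):

# Hypothetical invocation of the DistributedCommandRunner above.
cluster = CLUSTERS['devcluster']          # cluster lookup assumed
runner = DistributedCommandRunner(cluster, 'www-data', 'prod', ['hello_world'])

# Runs `du -sh .` in every live task's sandbox, four SSH sessions at a time.
runner.run('du -sh .', parallelism=4)

# executor_sandbox=True targets the executor directory rather than the task sandbox.
runner.run('ls', parallelism=2, executor_sandbox=True)
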
Code example #40
class DistributedCommandRunner(object):
    @classmethod
    def make_executor_path(cls, cluster, executor_name):
        parameters = cls.sandbox_args(cluster)
        parameters.update(executor_name=executor_name)
        return posixpath.join(
            '%(slave_root)s',
            'slaves/*/frameworks/*/executors/%(executor_name)s/runs',
            '%(slave_run_directory)s') % parameters

    @classmethod
    def thermos_sandbox(cls, cluster, executor_sandbox=False):
        sandbox = cls.make_executor_path(cluster,
                                         'thermos-{{thermos.task_id}}')
        return sandbox if executor_sandbox else posixpath.join(
            sandbox, 'sandbox')

    @classmethod
    def sandbox_args(cls, cluster):
        cluster = cluster.with_trait(CommandRunnerTrait)
        return {
            'slave_root': cluster.slave_root,
            'slave_run_directory': cluster.slave_run_directory
        }

    @classmethod
    def substitute_thermos(cls, command, task, cluster, **kw):
        prefix_command = 'cd %s;' % cls.thermos_sandbox(cluster, **kw)
        thermos_namespace = ThermosContext(
            task_id=task.assignedTask.taskId,
            ports=task.assignedTask.assignedPorts)
        mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
        command = String(prefix_command + command) % Environment(
            thermos=thermos_namespace, mesos=mesos_namespace)
        return command.get()

    @classmethod
    def aurora_sandbox(cls, cluster, executor_sandbox=False):
        if executor_sandbox:
            return cls.make_executor_path(cluster, 'twitter')
        else:
            return '/var/run/nexus/%task_id%/sandbox'

    @classmethod
    def substitute_aurora(cls, command, task, cluster, **kw):
        command = ('cd %s;' % cls.aurora_sandbox(cluster, **kw)) + command
        command = command.replace('%shard_id%',
                                  str(task.assignedTask.instanceId))
        command = command.replace('%task_id%', task.assignedTask.taskId)
        for name, port in task.assignedTask.assignedPorts.items():
            command = command.replace('%port:' + name + '%', str(port))
        return command

    @classmethod
    def substitute(cls, command, task, cluster, **kw):
        if task.assignedTask.task.executorConfig:
            return cls.substitute_thermos(command, task, cluster, **kw)
        else:
            return cls.substitute_aurora(command, task, cluster, **kw)

    @classmethod
    def query_from(cls, role, env, job):
        return TaskQuery(
            statuses=LIVE_STATES,
            jobKeys=[JobKey(role=role, environment=env, name=job)])

    def __init__(self,
                 cluster,
                 role,
                 env,
                 jobs,
                 ssh_user=None,
                 log_fn=log.log):
        self._cluster = cluster
        self._api = AuroraClientAPI(cluster=cluster,
                                    user_agent=AURORA_V2_USER_AGENT_NAME)
        self._role = role
        self._env = env
        self._jobs = jobs
        self._ssh_user = ssh_user if ssh_user else self._role
        self._log = log_fn

    def execute(self, args):
        hostname, role, command = args
        ssh_command = ['ssh', '-n', '-q', '%s@%s' % (role, hostname), command]
        self._log(logging.DEBUG, "Running command: %s" % ssh_command)
        po = subprocess.Popen(ssh_command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
        output = po.communicate()
        return '\n'.join('%s:  %s' % (hostname, line)
                         for line in output[0].splitlines())

    def resolve(self):
        for job in self._jobs:
            resp = self._api.query(self.query_from(self._role, self._env, job))
            if resp.responseCode != ResponseCode.OK:
                self._log(logging.ERROR, 'Failed to query job: %s' % job)
                continue
            for task in resp.result.scheduleStatusResult.tasks:
                yield task

    def process_arguments(self, command, **kw):
        for task in self.resolve():
            host = task.assignedTask.slaveHost
            yield (host, self._ssh_user,
                   self.substitute(command, task, self._cluster, **kw))

    def run(self, command, parallelism=1, **kw):
        threadpool = ThreadPool(processes=parallelism)
        for result in threadpool.imap_unordered(
                self.execute, self.process_arguments(command, **kw)):
            print(result)
Code example #41
File: admin.py Project: mkacik/incubator-aurora
def query(args, options):
    """usage: query [--force]
                  [--listformat=FORMAT]
                  [--shards=N[,N,...]]
                  [--states=State[,State,...]]
                  cluster [role [job]]

  Query Mesos about jobs and tasks.
  """

    def _convert_fmt_string(fmtstr):
        import re

        def convert(match):
            return "%%(%s)s" % match.group(1)

        return re.sub(r"%(\w+)%", convert, fmtstr)

    def flatten_task(t, d=None):
        # Build a fresh dict per call; a mutable default would leak values across tasks.
        if d is None:
            d = {}
        for key in t.__dict__.keys():
            val = getattr(t, key)
            try:
                val.__dict__.keys()
            except AttributeError:
                d[key] = val
            else:
                flatten_task(val, d)

        return d

    def map_values(d):
        default_value = lambda v: v
        mapping = {"status": lambda v: ScheduleStatus._VALUES_TO_NAMES[v]}
        return dict((k, mapping.get(k, default_value)(v)) for (k, v) in d.items())

    for state in options.states.split(","):
        if state not in ScheduleStatus._NAMES_TO_VALUES:
            msg = "Unknown state '%s' specified.  Valid states are:\n" % state
            msg += ",".join(ScheduleStatus._NAMES_TO_VALUES.keys())
            die(msg)

    # Role, Job, Instances, States, and the listformat
    if len(args) == 0:
        die("Must specify at least cluster.")

    cluster = args[0]
    role = args[1] if len(args) > 1 else None
    job = args[2] if len(args) > 2 else None
    instances = set(map(int, options.shards.split(","))) if options.shards else set()

    if options.states:
        states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(",")))
    else:
        states = ACTIVE_STATES | TERMINAL_STATES
    listformat = _convert_fmt_string(options.listformat)

    #  Figure out "expensive" queries here and bone if they do not have --force
    #  - Does not specify role
    if not role and not options.force:
        die("--force is required for expensive queries (no role specified)")

    #  - Does not specify job
    if not job and not options.force:
        die("--force is required for expensive queries (no job specified)")

    #  - Specifies status outside of ACTIVE_STATES
    if not (states <= ACTIVE_STATES) and not options.force:
        die("--force is required for expensive queries (states outside ACTIVE states)")

    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))
    if query_info.responseCode != ResponseCode.OK:
        die("Failed to query scheduler: %s" % query_info.messageDEPRECATED)

    tasks = query_info.result.scheduleStatusResult.tasks
    if tasks is None:
        return

    try:
        for task in tasks:
            d = flatten_task(task)
            print(listformat % map_values(d))
    except KeyError:
        msg = "Unknown key in format string.  Valid keys are:\n"
        msg += ",".join(d.keys())
        die(msg)
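
The --listformat option uses %field% placeholders; _convert_fmt_string rewrites them into Python %(field)s formatting before the template is applied to each flattened task dict. A small illustration of that conversion (the format string and dict values are made up):

import re

def convert_fmt_string(fmtstr):
    # Same rewrite as _convert_fmt_string above: %field% -> %(field)s
    return re.sub(r'%(\w+)%', lambda match: '%%(%s)s' % match.group(1), fmtstr)

fmt = convert_fmt_string('%taskId% %status% %slaveHost%')
print(fmt)  # %(taskId)s %(status)s %(slaveHost)s

# Applied to a flattened-and-mapped task dict (keys are illustrative):
print(fmt % {'taskId': 'www-data-prod-hello-0-abc', 'status': 'RUNNING', 'slaveHost': 'host-01'})
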
Code example #42
 def __init__(self, cluster, verbosity):
   self._client = AuroraClientAPI(cluster, verbosity == 'verbose')
Code example #43
File: admin.py Project: retrack/incubator-aurora
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
            [--exclude_hosts=filename]
            [--include_hosts=filename]
            [--list_jobs]
            [--override_jobs=filename]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.
  """

    def parse_jobs_file(filename):
        result = {}
        with open(filename, "r") as overrides:
            for line in overrides:
                if not line.strip():
                    continue

                tokens = line.split()
                if len(tokens) != 3:
                    die("Invalid line in %s:%s" % (filename, line))
                job_key = AuroraJobKey.from_path(tokens[0])
                result[job_key] = DomainUpTimeSlaVector.JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(tokens[1]),
                    duration_secs=parse_time(tokens[2]).as_(Time.SECONDS),
                )
        return result

    options = app.get_options()

    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)

    exclude_hosts = parse_hosts_optional(options.exclude_hosts, options.exclude_filename)
    include_hosts = parse_hosts_optional(options.include_hosts, options.include_filename)
    override_jobs = parse_jobs_file(options.override_filename) if options.override_filename else {}

    vector = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).sla_get_safe_domain_vector(include_hosts)
    hosts = vector.get_safe_hosts(sla_percentage, sla_duration.as_(Time.SECONDS), override_jobs)

    results = []
    for host in sorted(hosts.keys()):
        if exclude_hosts and host in exclude_hosts:
            continue

        if options.list_jobs:
            results.append(
                "\n".join(
                    [
                        "%s\t%s\t%.2f\t%d" % (host, d.job.to_path(), d.percentage, d.duration_secs)
                        for d in sorted(hosts[host])
                    ]
                )
            )
        else:
            results.append("%s" % host)

    print_results(results)
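
parse_jobs_file expects one job per line in the --override_jobs file, with three whitespace-separated tokens: a job path, an SLA percentage, and a duration. The sketch below, reusing the names from the snippet above, shows what a single override line turns into; the job path and values are placeholders.

# Hypothetical override line and the limit parse_jobs_file builds from it.
line = 'devcluster/www-data/prod/hello 95.0 30m'

tokens = line.split()
job_key = AuroraJobKey.from_path(tokens[0])
limit = DomainUpTimeSlaVector.JobUpTimeLimit(
    job=job_key,
    percentage=parse_sla_percentage(tokens[1]),              # -> 95.0
    duration_secs=parse_time(tokens[2]).as_(Time.SECONDS))   # -> 1800
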