Example #1
0
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
    cpu = float(cpu_str)
    ram = parse_data(ram_str)
    disk = parse_data(disk_str)

    client = make_admin_client(cluster)
    resp = client.get_quota(role)
    quota = resp.result.getQuotaResult.quota
    log.info('Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
             (role, quota.numCpus, quota.ramMb, quota.diskMb))

    new_cpu = float(cpu + quota.numCpus)
    new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
    new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))

    log.info(
        'Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB'
        % (role, new_cpu, new_ram, new_disk))

    resp = client.set_quota(role, new_cpu, new_ram, new_disk)
    check_and_log_response(resp)
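A note on the unit math above: Amount and Data are the twitter.common.quantity types, and the additions simply combine like-unit quantities. A minimal sketch of what the RAM line computes, with illustrative values and the parse_data behaviour assumed:

from twitter.common.quantity import Amount, Data

ram = Amount(512, Data.MB)                                  # e.g. what parse_data('512mb') might yield
new_ram = int((ram + Amount(1024, Data.MB)).as_(Data.MB))   # existing quota of 1024 MB -> 1536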
Example #2
0
  def _drain_hosts(self, drainable_hosts):
    """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names that failed to drain
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

    total_wait = self.STATUS_POLL_INTERVAL
    not_drained_hostnames = set(drainable_hostnames)
    while not self._wait_event.is_set() and not_drained_hostnames:
      self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

      not_drained_hostnames = self.check_if_drained(drainable_hostnames)

      total_wait += self.STATUS_POLL_INTERVAL
      if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
        log.warning('Failed to move all hosts into DRAINED within %s' % self.MAX_STATUS_WAIT)
        break

    return not_drained_hostnames
Example #3
0
    def _drain_hosts(self, drainable_hosts):
        """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        not_ready_hostnames = [
            hostname for hostname in drainable_hosts.hostNames
        ]
        while not_ready_hostnames:
            resp = self._client.maintenance_status(
                Hosts(set(not_ready_hostnames)))
            if not resp.result.maintenanceStatusResult.statuses:
                not_ready_hostnames = None
            for host_status in resp.result.maintenanceStatusResult.statuses:
                if host_status.mode != MaintenanceMode.DRAINED:
                    log.warning(
                        '%s is currently in status %s' %
                        (host_status.host,
                         MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
                else:
                    not_ready_hostnames.remove(host_status.host)
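The MaintenanceMode._VALUES_TO_NAMES lookup above (and the ScheduleStatus._VALUES_TO_NAMES lookups elsewhere in these examples) is the reverse mapping that Thrift generates for every enum. Roughly, with the constant values shown only for illustration:

class MaintenanceMode(object):
  NONE = 1
  SCHEDULED = 2
  DRAINING = 3
  DRAINED = 4

  _VALUES_TO_NAMES = {1: 'NONE', 2: 'SCHEDULED', 3: 'DRAINING', 4: 'DRAINED'}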
Example #4
0
def scheduler_backup_now(cluster):
  """usage: scheduler_backup_now cluster

  Immediately initiates a full storage backup.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).perform_backup())
Example #5
0
def scheduler_commit_recovery(cluster):
    """usage: scheduler_commit_recovery cluster

  Commits a staged recovery.
  """
    check_and_log_response(
        make_admin_client_with_options(cluster).commit_recovery())
Example #6
0
def status(args, options):
    """usage: status cluster/role/env/job

  Fetches and prints information about the active tasks in a job.
  """
    def is_active(task):
        return task.status in ACTIVE_STATES

    def print_task(scheduled_task):
        assigned_task = scheduled_task.assignedTask
        taskInfo = assigned_task.task
        taskString = ''
        if taskInfo:
            taskString += '''cpus: %s, ram: %s MB, disk: %s MB''' % (
                taskInfo.numCpus, taskInfo.ramMb, taskInfo.diskMb)
        if assigned_task.assignedPorts:
            taskString += '\n\tports: %s' % assigned_task.assignedPorts
        taskString += '\n\tfailure count: %s (max %s)' % (
            scheduled_task.failureCount, taskInfo.maxTaskFailures)
        taskString += '\n\tevents:'
        for event in scheduled_task.taskEvents:
            taskString += '\n\t\t %s %s: %s' % (
                datetime.fromtimestamp(event.timestamp / 1000),
                ScheduleStatus._VALUES_TO_NAMES[event.status], event.message)
        taskString += '\n\tmetadata:'
        if assigned_task.task.metadata is not None:
            for md in assigned_task.task.metadata:
                taskString += ('\n\t\t%s: %s' % (md.key, md.value))

        return taskString

    def print_tasks(tasks):
        for task in tasks:
            taskString = print_task(task)

            log.info(
                'role: %s, env: %s, name: %s, shard: %s, status: %s on %s\n%s'
                %
                (task.assignedTask.task.owner.role,
                 task.assignedTask.task.environment,
                 task.assignedTask.task.jobName, task.assignedTask.instanceId,
                 ScheduleStatus._VALUES_TO_NAMES[task.status],
                 task.assignedTask.slaveHost, taskString))

    api, job_key, _ = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())
    v1_deprecation_warning("status", ["job", "status", args[0]])
    resp = api.check_status(job_key)
    check_and_log_response(resp)

    tasks = resp.result.scheduleStatusResult.tasks
    if tasks:
        active_tasks = filter(is_active, tasks)
        log.info('Active Tasks (%s)' % len(active_tasks))
        print_tasks(active_tasks)
        inactive_tasks = filter(lambda x: not is_active(x), tasks)
        log.info('Inactive Tasks (%s)' % len(inactive_tasks))
        print_tasks(inactive_tasks)
    else:
        log.info('No tasks found.')
Example #7
0
def really_killall(args, options):
    """Helper for testing purposes: make it easier to mock out the actual kill process,
  while testing hooks in the command dispatch process.
  """
    maybe_disable_hooks(options)
    job_key = AuroraJobKey.from_path(args[0])
    config_file = args[1] if len(args) > 1 else None  # the config for hooks
    new_cmd = ["job", "killall", args[0]]
    if config_file is not None:
        new_cmd.append("--config=%s" % config_file)
    if options.open_browser:
        new_cmd.append("--open-browser")
    if options.batch_size is not None:
        new_cmd.append("--batch-size=%s" % options.batch_size)
    if options.max_total_failures is not None:
        new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
    v1_deprecation_warning("killall", new_cmd)

    config = get_job_config(job_key.to_path(), config_file,
                            options) if config_file else None
    api = make_client(job_key.cluster)
    if options.batch_size is not None:
        kill_in_batches(api, job_key, None, options.batch_size,
                        options.max_failures_option)
    else:
        resp = api.kill_job(job_key, None, config=config)
        check_and_log_response(resp)
    handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role,
                job_key.env, job_key.name)
    wait_kill_tasks(api.scheduler_proxy, job_key)
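To make the deprecation warning above concrete: for a hypothetical invocation killall devcluster/www-data/prod/hello hello.aurora --batch-size=5, the v2 command it would advertise is roughly:

new_cmd == ['job', 'killall', 'devcluster/www-data/prod/hello', '--config=hello.aurora', '--batch-size=5']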
Example #8
0
  def _drain_hosts(self, drainable_hosts, clock=time):
    """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :param clock: time module for testing
    :type clock: time
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
    while not_ready_hostnames:
      log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
      clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
      resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
      if not resp.result.maintenanceStatusResult.statuses:
        not_ready_hostnames = None
      for host_status in resp.result.maintenanceStatusResult.statuses:
        if host_status.mode != MaintenanceMode.DRAINED:
          log.warning('%s is currently in status %s' %
              (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
        else:
          not_ready_hostnames.remove(host_status.host)
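The clock=time parameter exists purely so tests can avoid real sleeps. A minimal sketch of the kind of stand-in a test might inject (the class and the call site are assumptions, not taken from the Aurora test suite):

class FakeClock(object):
  def __init__(self):
    self.slept = 0  # seconds "slept" so far

  def sleep(self, seconds):
    self.slept += seconds

# hypothetical usage: maintenance._drain_hosts(hosts, clock=FakeClock())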
Example #9
0
def scheduler_stage_recovery(cluster, backup_id):
    """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
    check_and_log_response(
        make_admin_client_with_options(cluster).stage_recovery(backup_id))
Example #10
0
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
    cpu = float(cpu_str)
    ram = parse_data(ram_str).as_(Data.MB)
    disk = parse_data(disk_str).as_(Data.MB)

    client = make_admin_client_with_options(cluster)
    resp = client.get_quota(role)
    quota = resp.result.getQuotaResult.quota
    resource_details = ResourceManager.resource_details_from_quota(quota)
    log.info('Current quota for %s:\n\t%s' % (role, '\n\t'.join(
        '%s\t%s%s' %
        (r.resource_type.display_name, r.value, r.resource_type.display_unit)
        for r in resource_details)))

    new_cpu = ResourceType.CPUS.value_type(
        cpu + ResourceManager.quantity_of(resource_details, ResourceType.CPUS))
    new_ram = ResourceType.RAM_MB.value_type(
        ram +
        ResourceManager.quantity_of(resource_details, ResourceType.RAM_MB))
    new_disk = ResourceType.DISK_MB.value_type(
        disk +
        ResourceManager.quantity_of(resource_details, ResourceType.DISK_MB))

    log.info(
        'Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB'
        % (role, new_cpu, new_ram, new_disk))

    resp = client.set_quota(role, new_cpu, new_ram, new_disk)
    check_and_log_response(resp)
Example #11
0
def really_start_cron(args, options):
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  resp = api.start_cronjob(job_key, config=config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
Example #12
0
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
    cpu = float(cpu_str)
    ram = parse_data(ram_str)
    disk = parse_data(disk_str)

    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")
    resp = client.get_quota(role)
    quota = resp.result.getQuotaResult.quota
    log.info(
        "Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, quota.numCpus, quota.ramMb, quota.diskMb)
    )

    new_cpu = float(cpu + quota.numCpus)
    new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
    new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))

    log.info(
        "Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, new_cpu, new_ram, new_disk)
    )

    resp = client.set_quota(role, new_cpu, new_ram, new_disk)
    check_and_log_response(resp)
Example #13
0
def scheduler_unload_recovery(cluster):
    """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
    check_and_log_response(
        make_admin_client_with_options(cluster).unload_recovery())
Example #14
0
def scheduler_unload_recovery(cluster):
    """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
    options = app.get_options()
    check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).unload_recovery())
Example #15
0
def scheduler_snapshot(cluster):
    """usage: scheduler_snapshot cluster

  Request that the scheduler perform a storage snapshot and block until complete.
  """
    options = app.get_options()
    check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).snapshot())
Example #16
0
def really_killall(args, options):
  """Helper for testing purposes: make it easier to mock out the actual kill process,
  while testing hooks in the command dispatch process.
  """
  maybe_disable_hooks(options)
  job_key = AuroraJobKey.from_path(args[0])
  config_file = args[1] if len(args) > 1 else None  # the config for hooks
  new_cmd = ["job", "killall", args[0]]
  if config_file is not None:
    new_cmd.append("--config=%s" % config_file)
  if options.open_browser:
    new_cmd.append("--open-browser")
  if options.batch_size is not None:
    new_cmd.append("--batch-size=%s" % options.batch_size)
  if options.max_total_failures is not None:
    new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
  v1_deprecation_warning("killall", new_cmd)

  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  api = make_client(job_key.cluster)
  if options.batch_size is not None:
    kill_in_batches(api, job_key, None, options.batch_size, options.max_failures_option)
  else:
    resp = api.kill_job(job_key, None, config=config)
    check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
  wait_kill_tasks(api.scheduler_proxy, job_key)
Example #17
0
def scheduler_stage_recovery(cluster, backup_id):
    """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
    options = app.get_options()
    check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).stage_recovery(backup_id))
Example #18
0
def status(args, options):
  """usage: status cluster/role/env/job

  Fetches and prints information about the active tasks in a job.
  """
  def is_active(task):
    return task.status in ACTIVE_STATES

  def print_task(scheduled_task):
    assigned_task = scheduled_task.assignedTask
    taskInfo = assigned_task.task
    taskString = ''
    if taskInfo:
      taskString += '''cpus: %s, ram: %s MB, disk: %s MB''' % (taskInfo.numCpus,
                                                               taskInfo.ramMb,
                                                               taskInfo.diskMb)
    if assigned_task.assignedPorts:
      taskString += '\n\tports: %s' % assigned_task.assignedPorts
    taskString += '\n\tfailure count: %s (max %s)' % (scheduled_task.failureCount,
                                                      taskInfo.maxTaskFailures)
    taskString += '\n\tevents:'
    for event in scheduled_task.taskEvents:
      taskString += '\n\t\t %s %s: %s' % (datetime.fromtimestamp(event.timestamp / 1000),
                                          ScheduleStatus._VALUES_TO_NAMES[event.status],
                                          event.message)
    taskString += '\n\tmetadata:'
    if assigned_task.task.metadata is not None:
      for md in assigned_task.task.metadata:
        taskString += ('\n\t\t%s: %s' % (md.key, md.value))

    return taskString

  def print_tasks(tasks):
    for task in tasks:
      taskString = print_task(task)

      log.info('role: %s, env: %s, name: %s, shard: %s, status: %s on %s\n%s' %
             (task.assignedTask.task.owner.role,
              task.assignedTask.task.environment,
              task.assignedTask.task.jobName,
              task.assignedTask.instanceId,
              ScheduleStatus._VALUES_TO_NAMES[task.status],
              task.assignedTask.slaveHost,
              taskString))

  api, job_key, _ = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  resp = api.check_status(job_key)
  check_and_log_response(resp)

  tasks = resp.result.scheduleStatusResult.tasks
  if tasks:
    active_tasks = filter(is_active, tasks)
    log.info('Active Tasks (%s)' % len(active_tasks))
    print_tasks(active_tasks)
    inactive_tasks = filter(lambda x: not is_active(x), tasks)
    log.info('Inactive Tasks (%s)' % len(inactive_tasks))
    print_tasks(inactive_tasks)
  else:
    log.info('No tasks found.')
Example #19
0
def scheduler_snapshot(cluster):
  """usage: scheduler_snapshot cluster

  Request that the scheduler perform a storage snapshot and block until complete.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).snapshot())
Example #20
0
    def _drain_hosts(self, drainable_hosts):
        """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names that failed to drain
    """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

        total_wait = self.STATUS_POLL_INTERVAL
        not_drained_hostnames = set(drainable_hostnames)
        while not self._wait_event.is_set() and not_drained_hostnames:
            log.info("Waiting for hosts to be in DRAINED: %s" % not_drained_hostnames)
            self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

            statuses = self.check_status(list(not_drained_hostnames))
            not_drained_hostnames = set(h[0] for h in statuses if h[1] != "DRAINED")

            total_wait += self.STATUS_POLL_INTERVAL
            if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
                log.warning(
                    "Failed to move all hosts into DRAINED within %s:\n%s"
                    % (
                        self.MAX_STATUS_WAIT,
                        "\n".join("\tHost:%s\tStatus:%s" % h for h in sorted(statuses) if h[1] != "DRAINED"),
                    )
                )
                break

        return not_drained_hostnames
Example #21
0
  def check_status(self, hosts):
    resp = self._client.maintenance_status(Hosts(set(hosts)))
    check_and_log_response(resp)
    statuses = []
    for host_status in resp.result.maintenanceStatusResult.statuses:
      statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
    return statuses
Example #22
0
def scheduler_backup_now(cluster):
    """usage: scheduler_backup_now cluster

  Immediately initiates a full storage backup.
  """
    check_and_log_response(
        make_admin_client_with_options(cluster).perform_backup())
Example #23
0
  def _drain_hosts(self, drainable_hosts):
    """"Drains tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names that failed to drain
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

    total_wait = self.STATUS_POLL_INTERVAL
    not_drained_hostnames = set(drainable_hostnames)
    while not self._wait_event.is_set() and not_drained_hostnames:
      log.info('Waiting for hosts to be in DRAINED: %s' % not_drained_hostnames)
      self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

      statuses = self.check_status(list(not_drained_hostnames))
      not_drained_hostnames = set(h[0] for h in statuses if h[1] != 'DRAINED')

      total_wait += self.STATUS_POLL_INTERVAL
      if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
        log.warning('Failed to move all hosts into DRAINED within %s:\n%s' %
            (self.MAX_STATUS_WAIT,
            '\n'.join("\tHost:%s\tStatus:%s" % h for h in sorted(statuses) if h[1] != 'DRAINED')))
        break

    return not_drained_hostnames
Example #24
0
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str).as_(Data.MB)
  disk = parse_data(disk_str).as_(Data.MB)

  client = make_admin_client_with_options(cluster)
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  resource_details = ResourceManager.resource_details_from_quota(quota)
  log.info('Current quota for %s:\n\t%s' % (
      role,
      '\n\t'.join('%s\t%s%s' % (
          r.resource_type.display_name,
          r.value,
          r.resource_type.display_unit) for r in resource_details)))

  new_cpu = ResourceType.CPUS.value_type(
    cpu + ResourceManager.quantity_of(resource_details, ResourceType.CPUS))
  new_ram = ResourceType.RAM_MB.value_type(
    ram + ResourceManager.quantity_of(resource_details, ResourceType.RAM_MB))
  new_disk = ResourceType.DISK_MB.value_type(
    disk + ResourceManager.quantity_of(resource_details, ResourceType.DISK_MB))

  log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
Example #25
0
def list_jobs(cluster_and_role):
  """usage: list_jobs [--show-cron] cluster/role/env/job

  Shows all jobs that match the job-spec known by the scheduler.
  If --show-cron is specified, then also shows the registered cron schedule.
  """
  def show_job_simple(job):
    if options.show_cron_schedule:
      print(('{0}/{1.key.role}/{1.key.environment}/{1.key.name}' +
          '\t\'{1.cronSchedule}\'\t{1.cronCollisionPolicy}').format(cluster, job))
    else:
      print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format(cluster, job))

  def show_job_pretty(job):
    print("Job %s/%s/%s/%s:" %
        (cluster, job.key.role, job.key.environment, job.key.name))
    print('\tcron schedule: %s' % job.cronSchedule)
    print('\tcron policy:   %s' % job.cronCollisionPolicy)

  options = app.get_options()
  if options.show_cron_schedule and options.pretty:
    print_fn = show_job_pretty
  else:
    print_fn = show_job_simple
  # Take the cluster_and_role parameter, and split it into its two components.
  if cluster_and_role.count('/') != 1:
    die('list_jobs parameter must be in cluster/role format')
  (cluster, role) = cluster_and_role.split('/')
  api = make_client(cluster)
  resp = api.get_jobs(role)
  check_and_log_response(resp)
  for job in resp.result.getJobsResult.configs:
    print_fn(job)
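For example, without --show-cron the simple formatter prints one job path per line (values hypothetical):

devcluster/www-data/prod/hello

With --show-cron it appends the quoted cron schedule and the cronCollisionPolicy value to the same tab-separated line.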
Example #26
0
def restart(args, options):
  """usage: restart cluster/role/env/job
               [--shards=SHARDS]
               [--batch_size=INT]
               [--updater_health_check_interval_seconds=SECONDS]
               [--max_per_shard_failures=INT]
               [--max_total_failures=INT]
               [--restart_threshold=INT]
               [--watch_secs=SECONDS]

  Performs a rolling restart of shards within a job.

  Restarts are fully controlled client-side, so aborting halts the restart.
  """
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  updater_config = UpdaterConfig(
      options.batch_size,
      options.restart_threshold,
      options.watch_secs,
      options.max_per_shard_failures,
      options.max_total_failures)
  resp = api.restart(job_key, options.shards, updater_config,
      options.health_check_interval_seconds, config=config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
Example #27
0
def really_kill(args, options):
  if options.shards is None:
    print('Shards option is required for kill; use killall to kill all shards', file=sys.stderr)
    exit(1)
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  instance_key = str(job_key)
  if options.shards is not None:
    instance_key = "%s/%s" % (instance_key, ",".join(map(str, options.shards)))
  new_cmd = ["job", "kill", instance_key]
  if config_file is not None:
    new_cmd.append("--config=%s" % config_file)
  if options.open_browser:
    new_cmd.append("--open-browser")
  if options.batch_size is not None:
    new_cmd.append("--batch-size=%s" % options.batch_size)
  if options.max_total_failures is not None:
    new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
  v1_deprecation_warning("kill", new_cmd)

  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  if options.batch_size is not None:
    kill_in_batches(api, job_key, options.shards, options.batch_size, options.max_failures_option)
  else:
    resp = api.kill_job(job_key, options.shards, config=config)
    check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
  wait_kill_tasks(api.scheduler_proxy, job_key, options.shards)
Example #28
0
def really_update(job_spec, config_file, options):
  def warn_if_dangerous_change(api, job_spec, config):
    # Get the current job status, so that we can check if there's anything
    # dangerous about this update.
    resp = api.query_no_configs(api.build_query(config.role(), config.name(),
        statuses=ACTIVE_STATES, env=config.environment()))
    if resp.responseCode != ResponseCode.OK:
      die('Could not get job status from server for comparison: %s' % resp.messageDEPRECATED)
    remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]
    resp = api.populate_job_config(config)
    if resp.responseCode != ResponseCode.OK:
      die('Server could not populate job config for comparison: %s' % resp.messageDEPRECATED)
    local_task_count = len(resp.result.populateJobResult.populated)
    remote_task_count = len(remote_tasks)
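    # Flag a "dangerous" update: the proposed task count is at least 4x, at most a quarter of,
    # or zero relative to what is currently running.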
    if (local_task_count >= 4 * remote_task_count or local_task_count <= remote_task_count / 4
        or local_task_count == 0):
      print('Warning: this update is a large change. Press ^c within 5 seconds to abort')
      time.sleep(5)

  maybe_disable_hooks(options)
  config = get_job_config(job_spec, config_file, options)
  api = make_client(config.cluster())
  if not options.force:
    warn_if_dangerous_change(api, job_spec, config)
  resp = api.update_job(config, options.health_check_interval_seconds, options.shards)
  check_and_log_response(resp)
Example #29
0
def really_kill(args, options):
    if options.shards is None:
        print(
            'Shards option is required for kill; use killall to kill all shards',
            file=sys.stderr)
        exit(1)
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())
    instance_key = str(job_key)
    if options.shards is not None:
        instance_key = "%s/%s" % (instance_key, ",".join(
            map(str, options.shards)))
    new_cmd = ["job", "kill", instance_key]
    if config_file is not None:
        new_cmd.append("--config=%s" % config_file)
    if options.open_browser:
        new_cmd.append("--open-browser")
    if options.batch_size is not None:
        new_cmd.append("--batch-size=%s" % options.batch_size)
    if options.max_total_failures is not None:
        new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
    v1_deprecation_warning("kill", new_cmd)

    config = get_job_config(job_key.to_path(), config_file,
                            options) if config_file else None
    if options.batch_size is not None:
        kill_in_batches(api, job_key, options.shards, options.batch_size,
                        options.max_failures_option)
    else:
        resp = api.kill_job(job_key, options.shards, config=config)
        check_and_log_response(resp)
    handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role,
                job_key.env, job_key.name)
    wait_kill_tasks(api.scheduler_proxy, job_key, options.shards)
Example #30
0
def scheduler_delete_recovery_tasks(cluster, task_ids):
  """usage: scheduler_delete_recovery_tasks cluster task_ids

  Deletes a comma-separated list of task IDs from a staged recovery.
  """
  ids = set(task_ids.split(','))
  check_and_log_response(make_admin_client(cluster).delete_recovery_tasks(TaskQuery(taskIds=ids)))
Example #31
0
  def _complete_maintenance(self, drained_hosts):
    """End the maintenance status for a given set of hosts."""
    check_and_log_response(self._client.end_maintenance(drained_hosts))
    resp = self._client.maintenance_status(drained_hosts)
    for host_status in resp.result.maintenanceStatusResult.statuses:
      if host_status.mode != MaintenanceMode.NONE:
        log.warning('%s is DRAINING or in DRAINED' % host_status.host)
Example #32
0
def scheduler_backup_now(cluster):
    """usage: scheduler_backup_now cluster

  Immediately initiates a full storage backup.
  """
    options = app.get_options()
    check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).perform_backup())
Example #33
0
def scheduler_stage_recovery(cluster, backup_id):
  """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
  options = app.get_options()
  check_and_log_response(
      AuroraClientAPI(CLUSTERS[cluster], options.verbosity).stage_recovery(backup_id))
Example #34
0
def scheduler_unload_recovery(cluster):
  """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
      .unload_recovery())
Example #35
0
def really_cancel_update(args, options):
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  new_cmd = ["job", "cancel-update", str(job_key)]
  v1_deprecation_warning("cancel_update", new_cmd)
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  resp = api.cancel_update(job_key, config=config)
  check_and_log_response(resp)
Example #36
0
def ssh(args, options):
  """usage: ssh cluster/role/env/job shard [args...]

  Initiate an SSH session on the machine that a shard is running on.
  """
  if not args:
    die('Job path is required')
  job_path = args.pop(0)
  try:
    cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
  except AuroraJobKey.Error as e:
    die('Invalid job path "%s": %s' % (job_path, e))
  if not args:
    die('Shard is required')
  try:
    shard = int(args.pop(0))
  except ValueError:
    die('Shard must be an integer')

  newcmd = ["task", "ssh", "%s/%s" % (job_path, shard)]
  if len(options.tunnels) > 0:
    newcmd.append("--tunnels=%s" % options.tunnels)
  if options.ssh_user is not None:
    newcmd.append("--ssh-user=%s" % options.ssh_user)
  if options.executor_sandbox:
    newcmd.append("--executor-sandbox")
  if len(args) > 0:
    newcmd.append("--command=\"%s\"" % " ".join(args))
  v1_deprecation_warning("ssh", newcmd)

  api = make_client(cluster_name)
  resp = api.query(api.build_query(role, name, set([int(shard)]), env=env))
  check_and_log_response(resp)

  first_task = resp.result.scheduleStatusResult.tasks[0]
  remote_cmd = 'bash' if not args else ' '.join(args)
  command = DistributedCommandRunner.substitute(remote_cmd, first_task,
      api.cluster, executor_sandbox=options.executor_sandbox)

  ssh_command = ['ssh', '-t']


  role = first_task.assignedTask.task.owner.role
  slave_host = first_task.assignedTask.slaveHost

  for tunnel in options.tunnels:
    try:
      port, name = tunnel.split(':')
      port = int(port)
    except ValueError:
      die('Could not parse tunnel: %s.  Must be of form PORT:NAME' % tunnel)
    if name not in first_task.assignedTask.assignedPorts:
      die('Task %s has no port named %s' % (first_task.assignedTask.taskId, name))
    ssh_command += [
        '-L', '%d:%s:%d' % (port, slave_host, first_task.assignedTask.assignedPorts[name])]

  ssh_command += ['%s@%s' % (options.ssh_user or role, slave_host), command]
  return subprocess.call(ssh_command)
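As a concrete illustration (all values hypothetical): for ssh devcluster/www-data/prod/hello 0 with --tunnels=8080:http, where the task runs on slave-42 with an assigned http port of 31337 and no extra command is given, the command executed above would be roughly:

ssh_command == ['ssh', '-t', '-L', '8080:slave-42:31337', 'www-data@slave-42', 'bash']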
Example #37
0
def scheduler_delete_recovery_tasks(cluster, task_ids):
  """usage: scheduler_delete_recovery_tasks cluster task_ids

  Deletes a comma-separated list of task IDs from a staged recovery.
  """
  ids = set(task_ids.split(','))
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
      .delete_recovery_tasks(TaskQuery(taskIds=ids)))
Example #38
0
def really_cancel_update(args, options):
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())
    new_cmd = ["job", "cancel-update", str(job_key)]
    v1_deprecation_warning("cancel_update", new_cmd)
    config = get_job_config(job_key.to_path(), config_file,
                            options) if config_file else None
    resp = api.cancel_update(job_key, config=config)
    check_and_log_response(resp)
Example #39
0
def scheduler_delete_recovery_tasks(cluster, task_ids):
  """usage: scheduler_delete_recovery_tasks cluster task_ids

  Deletes a comma-separated list of task IDs from a staged recovery.
  """
  ids = set(task_ids.split(','))
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
      .delete_recovery_tasks(TaskQuery(taskIds=ids)))
Example #40
0
def really_start_cron(args, options):
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())
    config = get_job_config(job_key.to_path(), config_file,
                            options) if config_file else None
    resp = api.start_cronjob(job_key, config=config)
    check_and_log_response(resp)
    handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role,
                job_key.env, job_key.name)
Example #41
0
  def start_maintenance(self, hostnames):
    """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.

    This is part of two-phase draining: tasks will still be running on these hosts until
    drain_hosts is called upon them.

    :param hostnames: List of hosts to set for initial maintenance
    :type hostnames: list of strings
    """
    check_and_log_response(self._client.start_maintenance(Hosts(set(hostnames))))
Example #42
0
def scheduler_list_backups(cluster):
  """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
  resp = make_admin_client(cluster).list_backups()
  check_and_log_response(resp)
  backups = resp.result.listBackupsResult.backups
  print('%s available backups:' % len(backups))
  for backup in backups:
    print(backup)
Example #43
0
  def _complete_maintenance(self, drained_hosts):
    """End the maintenance status for a given set of hosts.

    :param drained_hosts: Hosts that are drained and finished being operated upon
    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
    """
    check_and_log_response(self._client.end_maintenance(drained_hosts))
    resp = self._client.maintenance_status(drained_hosts)
    for host_status in resp.result.maintenanceStatusResult.statuses:
      if host_status.mode != MaintenanceMode.NONE:
        log.warning('%s is DRAINING or in DRAINED' % host_status.host)
Example #44
0
    def _complete_maintenance(self, drained_hosts):
        """End the maintenance status for a given set of hosts.

    :param drained_hosts: Hosts that are drained and finished being operated upon
    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
    """
        check_and_log_response(self._client.end_maintenance(drained_hosts))
        resp = self._client.maintenance_status(drained_hosts)
        for host_status in resp.result.maintenanceStatusResult.statuses:
            if host_status.mode != MaintenanceMode.NONE:
                log.warning('%s is DRAINING or in DRAINED' % host_status.host)
Example #45
0
def scheduler_list_backups(cluster):
  """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).list_backups()
  check_and_log_response(resp)
  backups = resp.result.listBackupsResult.backups
  print('%s available backups:' % len(backups))
  for backup in backups:
    print(backup)
Example #46
0
def killall(args, options):
  """usage: killall cluster/role/env/job
  Kills all tasks in a running job, blocking until all specified tasks have been terminated.
  """

  job_key = AuroraJobKey.from_path(args[0])
  config_file = args[1] if len(args) > 1 else None  # the config for hooks
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  api = make_client(job_key.cluster)
  resp = api.kill_job(job_key, None, config=config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
Example #47
0
def scheduler_list_backups(cluster):
    """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
    options = app.get_options()
    resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).list_backups()
    check_and_log_response(resp)
    backups = resp.result.listBackupsResult.backups
    print("%s available backups:" % len(backups))
    for backup in backups:
        print(backup)
Example #48
0
def cancel_update(args, options):
  """usage: cancel_update cluster/role/env/job

  Unlocks a job for updates.
  A job may be locked if a client's update session terminated abnormally,
  or if another user is actively updating the job.  This command should only
  be used when the user is confident that they are not conflicting with another user.
  """
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  resp = api.cancel_update(job_key, config=config)
  check_and_log_response(resp)
Example #49
0
  def check_status(self, hostnames):
    """Query the scheduler to determine the maintenance status for a list of hostnames

    :param hostnames: Hosts to query for
    :type hostnames: list of strings
    :rtype: list of 2-tuples, hostname and MaintenanceMode
    """
    resp = self._client.maintenance_status(Hosts(set(hostnames)))
    check_and_log_response(resp)
    statuses = []
    for host_status in resp.result.maintenanceStatusResult.statuses:
      statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
    return statuses
Example #50
0
  def check_status(self, hostnames):
    """Query the scheduler to determine the maintenance status for a list of hostnames

    :param hostnames: Hosts to query for
    :type hostnames: list of strings
    :rtype: list of 2-tuples, hostname and MaintenanceMode
    """
    resp = self._client.maintenance_status(Hosts(set(hostnames)))
    check_and_log_response(resp)
    statuses = []
    for host_status in resp.result.maintenanceStatusResult.statuses:
      statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
    return statuses
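So with two hosts in different stages, the helper above would return something like (hostnames hypothetical):

[('host-1.example.com', 'DRAINED'), ('host-2.example.com', 'DRAINING')]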
Example #51
0
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  resp = make_admin_client(cluster).get_locks()
  check_and_log_response(resp)

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_lock(lock):
    return pp.pformat(vars(lock))

  print_results([',\n'.join(pretty_print_lock(t) for t in resp.result.getLocksResult.locks)])
Example #52
0
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).get_locks()
  check_and_log_response(resp)

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_lock(lock):
    return pp.pformat(vars(lock))

  print_results([',\n'.join(pretty_print_lock(t) for t in resp.result.getLocksResult.locks)])
Example #53
0
def scheduler_print_recovery_tasks(cluster):
    """usage: scheduler_print_recovery_tasks cluster

  Prints all active tasks in a staged recovery.
  """
    resp = make_admin_client_with_options(cluster).query_recovery(
        TaskQuery(statuses=ACTIVE_STATES))
    check_and_log_response(resp)
    log.info('Role\tJob\tShard\tStatus\tTask ID')
    for task in resp.result.queryRecoveryResult.tasks:
        assigned = task.assignedTask
        conf = assigned.task
        log.info('\t'.join(
            (conf.job.role, conf.job.name, str(assigned.instanceId),
             ScheduleStatus._VALUES_TO_NAMES[task.status], assigned.taskId)))
Example #54
0
  def test_check_and_log_response(self, mock_sys_exit, mock_log):
    resp = Response(responseCode=ResponseCode.LOCK_ERROR)
    out = base.check_and_log_response(resp)
    self.assertIsNone(out)
    mock_sys_exit.assert_called_once_with(1)
    mock_log.assert_any_call(
        'Response from scheduler: LOCK_ERROR (message: )')
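The mock_sys_exit and mock_log arguments imply the test method is wrapped in mock.patch decorators; the exact patch targets are an assumption here, but the shape would be something like:

# @mock.patch('apache.aurora.client.base.log.error')   # hypothetical target
# @mock.patch('apache.aurora.client.base.sys.exit')    # hypothetical target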
Example #55
0
  def start_maintenance(self, hostnames):
    """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.

    This is part of two-phase draining: tasks will still be running on these hosts until
    drain_hosts is called upon them.

    :param hostnames: List of hosts to set for initial maintenance
    :type hostnames: list of strings
    :rtype: list of hostnames with the maintenance mode set
    """
    resp = self._client.start_maintenance(Hosts(set(hostnames)))
    check_and_log_response(resp)
    result = [host_status.host for host_status in resp.result.startMaintenanceResult.statuses]
    if len(result) != len(hostnames):
      log.warning('Skipping maintenance for unknown hosts: %s' % (set(hostnames) - set(result)))

    return result
Example #56
0
def really_create(job_spec, config_file, options):
    try:
        config = get_job_config(job_spec, config_file, options)
    except ValueError as v:
        print("Error: %s" % v)
        sys.exit(1)
    api = make_client(config.cluster())
    resp = api.create_job(config)
    check_and_log_response(resp)
    handle_open(api.scheduler_proxy.scheduler_client().url, config.role(),
                config.environment(), config.name())
    if options.wait_until == 'RUNNING':
        JobMonitor(api.scheduler_proxy,
                   config.job_key()).wait_until(JobMonitor.running_or_finished)
    elif options.wait_until == 'FINISHED':
        JobMonitor(api.scheduler_proxy,
                   config.job_key()).wait_until(JobMonitor.terminal)
Example #57
0
def scheduler_print_recovery_tasks(cluster):
  """usage: scheduler_print_recovery_tasks cluster

  Prints all active tasks in a staged recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).query_recovery(
      TaskQuery(statuses=ACTIVE_STATES))
  check_and_log_response(resp)
  log.info('Role\tJob\tShard\tStatus\tTask ID')
  for task in resp.result.queryRecoveryResult.tasks:
    assigned = task.assignedTask
    conf = assigned.task
    log.info('\t'.join((conf.owner.role,
                        conf.jobName,
                        str(assigned.instanceId),
                        ScheduleStatus._VALUES_TO_NAMES[task.status],
                        assigned.taskId)))