def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str)
  disk = parse_data(disk_str)

  client = make_admin_client(cluster)
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  log.info('Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, quota.numCpus, quota.ramMb, quota.diskMb))

  new_cpu = float(cpu + quota.numCpus)
  new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
  new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))

  log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
def _drain_hosts(self, drainable_hosts):
  """Drains tasks from the specified hosts.

  This will move active tasks on these hosts to the DRAINING state, causing them to be
  rescheduled elsewhere.

  :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
  :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
  :rtype: set of host names that failed to drain
  """
  check_and_log_response(self._client.drain_hosts(drainable_hosts))
  drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

  total_wait = self.STATUS_POLL_INTERVAL
  not_drained_hostnames = set(drainable_hostnames)
  while not self._wait_event.is_set() and not_drained_hostnames:
    self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

    not_drained_hostnames = self.check_if_drained(drainable_hostnames)

    total_wait += self.STATUS_POLL_INTERVAL
    if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
      log.warning('Failed to move all hosts into DRAINED within %s' % self.MAX_STATUS_WAIT)
      break

  return not_drained_hostnames
def _drain_hosts(self, drainable_hosts):
  """Drains tasks from the specified hosts.

  This will move active tasks on these hosts to the DRAINING state, causing them to be
  rescheduled elsewhere.

  :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
  :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
  """
  check_and_log_response(self._client.drain_hosts(drainable_hosts))
  not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
  while not_ready_hostnames:
    resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
    if not resp.result.maintenanceStatusResult.statuses:
      not_ready_hostnames = None
    for host_status in resp.result.maintenanceStatusResult.statuses:
      if host_status.mode != MaintenanceMode.DRAINED:
        log.warning('%s is currently in status %s' %
                    (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
      else:
        not_ready_hostnames.remove(host_status.host)
def scheduler_backup_now(cluster):
  """usage: scheduler_backup_now cluster

  Immediately initiates a full storage backup.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).perform_backup())
def scheduler_commit_recovery(cluster):
  """usage: scheduler_commit_recovery cluster

  Commits a staged recovery.
  """
  check_and_log_response(make_admin_client_with_options(cluster).commit_recovery())
def status(args, options):
  """usage: status cluster/role/env/job

  Fetches and prints information about the active tasks in a job.
  """
  def is_active(task):
    return task.status in ACTIVE_STATES

  def print_task(scheduled_task):
    assigned_task = scheduled_task.assignedTask
    taskInfo = assigned_task.task
    taskString = ''
    if taskInfo:
      taskString += '''cpus: %s, ram: %s MB, disk: %s MB''' % (taskInfo.numCpus,
                                                               taskInfo.ramMb,
                                                               taskInfo.diskMb)
    if assigned_task.assignedPorts:
      taskString += '\n\tports: %s' % assigned_task.assignedPorts
    taskString += '\n\tfailure count: %s (max %s)' % (scheduled_task.failureCount,
                                                      taskInfo.maxTaskFailures)
    taskString += '\n\tevents:'
    for event in scheduled_task.taskEvents:
      taskString += '\n\t\t %s %s: %s' % (datetime.fromtimestamp(event.timestamp / 1000),
                                          ScheduleStatus._VALUES_TO_NAMES[event.status],
                                          event.message)
    taskString += '\n\tmetadata:'
    if assigned_task.task.metadata is not None:
      for md in assigned_task.task.metadata:
        taskString += ('\n\t\t%s: %s' % (md.key, md.value))
    return taskString

  def print_tasks(tasks):
    for task in tasks:
      taskString = print_task(task)
      log.info('role: %s, env: %s, name: %s, shard: %s, status: %s on %s\n%s' %
               (task.assignedTask.task.owner.role,
                task.assignedTask.task.environment,
                task.assignedTask.task.jobName,
                task.assignedTask.instanceId,
                ScheduleStatus._VALUES_TO_NAMES[task.status],
                task.assignedTask.slaveHost,
                taskString))

  api, job_key, _ = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  v1_deprecation_warning("status", ["job", "status", args[0]])
  resp = api.check_status(job_key)
  check_and_log_response(resp)

  tasks = resp.result.scheduleStatusResult.tasks
  if tasks:
    active_tasks = filter(is_active, tasks)
    log.info('Active Tasks (%s)' % len(active_tasks))
    print_tasks(active_tasks)
    inactive_tasks = filter(lambda x: not is_active(x), tasks)
    log.info('Inactive Tasks (%s)' % len(inactive_tasks))
    print_tasks(inactive_tasks)
  else:
    log.info('No tasks found.')
def really_killall(args, options):
  """Helper for testing purposes: make it easier to mock out the actual kill process,
  while testing hooks in the command dispatch process.
  """
  maybe_disable_hooks(options)
  job_key = AuroraJobKey.from_path(args[0])
  config_file = args[1] if len(args) > 1 else None  # the config for hooks
  new_cmd = ["job", "killall", args[0]]
  if config_file is not None:
    new_cmd.append("--config=%s" % config_file)
  if options.open_browser:
    new_cmd.append("--open-browser")
  if options.batch_size is not None:
    new_cmd.append("--batch-size=%s" % options.batch_size)
  if options.max_total_failures is not None:
    new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
  v1_deprecation_warning("killall", new_cmd)

  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  api = make_client(job_key.cluster)
  if options.batch_size is not None:
    kill_in_batches(api, job_key, None, options.batch_size, options.max_failures_option)
  else:
    resp = api.kill_job(job_key, None, config=config)
    check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
  wait_kill_tasks(api.scheduler_proxy, job_key)
def _drain_hosts(self, drainable_hosts, clock=time):
  """Drains tasks from the specified hosts.

  This will move active tasks on these hosts to the DRAINING state, causing them to be
  rescheduled elsewhere.

  :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
  :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
  :param clock: time module for testing
  :type clock: time
  """
  check_and_log_response(self._client.drain_hosts(drainable_hosts))
  not_ready_hostnames = [hostname for hostname in drainable_hosts.hostNames]
  while not_ready_hostnames:
    log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
    clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
    resp = self._client.maintenance_status(Hosts(set(not_ready_hostnames)))
    if not resp.result.maintenanceStatusResult.statuses:
      not_ready_hostnames = None
    for host_status in resp.result.maintenanceStatusResult.statuses:
      if host_status.mode != MaintenanceMode.DRAINED:
        log.warning('%s is currently in status %s' %
                    (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
      else:
        not_ready_hostnames.remove(host_status.host)
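# Because _drain_hosts above accepts the time module as an injectable `clock`
# parameter, a test can substitute a stub that records sleeps instead of
# actually blocking. A minimal sketch under that assumption; FakeClock is our
# own name, not part of Aurora.
class FakeClock(object):
  """Stand-in for the time module: records requested sleeps, never blocks."""

  def __init__(self):
    self.sleeps = []

  def sleep(self, seconds):
    self.sleeps.append(seconds)

# Hypothetical usage inside a test, with a suitably mocked maintenance client:
#   clock = FakeClock()
#   maintenance._drain_hosts(hosts, clock=clock)
#   assert clock.sleeps  # the poll loop slept at least once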
def scheduler_stage_recovery(cluster, backup_id):
  """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
  check_and_log_response(make_admin_client_with_options(cluster).stage_recovery(backup_id))
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str).as_(Data.MB)
  disk = parse_data(disk_str).as_(Data.MB)

  client = make_admin_client_with_options(cluster)
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  resource_details = ResourceManager.resource_details_from_quota(quota)
  log.info('Current quota for %s:\n\t%s' % (
      role,
      '\n\t'.join('%s\t%s%s' % (r.resource_type.display_name, r.value,
                                r.resource_type.display_unit)
                  for r in resource_details)))

  new_cpu = ResourceType.CPUS.value_type(
      cpu + ResourceManager.quantity_of(resource_details, ResourceType.CPUS))
  new_ram = ResourceType.RAM_MB.value_type(
      ram + ResourceManager.quantity_of(resource_details, ResourceType.RAM_MB))
  new_disk = ResourceType.DISK_MB.value_type(
      disk + ResourceManager.quantity_of(resource_details, ResourceType.DISK_MB))

  log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
def really_start_cron(args, options):
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  resp = api.start_cronjob(job_key, config=config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str)
  disk = parse_data(disk_str)

  options = app.get_options()
  client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  log.info("Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB" %
           (role, quota.numCpus, quota.ramMb, quota.diskMb))

  new_cpu = float(cpu + quota.numCpus)
  new_ram = int((ram + Amount(quota.ramMb, Data.MB)).as_(Data.MB))
  new_disk = int((disk + Amount(quota.diskMb, Data.MB)).as_(Data.MB))

  log.info("Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB" %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
def scheduler_unload_recovery(cluster):
  """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
  check_and_log_response(make_admin_client_with_options(cluster).unload_recovery())
def scheduler_unload_recovery(cluster):
  """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).unload_recovery())
def scheduler_snapshot(cluster):
  """usage: scheduler_snapshot cluster

  Request that the scheduler perform a storage snapshot and block until complete.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity).snapshot())
def scheduler_stage_recovery(cluster, backup_id):
  """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
  options = app.get_options()
  check_and_log_response(
      AuroraClientAPI(CLUSTERS[cluster], options.verbosity).stage_recovery(backup_id))
def status(args, options):
  """usage: status cluster/role/env/job

  Fetches and prints information about the active tasks in a job.
  """
  def is_active(task):
    return task.status in ACTIVE_STATES

  def print_task(scheduled_task):
    assigned_task = scheduled_task.assignedTask
    taskInfo = assigned_task.task
    taskString = ''
    if taskInfo:
      taskString += '''cpus: %s, ram: %s MB, disk: %s MB''' % (taskInfo.numCpus,
                                                               taskInfo.ramMb,
                                                               taskInfo.diskMb)
    if assigned_task.assignedPorts:
      taskString += '\n\tports: %s' % assigned_task.assignedPorts
    taskString += '\n\tfailure count: %s (max %s)' % (scheduled_task.failureCount,
                                                      taskInfo.maxTaskFailures)
    taskString += '\n\tevents:'
    for event in scheduled_task.taskEvents:
      taskString += '\n\t\t %s %s: %s' % (datetime.fromtimestamp(event.timestamp / 1000),
                                          ScheduleStatus._VALUES_TO_NAMES[event.status],
                                          event.message)
    taskString += '\n\tmetadata:'
    if assigned_task.task.metadata is not None:
      for md in assigned_task.task.metadata:
        taskString += ('\n\t\t%s: %s' % (md.key, md.value))
    return taskString

  def print_tasks(tasks):
    for task in tasks:
      taskString = print_task(task)
      log.info('role: %s, env: %s, name: %s, shard: %s, status: %s on %s\n%s' %
               (task.assignedTask.task.owner.role,
                task.assignedTask.task.environment,
                task.assignedTask.task.jobName,
                task.assignedTask.instanceId,
                ScheduleStatus._VALUES_TO_NAMES[task.status],
                task.assignedTask.slaveHost,
                taskString))

  api, job_key, _ = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  resp = api.check_status(job_key)
  check_and_log_response(resp)

  tasks = resp.result.scheduleStatusResult.tasks
  if tasks:
    active_tasks = filter(is_active, tasks)
    log.info('Active Tasks (%s)' % len(active_tasks))
    print_tasks(active_tasks)
    inactive_tasks = filter(lambda x: not is_active(x), tasks)
    log.info('Inactive Tasks (%s)' % len(inactive_tasks))
    print_tasks(inactive_tasks)
  else:
    log.info('No tasks found.')
def _drain_hosts(self, drainable_hosts):
  """Drains tasks from the specified hosts.

  This will move active tasks on these hosts to the DRAINING state, causing them to be
  rescheduled elsewhere.

  :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
  :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
  :rtype: set of host names that failed to drain
  """
  check_and_log_response(self._client.drain_hosts(drainable_hosts))
  drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

  total_wait = self.STATUS_POLL_INTERVAL
  not_drained_hostnames = set(drainable_hostnames)
  while not self._wait_event.is_set() and not_drained_hostnames:
    log.info("Waiting for hosts to be in DRAINED: %s" % not_drained_hostnames)
    self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

    statuses = self.check_status(list(not_drained_hostnames))
    not_drained_hostnames = set(h[0] for h in statuses if h[1] != "DRAINED")

    total_wait += self.STATUS_POLL_INTERVAL
    if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
      log.warning("Failed to move all hosts into DRAINED within %s:\n%s" %
                  (self.MAX_STATUS_WAIT,
                   "\n".join("\tHost:%s\tStatus:%s" % h
                             for h in sorted(statuses) if h[1] != "DRAINED")))
      break

  return not_drained_hostnames
def check_status(self, hosts):
  resp = self._client.maintenance_status(Hosts(set(hosts)))
  check_and_log_response(resp)
  statuses = []
  for host_status in resp.result.maintenanceStatusResult.statuses:
    statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
  return statuses
def scheduler_backup_now(cluster):
  """usage: scheduler_backup_now cluster

  Immediately initiates a full storage backup.
  """
  check_and_log_response(make_admin_client_with_options(cluster).perform_backup())
def _drain_hosts(self, drainable_hosts):
  """Drains tasks from the specified hosts.

  This will move active tasks on these hosts to the DRAINING state, causing them to be
  rescheduled elsewhere.

  :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
  :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
  :rtype: set of host names that failed to drain
  """
  check_and_log_response(self._client.drain_hosts(drainable_hosts))
  drainable_hostnames = [hostname for hostname in drainable_hosts.hostNames]

  total_wait = self.STATUS_POLL_INTERVAL
  not_drained_hostnames = set(drainable_hostnames)
  while not self._wait_event.is_set() and not_drained_hostnames:
    log.info('Waiting for hosts to be in DRAINED: %s' % not_drained_hostnames)
    self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

    statuses = self.check_status(list(not_drained_hostnames))
    not_drained_hostnames = set(h[0] for h in statuses if h[1] != 'DRAINED')

    total_wait += self.STATUS_POLL_INTERVAL
    if not_drained_hostnames and total_wait > self.MAX_STATUS_WAIT:
      log.warning('Failed to move all hosts into DRAINED within %s:\n%s' %
                  (self.MAX_STATUS_WAIT,
                   '\n'.join("\tHost:%s\tStatus:%s" % h
                             for h in sorted(statuses) if h[1] != 'DRAINED')))
      break

  return not_drained_hostnames
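# The poll interval and overall deadline used above are Amount quantities from
# twitter.common.quantity. Their actual values are not shown in this section,
# so the following is a sketch under assumed values, only to make the loop
# arithmetic concrete.
from twitter.common.quantity import Amount, Time

STATUS_POLL_INTERVAL = Amount(5, Time.SECONDS)  # assumed value
MAX_STATUS_WAIT = Amount(5, Time.MINUTES)       # assumed value

# total_wait grows by one poll interval per iteration, so the loop gives up
# after roughly MAX_STATUS_WAIT / STATUS_POLL_INTERVAL polls (about 60 here).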
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
  """usage: increase_quota cluster role cpu ram[unit] disk[unit]

  Increases the amount of production quota allocated to a user.
  """
  cpu = float(cpu_str)
  ram = parse_data(ram_str).as_(Data.MB)
  disk = parse_data(disk_str).as_(Data.MB)

  client = make_admin_client_with_options(cluster)
  resp = client.get_quota(role)
  quota = resp.result.getQuotaResult.quota
  resource_details = ResourceManager.resource_details_from_quota(quota)
  log.info('Current quota for %s:\n\t%s' % (
      role,
      '\n\t'.join('%s\t%s%s' % (r.resource_type.display_name, r.value,
                                r.resource_type.display_unit)
                  for r in resource_details)))

  new_cpu = ResourceType.CPUS.value_type(
      cpu + ResourceManager.quantity_of(resource_details, ResourceType.CPUS))
  new_ram = ResourceType.RAM_MB.value_type(
      ram + ResourceManager.quantity_of(resource_details, ResourceType.RAM_MB))
  new_disk = ResourceType.DISK_MB.value_type(
      disk + ResourceManager.quantity_of(resource_details, ResourceType.DISK_MB))

  log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
           (role, new_cpu, new_ram, new_disk))

  resp = client.set_quota(role, new_cpu, new_ram, new_disk)
  check_and_log_response(resp)
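# parse_data (an Aurora helper) turns strings like '512mb' or '2gb' into
# Amount objects from twitter.common.quantity, which the quota commands above
# then normalize to megabytes. A toy stand-in to illustrate the conversion
# being relied upon; this is our sketch, not Aurora's implementation.
from twitter.common.quantity import Amount, Data

def parse_data_sketch(value):
  """Toy parser accepting only '<int>mb' / '<int>gb' forms."""
  units = {'mb': Data.MB, 'gb': Data.GB}
  number, unit = value[:-2], value[-2:].lower()
  return Amount(int(number), units[unit])

ram_mb = parse_data_sketch('2gb').as_(Data.MB)  # 2048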
def list_jobs(cluster_and_role):
  """usage: list_jobs [--show-cron] cluster/role/env/job

  Shows all jobs that match the job-spec known by the scheduler.
  If --show-cron is specified, then also shows the registered cron schedule.
  """
  def show_job_simple(job):
    if options.show_cron_schedule:
      print(('{0}/{1.key.role}/{1.key.environment}/{1.key.name}' +
             '\t\'{1.cronSchedule}\'\t{1.cronCollisionPolicy}').format(cluster, job))
    else:
      print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format(cluster, job))

  def show_job_pretty(job):
    print("Job %s/%s/%s/%s:" % (cluster, job.key.role, job.key.environment, job.key.name))
    print('\tcron schedule: %s' % job.cronSchedule)
    print('\tcron policy: %s' % job.cronCollisionPolicy)

  options = app.get_options()
  if options.show_cron_schedule and options.pretty:
    print_fn = show_job_pretty
  else:
    print_fn = show_job_simple

  # Take the cluster_and_role parameter, and split it into its two components.
  if cluster_and_role.count('/') != 1:
    die('list_jobs parameter must be in cluster/role format')
  cluster, role = cluster_and_role.split('/')
  api = make_client(cluster)
  resp = api.get_jobs(role)
  check_and_log_response(resp)
  for job in resp.result.getJobsResult.configs:
    print_fn(job)
def restart(args, options):
  """usage: restart cluster/role/env/job
               [--shards=SHARDS]
               [--batch_size=INT]
               [--updater_health_check_interval_seconds=SECONDS]
               [--max_per_shard_failures=INT]
               [--max_total_failures=INT]
               [--restart_threshold=INT]
               [--watch_secs=SECONDS]

  Performs a rolling restart of shards within a job.

  Restarts are fully controlled client-side, so aborting halts the restart.
  """
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  updater_config = UpdaterConfig(
      options.batch_size,
      options.restart_threshold,
      options.watch_secs,
      options.max_per_shard_failures,
      options.max_total_failures)
  resp = api.restart(job_key, options.shards, updater_config,
                     options.health_check_interval_seconds, config=config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
def really_kill(args, options):
  if options.shards is None:
    print('Shards option is required for kill; use killall to kill all shards', file=sys.stderr)
    exit(1)
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  instance_key = str(job_key)
  if options.shards is not None:
    instance_key = "%s/%s" % (instance_key, ",".join(map(str, options.shards)))
  new_cmd = ["job", "kill", instance_key]
  if config_file is not None:
    new_cmd.append("--config=%s" % config_file)
  if options.open_browser:
    new_cmd.append("--open-browser")
  if options.batch_size is not None:
    new_cmd.append("--batch-size=%s" % options.batch_size)
  if options.max_total_failures is not None:
    new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
  v1_deprecation_warning("kill", new_cmd)

  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  if options.batch_size is not None:
    kill_in_batches(api, job_key, options.shards, options.batch_size, options.max_failures_option)
  else:
    resp = api.kill_job(job_key, options.shards, config=config)
    check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
  wait_kill_tasks(api.scheduler_proxy, job_key, options.shards)
def really_update(job_spec, config_file, options):
  def warn_if_dangerous_change(api, job_spec, config):
    # Get the current job status, so that we can check if there's anything
    # dangerous about this update.
    resp = api.query_no_configs(api.build_query(config.role(), config.name(),
                                                statuses=ACTIVE_STATES,
                                                env=config.environment()))
    if resp.responseCode != ResponseCode.OK:
      die('Could not get job status from server for comparison: %s' % resp.messageDEPRECATED)
    remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]
    resp = api.populate_job_config(config)
    if resp.responseCode != ResponseCode.OK:
      die('Server could not populate job config for comparison: %s' % resp.messageDEPRECATED)
    local_task_count = len(resp.result.populateJobResult.populated)
    remote_task_count = len(remote_tasks)
    # Warn when the update grows the job by 4x, shrinks it by 4x, or removes
    # it entirely. (The flattened source read "local_task_count <= 4 *
    # remote_task_count", which is always true together with the first test;
    # the shrink check below is the evident intent.)
    if (local_task_count >= 4 * remote_task_count or
        local_task_count <= remote_task_count / 4 or
        local_task_count == 0):
      print('Warning: this update is a large change. Press ^c within 5 seconds to abort')
      time.sleep(5)

  maybe_disable_hooks(options)
  config = get_job_config(job_spec, config_file, options)
  api = make_client(config.cluster())
  if not options.force:
    warn_if_dangerous_change(api, job_spec, config)
  resp = api.update_job(config, options.health_check_interval_seconds, options.shards)
  check_and_log_response(resp)
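# Illustrative arithmetic for the "large change" thresholds above (the task
# counts are ours, not from the source):
assert 10 >= 4 * 2                               # 2 -> 10 tasks: 4x growth warns
assert 4 <= 20 / 4                               # 20 -> 4 tasks: 4x shrink warns
assert not (6 >= 4 * 5 or 6 <= 5 / 4 or 6 == 0)  # 5 -> 6 tasks: no warning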
def really_kill(args, options):
  if options.shards is None:
    print('Shards option is required for kill; use killall to kill all shards', file=sys.stderr)
    exit(1)
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  instance_key = str(job_key)
  if options.shards is not None:
    instance_key = "%s/%s" % (instance_key, ",".join(map(str, options.shards)))
  new_cmd = ["job", "kill", instance_key]
  if config_file is not None:
    new_cmd.append("--config=%s" % config_file)
  if options.open_browser:
    new_cmd.append("--open-browser")
  if options.batch_size is not None:
    new_cmd.append("--batch-size=%s" % options.batch_size)
  if options.max_total_failures is not None:
    new_cmd.append("--max-total-failures=%s" % options.max_total_failures)
  v1_deprecation_warning("kill", new_cmd)

  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  if options.batch_size is not None:
    kill_in_batches(api, job_key, options.shards, options.batch_size, options.max_failures_option)
  else:
    resp = api.kill_job(job_key, options.shards, config=config)
    check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
  wait_kill_tasks(api.scheduler_proxy, job_key, options.shards)
def scheduler_delete_recovery_tasks(cluster, task_ids):
  """usage: scheduler_delete_recovery_tasks cluster task_ids

  Deletes a comma-separated list of task IDs from a staged recovery.
  """
  ids = set(task_ids.split(','))
  check_and_log_response(make_admin_client(cluster).delete_recovery_tasks(TaskQuery(taskIds=ids)))
def _complete_maintenance(self, drained_hosts):
  """End the maintenance status for a given set of hosts."""
  check_and_log_response(self._client.end_maintenance(drained_hosts))
  resp = self._client.maintenance_status(drained_hosts)
  for host_status in resp.result.maintenanceStatusResult.statuses:
    if host_status.mode != MaintenanceMode.NONE:
      log.warning('%s is DRAINING or in DRAINED' % host_status.host)
def scheduler_stage_recovery(cluster, backup_id):
  """usage: scheduler_stage_recovery cluster backup_id

  Stages a backup for recovery.
  """
  options = app.get_options()
  check_and_log_response(
      AuroraClientAPI(CLUSTERS[cluster], options.verbosity).stage_recovery(backup_id))
def scheduler_unload_recovery(cluster):
  """usage: scheduler_unload_recovery cluster

  Unloads a staged recovery.
  """
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
                         .unload_recovery())
def really_cancel_update(args, options):
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  new_cmd = ["job", "cancel-update", str(job_key)]
  v1_deprecation_warning("cancel_update", new_cmd)

  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  resp = api.cancel_update(job_key, config=config)
  check_and_log_response(resp)
def ssh(args, options):
  """usage: ssh cluster/role/env/job shard [args...]

  Initiate an SSH session on the machine that a shard is running on.
  """
  if not args:
    die('Job path is required')
  job_path = args.pop(0)
  try:
    cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
  except AuroraJobKey.Error as e:
    die('Invalid job path "%s": %s' % (job_path, e))
  if not args:
    die('Shard is required')
  try:
    shard = int(args.pop(0))
  except ValueError:
    die('Shard must be an integer')

  newcmd = ["task", "ssh", "%s/%s" % (job_path, shard)]
  if len(options.tunnels) > 0:
    newcmd.append("--tunnels=%s" % options.tunnels)
  if options.ssh_user is not None:
    newcmd.append("--ssh-user=%s" % options.ssh_user)
  if options.executor_sandbox:
    newcmd.append("--executor-sandbox")
  if len(args) > 0:
    newcmd.append("--command=\"%s\"" % " ".join(args))
  v1_deprecation_warning("ssh", newcmd)

  api = make_client(cluster_name)
  resp = api.query(api.build_query(role, name, set([int(shard)]), env=env))
  check_and_log_response(resp)

  first_task = resp.result.scheduleStatusResult.tasks[0]
  remote_cmd = 'bash' if not args else ' '.join(args)
  command = DistributedCommandRunner.substitute(
      remote_cmd, first_task, api.cluster, executor_sandbox=options.executor_sandbox)

  ssh_command = ['ssh', '-t']

  role = first_task.assignedTask.task.owner.role
  slave_host = first_task.assignedTask.slaveHost

  for tunnel in options.tunnels:
    try:
      port, name = tunnel.split(':')
      port = int(port)
    except ValueError:
      die('Could not parse tunnel: %s. Must be of form PORT:NAME' % tunnel)
    if name not in first_task.assignedTask.assignedPorts:
      die('Task %s has no port named %s' % (first_task.assignedTask.taskId, name))
    ssh_command += [
        '-L', '%d:%s:%d' % (port, slave_host, first_task.assignedTask.assignedPorts[name])]

  ssh_command += ['%s@%s' % (options.ssh_user or role, slave_host), command]
  return subprocess.call(ssh_command)
def scheduler_delete_recovery_tasks(cluster, task_ids):
  """usage: scheduler_delete_recovery_tasks cluster task_ids

  Deletes a comma-separated list of task IDs from a staged recovery.
  """
  ids = set(task_ids.split(','))
  options = app.get_options()
  check_and_log_response(AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
                         .delete_recovery_tasks(TaskQuery(taskIds=ids)))
def start_maintenance(self, hostnames):
  """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.

  This is part of two-phase draining: tasks will still be running on these hosts
  until drain_hosts is called upon them.

  :param hostnames: List of hosts to set for initial maintenance
  :type hostnames: list of strings
  """
  check_and_log_response(self._client.start_maintenance(Hosts(set(hostnames))))
def scheduler_list_backups(cluster):
  """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
  resp = make_admin_client(cluster).list_backups()
  check_and_log_response(resp)
  backups = resp.result.listBackupsResult.backups
  print('%s available backups:' % len(backups))
  for backup in backups:
    print(backup)
def _complete_maintenance(self, drained_hosts):
  """End the maintenance status for a given set of hosts.

  :param drained_hosts: Hosts that are drained and finished being operated upon
  :type drained_hosts: gen.apache.aurora.ttypes.Hosts
  """
  check_and_log_response(self._client.end_maintenance(drained_hosts))
  resp = self._client.maintenance_status(drained_hosts)
  for host_status in resp.result.maintenanceStatusResult.statuses:
    if host_status.mode != MaintenanceMode.NONE:
      log.warning('%s is DRAINING or in DRAINED' % host_status.host)
def scheduler_list_backups(cluster):
  """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).list_backups()
  check_and_log_response(resp)
  backups = resp.result.listBackupsResult.backups
  print('%s available backups:' % len(backups))
  for backup in backups:
    print(backup)
def killall(args, options):
  """usage: killall cluster/role/env/job

  Kills all tasks in a running job, blocking until all specified tasks have been terminated.
  """
  job_key = AuroraJobKey.from_path(args[0])
  config_file = args[1] if len(args) > 1 else None  # the config for hooks
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  api = make_client(job_key.cluster)
  resp = api.kill_job(job_key, None, config=config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, job_key.role, job_key.env, job_key.name)
def scheduler_list_backups(cluster):
  """usage: scheduler_list_backups cluster

  Lists backups available for recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).list_backups()
  check_and_log_response(resp)
  backups = resp.result.listBackupsResult.backups
  print("%s available backups:" % len(backups))
  for backup in backups:
    print(backup)
def cancel_update(args, options):
  """usage: cancel_update cluster/role/env/job

  Unlocks a job for updates.
  A job may be locked if a client's update session terminated abnormally,
  or if another user is actively updating the job. This command should only
  be used when the user is confident that they are not conflicting with another user.
  """
  api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
      args, options, make_client_factory())
  config = get_job_config(job_key.to_path(), config_file, options) if config_file else None
  resp = api.cancel_update(job_key, config=config)
  check_and_log_response(resp)
def check_status(self, hostnames):
  """Query the scheduler to determine the maintenance status for a list of hostnames.

  :param hostnames: Hosts to query for
  :type hostnames: list of strings
  :rtype: list of 2-tuples, hostname and MaintenanceMode
  """
  resp = self._client.maintenance_status(Hosts(set(hostnames)))
  check_and_log_response(resp)
  statuses = []
  for host_status in resp.result.maintenanceStatusResult.statuses:
    statuses.append((host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
  return statuses
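# A quick usage sketch for check_status, mirroring how _drain_hosts consumes
# it; the maintainer instance and hostnames here are hypothetical.
statuses = maintainer.check_status(['west-12.example.com', 'west-13.example.com'])
still_draining = set(host for host, mode in statuses if mode != 'DRAINED')
for host, mode in statuses:
  print('%s\t%s' % (host, mode))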
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  resp = make_admin_client(cluster).get_locks()
  check_and_log_response(resp)

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_lock(lock):
    return pp.pformat(vars(lock))

  print_results([',\n'.join(pretty_print_lock(t) for t in resp.result.getLocksResult.locks)])
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).get_locks()
  check_and_log_response(resp)

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_lock(lock):
    return pp.pformat(vars(lock))

  print_results([',\n'.join(pretty_print_lock(t) for t in resp.result.getLocksResult.locks)])
def scheduler_print_recovery_tasks(cluster):
  """usage: scheduler_print_recovery_tasks cluster

  Prints all active tasks in a staged recovery.
  """
  resp = make_admin_client_with_options(cluster).query_recovery(
      TaskQuery(statuses=ACTIVE_STATES))
  check_and_log_response(resp)
  log.info('Role\tJob\tShard\tStatus\tTask ID')
  for task in resp.result.queryRecoveryResult.tasks:
    assigned = task.assignedTask
    conf = assigned.task
    log.info('\t'.join((conf.job.role,
                        conf.job.name,
                        str(assigned.instanceId),
                        ScheduleStatus._VALUES_TO_NAMES[task.status],
                        assigned.taskId)))
def test_check_and_log_response(self, mock_sys_exit, mock_log):
  resp = Response(responseCode=ResponseCode.LOCK_ERROR)
  out = base.check_and_log_response(resp)
  self.assertIsNone(out)
  mock_sys_exit.assert_called_once_with(1)
  mock_log.assert_any_call('Response from scheduler: LOCK_ERROR (message: )')
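# The mock_sys_exit and mock_log parameters imply the test above is wrapped in
# mock.patch decorators, which apply bottom-up (the bottom patch binds to the
# first mock argument). A sketch of what that setup might look like; the patch
# target paths are assumptions, not taken from the source.
import mock

@mock.patch('apache.aurora.client.base.log.info')  # assumed logger location
@mock.patch('apache.aurora.client.base.sys.exit')  # assumed module under test
def test_check_and_log_response(self, mock_sys_exit, mock_log):
  ...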
def start_maintenance(self, hostnames):
  """Put a list of hostnames into maintenance mode, to de-prioritize scheduling.

  This is part of two-phase draining: tasks will still be running on these hosts
  until drain_hosts is called upon them.

  :param hostnames: List of hosts to set for initial maintenance
  :type hostnames: list of strings
  :rtype: list of hostnames with the maintenance mode set
  """
  resp = self._client.start_maintenance(Hosts(set(hostnames)))
  check_and_log_response(resp)
  result = [host_status.host for host_status in resp.result.startMaintenanceResult.statuses]
  if len(result) != len(hostnames):
    log.warning('Skipping maintenance for unknown hosts: %s' % (set(hostnames) - set(result)))
  return result
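# Taken together, the maintenance helpers in this section form a two-phase
# workflow. A minimal sketch of how a caller might chain them; the maintainer
# instance and host list are hypothetical.
hosts = ['west-12.example.com', 'west-13.example.com']

# Phase 1: stop scheduling new work onto the hosts.
known = maintainer.start_maintenance(hosts)

# Phase 2: move running tasks off the hosts, polling until DRAINED.
maintainer._drain_hosts(Hosts(set(known)))

# Finally, return the hosts to normal scheduling.
maintainer._complete_maintenance(Hosts(set(known)))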
def really_create(job_spec, config_file, options):
  try:
    config = get_job_config(job_spec, config_file, options)
  except ValueError as v:
    print("Error: %s" % v)
    sys.exit(1)
  api = make_client(config.cluster())
  resp = api.create_job(config)
  check_and_log_response(resp)
  handle_open(api.scheduler_proxy.scheduler_client().url, config.role(), config.environment(),
              config.name())
  if options.wait_until == 'RUNNING':
    JobMonitor(api.scheduler_proxy, config.job_key()).wait_until(JobMonitor.running_or_finished)
  elif options.wait_until == 'FINISHED':
    JobMonitor(api.scheduler_proxy, config.job_key()).wait_until(JobMonitor.terminal)
def scheduler_print_recovery_tasks(cluster):
  """usage: scheduler_print_recovery_tasks cluster

  Prints all active tasks in a staged recovery.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).query_recovery(
      TaskQuery(statuses=ACTIVE_STATES))
  check_and_log_response(resp)
  log.info('Role\tJob\tShard\tStatus\tTask ID')
  for task in resp.result.queryRecoveryResult.tasks:
    assigned = task.assignedTask
    conf = assigned.task
    log.info('\t'.join((conf.owner.role,
                        conf.jobName,
                        str(assigned.instanceId),
                        ScheduleStatus._VALUES_TO_NAMES[task.status],
                        assigned.taskId)))