def list_jobs(cluster_and_role):
    """usage: list_jobs [--show_cron_schedule] cluster/role

    Prints one entry for every job configuration the scheduler returns for the
    given role on the given cluster.

    The argument must contain exactly one '/' separating the cluster from the
    role; anything else is rejected via die().

    Output format, selected from the parsed command-line options:
      * neither flag set: cluster/role/env/name
      * show_cron_schedule: the job path followed by the tab-separated cron
        schedule (single-quoted) and cron collision policy
      * show_cron_schedule and pretty: a multi-line, human-oriented listing
    """
    options = app.get_options()

    # Reject anything that is not exactly "cluster/role" before unpacking;
    # die() is expected not to return.
    if cluster_and_role.count('/') != 1:
        die('list_jobs parameter must be in cluster/role format')
    cluster, role = cluster_and_role.split('/')

    def show_job_simple(job):
        # One line per job; the cron columns are appended only on request.
        if options.show_cron_schedule:
            print(('{0}/{1.key.role}/{1.key.environment}/{1.key.name}' +
                   '\t\'{1.cronSchedule}\'\t{1.cronCollisionPolicy}').format(cluster, job))
        else:
            print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format(cluster, job))

    def show_job_pretty(job):
        print("Job %s/%s/%s/%s:" %
              (cluster, job.key.role, job.key.environment, job.key.name))
        print('\tcron schedule: %s' % job.cronSchedule)
        print('\tcron policy:   %s' % job.cronCollisionPolicy)

    # The pretty printer is only used when the cron schedule was also requested.
    if options.show_cron_schedule and options.pretty:
        print_fn = show_job_pretty
    else:
        print_fn = show_job_simple

    api = make_client(cluster)
    resp = api.get_jobs(role)
    check_and_log_response(resp)
    for job in resp.result.getJobsResult.configs:
        print_fn(job)
def do_open(args, _):
    """usage: open cluster[/role[/env/job]]

    Opens the scheduler page for a cluster, role or job in the default web browser.

    args: positional command-line arguments; only args[0] (the slash-separated
      path) is used.
    _: ignored.

    Calls die() when no path is given, when the cluster component is empty, or
    when an environment is given without a job.
    """
    # Without this guard an empty argument list raised IndexError on args[0]
    # instead of reporting the missing cluster.
    if not args:
        die('cluster is required')

    components = args[0].split("/")
    # str.split with an explicit separator always yields at least one element.
    cluster_name = components[0]
    role = components[1] if len(components) > 1 else None
    env = job = None
    if len(components) > 2:
        env = components[2]
        if len(components) > 3:
            job = components[3]
        else:
            # TODO(ksweeney): Remove this after MESOS-2945 is completed.
            die('env scheduler pages are not yet implemented, please specify job')

    if not cluster_name:
        die('cluster is required')

    api = make_client(cluster_name)

    import webbrowser
    webbrowser.open_new_tab(
        synthesize_url(api.scheduler.scheduler().url, role, env, job))
def diff(job_spec, config_file):
    """usage: diff cluster/role/env/job config

    Compares a job configuration against a running job.
    By default the diff will be displayed using 'diff', though you may choose an
    alternate diff program by specifying the DIFF_VIEWER environment variable.

    Returns 0 when the diff program exits with status 0 or 1, otherwise the
    diff program's own exit status.
    """
    options = app.get_options()
    config = get_job_config(job_spec, config_file, options)

    # With rename_from set, the running job is looked up under that key rather
    # than the key found in the config.
    if options.rename_from:
        cluster, role, env, name = options.rename_from
    else:
        cluster, role, env, name = (
            config.cluster(), config.role(), config.environment(), config.name())

    api = make_client(cluster)

    query_resp = api.query(api.build_query(role, name, statuses=ACTIVE_STATES, env=env))
    if query_resp.responseCode != ResponseCode.OK:
        die('Request failed, server responded with "%s"' % query_resp.message)
    remote_tasks = [
        scheduled.assignedTask.task
        for scheduled in query_resp.result.scheduleStatusResult.tasks
    ]

    populate_resp = api.populate_job_config(config)
    if populate_resp.responseCode != ResponseCode.OK:
        die('Request failed, server responded with "%s"' % populate_resp.message)
    local_tasks = populate_resp.result.populateJobResult.populated

    printer = pprint.PrettyPrinter(indent=2)

    def pretty_print_task(task):
        # The raw configuration is not interesting - we only care about what gets parsed.
        # Note: this rewrites the task object in place before formatting it.
        task.configuration = None
        parsed_data = json.loads(task.executorConfig.data)
        task.executorConfig = ExecutorConfig(name=AURORA_EXECUTOR_NAME, data=parsed_data)
        return printer.pformat(vars(task))

    def pretty_print_tasks(tasks):
        rendered = [pretty_print_task(task) for task in tasks]
        return ',\n'.join(rendered)

    def dump_tasks(tasks, out_file):
        # Flush so the external diff program sees the full contents on disk.
        out_file.write(pretty_print_tasks(tasks))
        out_file.write('\n')
        out_file.flush()

    diff_program = os.environ.get('DIFF_VIEWER', 'diff')
    with NamedTemporaryFile() as local:
        dump_tasks(local_tasks, local)
        with NamedTemporaryFile() as remote:
            dump_tasks(remote_tasks, remote)
            exit_status = subprocess.call([diff_program, remote.name, local.name])
            # Unlike most commands, diff doesn't return zero on success; it returns
            # 1 when a successful diff is non-empty.
            if exit_status in (0, 1):
                return 0
            return exit_status
def get_quota(role):
    """usage: get_quota --cluster=CLUSTER role

    Prints the production quota that has been allocated to a user.

    The cluster is read from the parsed command-line options. The CPU count is
    logged as returned; RAM and disk are converted from MB to GB. Calls die()
    if the scheduler does not answer with ResponseCode.OK.
    """
    options = app.get_options()
    resp = make_client(options.cluster).get_quota(role)

    # Check the response before dereferencing resp.result, using the same
    # check as the other commands in this file.
    if resp.responseCode != ResponseCode.OK:
        die('Request failed, server responded with "%s"' % resp.message)

    quota = resp.result.getQuotaResult.quota
    quota_fields = [
        ('CPU', quota.numCpus),
        ('RAM', '%f GB' % (float(quota.ramMb) / 1024)),
        ('Disk', '%f GB' % (float(quota.diskMb) / 1024)),
    ]
    log.info('Quota for %s:\n\t%s' %
             (role, '\n\t'.join(['%s\t%s' % (k, v) for (k, v) in quota_fields])))
def ssh(args, options):
    """usage: ssh cluster/role/env/job shard [args...]

    Initiate an SSH session on the machine that a shard is running on.

    args: positional arguments; the job path and shard are popped off the
      front (the list is modified in place) and whatever remains is joined
      into the remote command, defaulting to 'bash'.
    options: parsed options; executor_sandbox, tunnels and ssh_user are read.

    Returns the exit status of the spawned ssh process.
    """
    if not args:
        die('Job path is required')
    job_path = args.pop(0)
    try:
        cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
    except AuroraJobKey.Error as e:
        die('Invalid job path "%s": %s' % (job_path, e))

    if not args:
        die('Shard is required')
    try:
        shard = int(args.pop(0))
    except ValueError:
        die('Shard must be an integer')

    api = make_client(cluster_name)
    task_query = api.build_query(role, name, set([shard]), env=env)
    resp = api.query(task_query)
    check_and_log_response(resp)

    # Only the first task returned for the shard is used.
    first_task = resp.result.scheduleStatusResult.tasks[0]
    assigned = first_task.assignedTask

    remote_cmd = ' '.join(args) if args else 'bash'
    command = DistributedCommandRunner.substitute(
        remote_cmd, first_task, api.cluster, executor_sandbox=options.executor_sandbox)

    # The login user falls back to the task owner's role, not the role from the path.
    owner_role = assigned.task.owner.role
    slave_host = assigned.slaveHost

    ssh_command = ['ssh', '-t']
    for tunnel in options.tunnels:
        try:
            local_port, port_name = tunnel.split(':')
            local_port = int(local_port)
        except ValueError:
            die('Could not parse tunnel: %s. Must be of form PORT:NAME' % tunnel)
        if port_name not in assigned.assignedPorts:
            die('Task %s has no port named %s' % (assigned.taskId, port_name))
        forward_spec = '%d:%s:%d' % (
            local_port, slave_host, assigned.assignedPorts[port_name])
        ssh_command.extend(['-L', forward_spec])

    login_user = options.ssh_user or owner_role
    ssh_command.extend(['%s@%s' % (login_user, slave_host), command])
    return subprocess.call(ssh_command)
def update(job_spec, config_file):
    """usage: update cluster/role/env/job config

    Performs a rolling upgrade on a running job, using the update configuration
    within the config file as a control for update velocity and failure tolerance.

    Updates are fully controlled client-side, so aborting an update halts the
    update and leaves the job in a 'locked' state on the scheduler.
    Subsequent update attempts will fail until the update is 'unlocked' using the
    'cancel_update' command.

    The updater only takes action on shards in a job that have changed, meaning
    that changing a single shard will only induce a restart on the changed shard.

    You may want to consider using the 'diff' subcommand before updating,
    to preview what changes will take effect.

    Unless options.force is set, the number of tasks in the new configuration is
    compared with the number of active tasks first; a large difference prints a
    warning and pauses for 5 seconds so the user can abort.
    """
    def warn_if_dangerous_change(api, config):
        # Get the current job status, so that we can check if there's anything
        # dangerous about this update.
        resp = api.query(api.build_query(config.role(), config.name(),
                                         statuses=ACTIVE_STATES,
                                         env=config.environment()))
        if resp.responseCode != ResponseCode.OK:
            die('Could not get job status from server for comparison: %s' % resp.message)
        remote_task_count = len(resp.result.scheduleStatusResult.tasks)

        resp = api.populate_job_config(config)
        if resp.responseCode != ResponseCode.OK:
            die('Server could not populate job config for comparison: %s' % resp.message)
        local_task_count = len(resp.result.populateJobResult.populated)

        # A change is "large" when the task count grows or shrinks by a factor
        # of four or more, or when the new configuration has no tasks at all.
        # The previous check compared local against 4 * remote in both
        # directions, which is true for every pair of counts.
        is_large_change = (
            local_task_count == 0
            or local_task_count >= 4 * remote_task_count
            or 4 * local_task_count <= remote_task_count)
        if is_large_change:
            print('Warning: this update is a large change. '
                  'Press ^c within 5 seconds to abort')
            time.sleep(5)

    options = app.get_options()
    config = get_job_config(job_spec, config_file, options)
    api = make_client(config.cluster())
    if not options.force:
        warn_if_dangerous_change(api, config)
    resp = api.update_job(config, options.health_check_interval_seconds, options.shards)
    check_and_log_response(resp)
def create(job_spec, config_file):
    """usage: create cluster/role/env/job config

    Creates a job based on a configuration file.

    A ValueError raised while loading the configuration is printed and the
    process exits with status 1. After the job is created, the scheduler page
    is handed to handle_open(); if options.wait_until is 'RUNNING' or
    'FINISHED' the call then blocks on the job monitor accordingly.
    """
    options = app.get_options()
    try:
        config = get_job_config(job_spec, config_file, options)
    except ValueError as err:
        print("Error: %s" % err)
        sys.exit(1)

    api = make_client(config.cluster())
    # The monitor is built before the create request is sent.
    monitor = JobMonitor(api, config.role(), config.environment(), config.name())

    response = api.create_job(config)
    check_and_log_response(response)

    scheduler_url = api.scheduler.scheduler().url
    handle_open(scheduler_url, config.role(), config.environment(), config.name())

    # Any other wait_until value means: return immediately.
    if options.wait_until == 'RUNNING':
        monitor.wait_until(monitor.running_or_finished)
    elif options.wait_until == 'FINISHED':
        monitor.wait_until(monitor.terminal)
def get_api(self, cluster):
    """Build and return an API client for the named cluster.

    Thin wrapper that delegates to make_client(); no instance state is used.
    """
    client = make_client(cluster)
    return client