def from_assigned_task(self, assigned_task, _):
  """Build an AnnouncerChecker for an assigned Mesos task.

  Returns None when the task's config has no Announce stanza.  The third
  positional argument is unused.
  """
  mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
  if not mesos_task.has_announce():
    return None

  portmap = resolve_ports(mesos_task, assigned_task.assignedPorts)

  # assigned_task.slaveHost is the --hostname argument passed into the mesos slave.
  # Using this allows overriding the hostname published into ZK when announcing.
  # If no argument was passed to the mesos-slave, the slave falls back to gethostname().
  endpoint, additional = make_endpoints(
      assigned_task.slaveHost,
      portmap,
      mesos_task.announce().primary_port().get())

  client = self.make_zk_client()
  if mesos_task.announce().has_zk_path():
    if self.__allow_custom_serverset_path:
      path = mesos_task.announce().zk_path().get()
    else:
      # app.error terminates the process, so `path` is never read unbound below.
      app.error('Executor must be started with --announcer-allow-custom-serverset-path in order '
                'to use zk_path in the Announcer config')
  else:
    path = self.make_zk_path(assigned_task)

  # Give the task the full health-check warm-up window before the announcement
  # is required: initial interval plus the maximum tolerated failure window.
  initial_interval = mesos_task.health_check_config().initial_interval_secs().get()
  interval = mesos_task.health_check_config().interval_secs().get()
  consecutive_failures = mesos_task.health_check_config().max_consecutive_failures().get()
  timeout_secs = initial_interval + (consecutive_failures * interval)

  return AnnouncerChecker(
      client, path, timeout_secs, endpoint, additional=additional,
      shard=assigned_task.instanceId, name=self.name)
def delete(args, options):
  """Delete a MySQL cluster via the Mysos HTTP API and wait for it to terminate.

  Reads the cluster password from options.password_file, issues an HTTP DELETE
  to the cluster endpoint, and exits the app on HTTP or parse errors.
  """
  validate_common_options(options)

  with open(options.password_file, 'r') as f:
    password = f.read().strip()
    if not password:
      app.error("Empty password file")

  url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)

  values = dict(password=password)
  req = urllib2.Request(url, urllib.urlencode(values))
  # urllib2 only issues GET/POST natively; override the method to force DELETE.
  req.get_method = lambda: 'DELETE'

  try:
    response = urllib2.urlopen(req).read()
  except urllib2.HTTPError as e:
    log.error("DELETE request failed: %s, %s, %s" % (
        e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
    app.quit(1)

  try:
    result = json.loads(response)
    if not isinstance(result, dict):
      raise ValueError()
  except ValueError:
    log.error("Invalid response: %s" % response)
    app.quit(1)

  log.info("Cluster deletion result: %s" % result)

  log.info("Waiting for the cluster to terminate...")
  wait_for_termination(result['cluster_url'])

  log.info("Cluster terminated/deleted")
def really_run(task, root, sandbox, task_id=None, user=None, prebound_ports=None, chroot=None,
               daemon=False):
  """Run a thermos task to completion under a TaskRunner.

  Aborts via app.error if the task declares ports not present in
  prebound_ports.  When daemon is True, detaches from the terminal before
  running.  A KeyboardInterrupt kills the task and closes its checkpoint.
  """
  prebound_ports = prebound_ports or {}
  missing_ports = set(task.ports()) - set(prebound_ports.keys())
  if missing_ports:
    app.error('ERROR! Unbound ports: %s' % ' '.join(port for port in missing_ports))

  task_runner = TaskRunner(task.task, root, sandbox, task_id=task_id, user=user,
                           portmap=prebound_ports, chroot=chroot)

  if daemon:
    print('Daemonizing and starting runner.')
    try:
      # Stop stderr logging before the fork so the daemon does not hold the tty.
      log.teardown_stderr_logging()
      daemonize()
    except Exception as e:
      print("Failed to daemonize: %s" % e)
      sys.exit(1)

  try:
    task_runner.run()
  except KeyboardInterrupt:
    print('Got keyboard interrupt, killing job!')
    task_runner.close_ckpt()
    task_runner.kill()
def tail(args, options):
  """Tail the logs of a task process.

  Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error("Expected a task to tail, got nothing!")
  if len(args) not in (1, 2):
    app.error("Expected at most two arguments (task and optional process), got %d" % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    # Restrict to the explicitly named process.
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print("ERROR: No processes found.", file=sys.stderr)
    sys.exit(1)
  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print("ERROR: More than one process matches query.", file=sys.stderr)
    sys.exit(1)
  process = processes.pop()
  # Tail only the most recent run of the process.
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
                    run=run, log_dir=log_dir).getpath(
      "process_logdir"
  )
  logfile = os.path.join(logdir, "stderr" if options.use_stderr else "stdout")

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    # True while the (process, run) pair being tailed is still running.
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print("Tail of terminal log %s" % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print("Tail of active log %s" % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      # Re-check liveness every ~5s so tailing stops once the process exits.
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
def main(args, options):
  """Construct the Aurora executor with optional announcer support and run it."""
  runner_provider = DefaultThermosTaskRunnerProvider(
      dump_runner_pex(),
      artifact_dir=os.path.realpath('.'),
  )

  # Health checking is always enabled; announcing is opt-in.
  providers = [HealthCheckerProvider()]
  if options.announcer_enable:
    if options.announcer_ensemble is None:
      app.error('Must specify --announcer-ensemble if the announcer is enabled.')
    providers.append(
        DefaultAnnouncerCheckerProvider(
            options.announcer_ensemble, options.announcer_serverset_path))

  executor = AuroraExecutor(
      runner_provider=runner_provider,
      status_providers=providers,
  )

  driver = MesosExecutorDriver(executor)

  # Ephemeral executor: self-terminate if no task arrives within the timeout.
  ExecutorTimeout(executor.launched, driver).start()

  driver.run()
  log.info('MesosExecutorDriver.run() has finished.')
def get_task_from_options(args, opts, **kw):
  """Load a thermos config from the single path in `args` and pick one task.

  Selects by opts.task when given, otherwise requires the config to contain
  exactly one task.  With kw['strict'], validates the task before returning.
  """
  if len(args) != 1:
    app.error('Should specify precisely one config, instead got: %s' % args)

  loader = ThermosConfigLoader.load_json if opts.json else ThermosConfigLoader.load
  task_list = list(loader(args[0], bindings=opts.bindings, **kw).tasks())

  if not task_list:
    app.error("No tasks specified!")
  if opts.task is None and len(task_list) > 1:
    app.error("Multiple tasks in config but no task name specified!")

  if opts.task is None:
    task = task_list[0]
  else:
    task = None
    for candidate in task_list:
      if candidate.task().name().get() == opts.task:
        task = candidate
        break
    if task is None:
      app.error("Could not find task %s!" % opts.task)

  if kw.get('strict', False):
    check = task.task.check()
    if not check.ok():
      app.error(check.message())

  return task
def create(args, options):
  """Create a MySQL cluster via the Mysos HTTP API and wait until it is usable.

  Posts the cluster spec, saves the returned cluster password to
  options.password_file, waits for a master to be elected, then verifies the
  master answers a trivial query.  Exits the app on any error.
  """
  validate_common_options(options)

  if not options.num_nodes:
    app.error("--num_nodes is required")
  if not options.cluster_user:
    app.error("--cluster_user is required")

  url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
  values = dict(
      num_nodes=int(options.num_nodes),
      cluster_user=options.cluster_user,
      # Optional fields are sent as empty strings rather than omitted.
      size=options.size if options.size else '',
      backup_id=options.backup_id if options.backup_id else '')

  req = urllib2.Request(url, urllib.urlencode(values))
  try:
    response = urllib2.urlopen(req).read()
  except urllib2.HTTPError as e:
    log.error("POST request failed: %s, %s, %s" % (
        e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
    app.quit(1)

  try:
    result = json.loads(response)
    if not isinstance(result, dict):
      raise ValueError()
  except ValueError:
    log.error("Invalid response: %s" % response)
    app.quit(1)

  log.info("Cluster created. Cluster info: %s" % str(result))
  with open(options.password_file, 'w') as f:
    f.write(result["cluster_password"])

  log.info("Waiting for the master for this cluster to be elected...")
  master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

  connection_str = "mysql://%s:%s@%s:%d/" % (
      options.cluster_user,
      result["cluster_password"],
      master_endpoint.host,
      master_endpoint.port)
  log.info("Connecting to the MySQL cluster master: %s" % connection_str)
  engine = create_engine(connection_str)

  for i in range(5):  # Loop for 5 times/seconds to wait for the master to be promoted.
    try:
      # TODO(jyx): Test writing to the master and reading from the slave.
      result = engine.execute("SELECT 1;").scalar()
      assert 1 == int(result), "Expecting result to be 1 but got %s" % result
      break
    except OperationalError:
      # Re-raise only after the final attempt.
      if i == 4:
        raise
      log.debug("MySQL master not ready yet. Sleep for 1 second...")
      time.sleep(1)

  log.info("Cluster successfully started")
def add(file_or_dir):
  """Dispatch a path to add_file or add_dir; error out on anything else."""
  if os.path.isdir(file_or_dir):
    add_dir(file_or_dir)
  elif os.path.isfile(file_or_dir):
    add_file(file_or_dir)
  else:
    app.error("Unknown or non-existent file: %s" % file_or_dir)
def generate_token_interactive(): password = getpass('Enter your Subsonic password: '******'Enter a salt (an integer of at least six digits): ') if len(salt) < 6 or not salt.isdigit(): app.error('Salt value is not an integer of at least six digits.') token = md5(password + salt).hexdigest() print 'Your API token is: {}'.format(token) print 'This must be used with the same salt value entered during this session.'
def tail(args, options):
  """Tail the logs of a task process.

  Usage: thermos tail task_name [process_name]
  """
  if len(args) == 0:
    app.error('Expected a task to tail, got nothing!')
  if len(args) not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % len(args))

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir
  process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)]
  if len(args) == 2:
    # Restrict to the explicitly named process.
    process_runs = [(process, run) for (process, run) in process_runs if process == args[1]]

  if len(process_runs) == 0:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)
  processes = set([process for process, _ in process_runs])
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)
  process = processes.pop()
  # Tail only the most recent run of the process.
  run = max([run for _, run in process_runs])

  logdir = TaskPath(root=options.root, task_id=args[0], process=process,
                    run=run, log_dir=log_dir).getpath('process_logdir')
  logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    # True while the (process, run) pair being tailed is still running.
    active_processes = monitor.get_active_processes()
    for process_status, process_run in active_processes:
      if process_status.process == process and process_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  now = time.time()
  next_check = now + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      # Re-check liveness every ~5s so tailing stops once the process exits.
      if not log_is_active():
        break
      else:
        next_check = time.time() + 5.0
def initialize(options):
  """Build the AuroraExecutor from command-line options.

  Chooses the user-override runner when --execute-as-user or --nosetuid is
  given, the default runner otherwise, and wires up health-check,
  resource-manager, and optional announcer status providers.
  """
  cwd_path = os.path.abspath(CWD)
  checkpoint_root = os.path.join(cwd_path, MesosPathDetector.DEFAULT_SANDBOX_PATH)

  # status providers:
  status_providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=checkpoint_root)
  ]

  if options.announcer_enable:
    if options.announcer_ensemble is None:
      app.error('Must specify --announcer-ensemble if the announcer is enabled.')
    status_providers.append(DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble,
        options.announcer_serverset_path,
        options.announcer_allow_custom_serverset_path
    ))

  # Create executor stub
  if options.execute_as_user or options.nosetuid:
    # If nosetuid is set, execute_as_user is also None
    thermos_runner_provider = UserOverrideThermosTaskRunnerProvider(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env
    )
    thermos_runner_provider.set_role(None)

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers,
        sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user)
    )
  else:
    thermos_runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env
    )

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers
    )

  return thermos_executor
def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename

  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  with open(args[0], 'r') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if not options.simple:
          # Fold each checkpoint record into the reconstructed runner state.
          dispatcher.dispatch(state, record)
        else:
          print('CKPT: %s' % record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  if not options.simple:
    if state is None or state.header is None:
      print('Checkpoint stream CORRUPT or outdated format')
      return
    print('Recovered Task Header:')
    print(' id: %s' % state.header.task_id)
    print(' user: %s' % state.header.user)
    print(' host: %s' % state.header.hostname)
    print(' sandbox: %s' % state.header.sandbox)
    if state.header.ports:
      print(' ports: %s' % ' '.join(
          '%s->%s' % (name, port) for (name, port) in state.header.ports.items()))
    print('Recovered Task States:')
    for task_status in state.statuses:
      print(' %s [pid: %d] => %s' % (
          time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
          task_status.runner_pid,
          TaskState._VALUES_TO_NAMES[task_status.state]))
    print('Recovered Processes:')
    for process, process_history in state.processes.items():
      print(' %s runs: %s' % (process, len(process_history)))
      # Display runs newest-first.
      for k in reversed(range(len(process_history))):
        run = process_history[k]
        print(' %2d: pid=%d, rc=%s, finish:%s, state:%s' % (
            k,
            run.pid,
            run.return_code if run.return_code is not None else '',
            time.asctime(time.localtime(run.stop_time)) if run.stop_time else 'None',
            ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
def initialize(options):
  """Assemble the AuroraExecutor according to the parsed command-line options."""
  work_dir = os.path.abspath(CWD)
  checkpoint_root = os.path.join(work_dir, MesosPathDetector.DEFAULT_SANDBOX_PATH)

  # Always provide health checking and resource management; announcing is opt-in.
  providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=checkpoint_root),
  ]
  if options.announcer_enable:
    if options.announcer_ensemble is None:
      app.error('Must specify --announcer-ensemble if the announcer is enabled.')
    providers.append(DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble,
        options.announcer_serverset_path,
        options.announcer_allow_custom_serverset_path))

  # Both runner providers take the same logging/rotation configuration.
  runner_kwargs = dict(
      artifact_dir=work_dir,
      process_logger_destination=options.runner_logger_destination,
      process_logger_mode=options.runner_logger_mode,
      rotate_log_size_mb=options.runner_rotate_log_size_mb,
      rotate_log_backups=options.runner_rotate_log_backups,
      preserve_env=options.preserve_env)

  if options.execute_as_user or options.nosetuid:
    # When nosetuid is set, execute_as_user is also None.
    runner_provider = UserOverrideThermosTaskRunnerProvider(
        dump_runner_pex(), checkpoint_root, **runner_kwargs)
    runner_provider.set_role(None)
    executor = AuroraExecutor(
        runner_provider=runner_provider,
        status_providers=providers,
        sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user))
  else:
    runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(), checkpoint_root, **runner_kwargs)
    executor = AuroraExecutor(
        runner_provider=runner_provider,
        status_providers=providers)

  return executor
def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename

  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  with open(args[0], 'r') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if not options.simple:
          # Rebuild the runner state by applying each record in order.
          dispatcher.dispatch(state, record)
        else:
          print('CKPT: %s' % record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  if not options.simple:
    if state is None or state.header is None:
      print('Checkpoint stream CORRUPT or outdated format')
      return
    print('Recovered Task Header:')
    print(' id: %s' % state.header.task_id)
    print(' user: %s' % state.header.user)
    print(' host: %s' % state.header.hostname)
    print(' sandbox: %s' % state.header.sandbox)
    if state.header.ports:
      print(' ports: %s' % ' '.join(
          '%s->%s' % (name, port) for (name, port) in state.header.ports.items()))
    print('Recovered Task States:')
    for task_status in state.statuses:
      print(' %s [pid: %d] => %s' % (
          time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
          task_status.runner_pid,
          TaskState._VALUES_TO_NAMES[task_status.state]))
    print('Recovered Processes:')
    for process, process_history in state.processes.items():
      print(' %s runs: %s' % (process, len(process_history)))
      # Display runs newest-first.
      for k in reversed(range(len(process_history))):
        run = process_history[k]
        print(' %2d: pid=%d, rc=%s, finish:%s, state:%s' % (
            k,
            run.pid,
            run.return_code if run.return_code is not None else '',
            time.asctime(time.localtime(run.stop_time)) if run.stop_time else 'None',
            ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
def main():
  """Run the Thermos GC executor until its Mesos driver terminates."""
  if MesosExecutorDriver is None:
    app.error('Could not load MesosExecutorDriver!')

  gc_executor, writer, driver = initialize()

  gc_executor.start()
  writer.start()

  driver.run()

  log.info('MesosExecutorDriver.run() has finished.')
def pid_provider():
  """Return a zero-argument loader that reads the hsperfdata file for --pid."""
  options = app.get_options()

  # Locate the hsperfdata path belonging to the requested pid.
  matched_path = None
  for path, _, pid in list_pids():
    if pid == options.pid:
      matched_path = path
      break
  if matched_path is None:
    app.error('Could not find pid %s' % options.pid)

  def loader():
    with open(matched_path, 'rb') as fp:
      return fp.read()

  return loader
def main(args, options):
  """Build and run the Aurora executor, honoring the user-override options.

  Exits immediately if the Mesos bindings failed to import.
  """
  if MesosExecutorDriver is None:
    app.error('Could not load MesosExecutorDriver!')

  # status providers:
  status_providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=options.checkpoint_root)
  ]

  if options.announcer_enable:
    if options.announcer_ensemble is None:
      app.error('Must specify --announcer-ensemble if the announcer is enabled.')
    status_providers.append(DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble, options.announcer_serverset_path))

  # Create executor stub
  if options.execute_as_user or options.nosetuid:
    # If nosetuid is set, execute_as_user is also None
    thermos_runner_provider = UserOverrideThermosTaskRunnerProvider(
        dump_runner_pex(),
        artifact_dir=os.path.abspath(CWD)
    )
    thermos_runner_provider.set_role(None)

    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers,
        sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user)
    )
  else:
    thermos_runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(),
        artifact_dir=os.path.abspath(CWD)
    )
    thermos_executor = AuroraExecutor(
        runner_provider=thermos_runner_provider,
        status_providers=status_providers
    )

  # Create driver stub
  driver = MesosExecutorDriver(thermos_executor)

  # This is an ephemeral executor -- shutdown if we receive no tasks within a certain
  # time period
  ExecutorTimeout(thermos_executor.launched, driver).start()

  # Start executor
  driver.run()

  log.info('MesosExecutorDriver.run() has finished.')
def main(args, options):
  """Entry point: construct the Aurora executor and drive it to completion."""
  if MesosExecutorDriver is None:
    app.error('Could not load MesosExecutorDriver!')

  providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=options.checkpoint_root),
  ]
  if options.announcer_enable:
    if options.announcer_ensemble is None:
      app.error('Must specify --announcer-ensemble if the announcer is enabled.')
    providers.append(DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble, options.announcer_serverset_path))

  artifact_dir = os.path.abspath(CWD)
  if options.execute_as_user or options.nosetuid:
    # execute_as_user is None whenever nosetuid is set.
    runner_provider = UserOverrideThermosTaskRunnerProvider(
        dump_runner_pex(), artifact_dir=artifact_dir)
    runner_provider.set_role(None)
    executor = AuroraExecutor(
        runner_provider=runner_provider,
        status_providers=providers,
        sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user))
  else:
    runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(), artifact_dir=artifact_dir)
    executor = AuroraExecutor(
        runner_provider=runner_provider,
        status_providers=providers)

  driver = MesosExecutorDriver(executor)

  # Ephemeral executor: shut down when no task arrives within the timeout window.
  ExecutorTimeout(executor.launched, driver).start()

  driver.run()
  log.info('MesosExecutorDriver.run() has finished.')
def main(args):
  """Monitor a ZooKeeper serverset path, printing join/leave events forever."""
  if len(args) != 1:
    app.error('Must supply a serverset path to monitor.')

  def on_join(endpoint):
    print('@ %s += %s' % (datetime.now(), endpoint))

  def on_leave(endpoint):
    print('@ %s -= %s' % (datetime.now(), endpoint))

  # Keep a reference so the ServerSet (and its watches) stays alive.
  ss = ServerSet(ZooKeeper(), args[0], on_join=on_join, on_leave=on_leave)

  # Block indefinitely; the callbacks do the reporting.
  while True:
    time.sleep(100)
def to_acl(access):
  """Convert an Access configuration object into a ZooKeeper ACL."""
  scheme = access.scheme().get()
  credential = access.credential().get()

  if scheme == 'digest':
    # Digest credentials must look like user:password and are hashed for ZK.
    parts = credential.split(':')
    if len(parts) != 2:
      app.error('Digest credential should be of the form <user>:<password>')
    credential = make_digest_acl_credential(parts[0], parts[1])

  permissions = access.permissions()
  return make_acl(scheme,
                  credential,
                  read=permissions.read().get(),
                  write=permissions.write().get(),
                  create=permissions.create().get(),
                  delete=permissions.delete().get(),
                  admin=permissions.admin().get())
def proxy_main(args, opts):
  """Run a thermos task from a JSON config under the thermos TaskRunner.

  Requires opts.thermos_json, opts.sandbox, and opts.checkpoint_root.
  SIGUSR1/SIGUSR2 tear the runner down; runner failures exit via app.error.
  """
  assert opts.thermos_json and os.path.exists(opts.thermos_json)
  assert opts.sandbox
  assert opts.checkpoint_root

  thermos_task = get_task_from_options(opts)
  prebound_ports = opts.prebound_ports
  missing_ports = set(thermos_task.ports()) - set(prebound_ports)
  if missing_ports:
    app.error('ERROR! Unbound ports: %s' % ' '.join(port for port in missing_ports))

  task_runner = TaskRunner(thermos_task.task, opts.checkpoint_root, opts.sandbox,
                           task_id=opts.task_id, user=opts.setuid, portmap=prebound_ports,
                           chroot=opts.chroot, planner_class=CappedTaskPlanner)

  # Either signal invokes runner_teardown on the live runner.
  for sig in (signal.SIGUSR1, signal.SIGUSR2):
    signal.signal(sig, functools.partial(runner_teardown, task_runner))

  try:
    task_runner.run()
  except TaskRunner.InternalError as err:
    app.error('Internal error: %s' % err)
  except TaskRunner.InvalidTask as err:
    app.error(str(err))
  except TaskRunner.StateError:
    app.error('Task appears to already be in a terminal state.')
  except KeyboardInterrupt:
    runner_teardown(task_runner)
def main(args):
  """Watch a ZooKeeper serverset and report endpoint joins and departures."""
  if len(args) != 1:
    app.error("Must supply a serverset path to monitor.")

  def announce_join(endpoint):
    print("@ %s += %s" % (datetime.now(), endpoint))

  def announce_leave(endpoint):
    print("@ %s -= %s" % (datetime.now(), endpoint))

  # Hold a reference so the watcher is not garbage collected.
  watcher = ServerSet(ZooKeeper(), args[0], on_join=announce_join, on_leave=announce_leave)

  # Keep the process alive; the callbacks do the reporting.
  while True:
    time.sleep(100)
def to_acl(access):
  """Translate an Access configuration struct into a ZooKeeper ACL."""
  cred = access.credential().get()

  if access.scheme().get() == 'digest':
    # Digest scheme requires a <user>:<password> credential, hashed for ZK.
    cred_parts = access.credential().get().split(':')
    if len(cred_parts) != 2:
      app.error(
          'Digest credential should be of the form <user>:<password>')
    cred = make_digest_acl_credential(cred_parts[0], cred_parts[1])

  return make_acl(access.scheme().get(),
                  cred,
                  read=access.permissions().read().get(),
                  write=access.permissions().write().get(),
                  create=access.permissions().create().get(),
                  delete=access.permissions().delete().get(),
                  admin=access.permissions().admin().get())
def main(args, options):
  """Entry point: build the executor and run the Mesos driver to completion."""
  if MesosExecutorDriver is None:
    app.error('Could not load MesosExecutorDriver!')

  executor = initialize(options)

  driver = MesosExecutorDriver(executor)

  # Ephemeral executor -- shut down if no task is received within the timeout.
  ExecutorTimeout(executor.launched, driver).start()

  driver.run()

  log.info('MesosExecutorDriver.run() has finished.')
def main(args, options):
  """Watch one or more ServerSet endpoints and print membership diffs.

  Blocks forever reading stdin (raw_input); change notifications arrive via
  the `changed` watcher callback on each ServerSetClient.
  """
  if not args:
    app.error('expected at least one ServerSet endpoint')

  def changed(endpoint, old, new):
    # Called by ServerSetClient whenever the endpoint's membership changes.
    print '%s changed:' % endpoint
    print ' old:', _format_instances(old)
    print ' new:', _format_instances(new)
    print

  print 'Watching ServerSet endpoints. Hit ^C to exit.'
  print

  # Keep references so the clients (and their watches) stay alive.
  endpoints = []
  for arg in args:
    endpoints.append(ServerSetClient(arg, watcher=changed))

  while True:
    raw_input()
def main(args, options):
  """Distill requirements from an existing site directory into zip artifacts.

  For each argument, resolve it as a requirement against the distributions
  found in --site and distill the matching distribution.  Unresolvable
  requirements are reported and skipped.
  """
  from pkg_resources import WorkingSet, Requirement, find_distributions

  if not options.site_dir:
    app.error('Must supply --site')

  distributions = list(find_distributions(options.site_dir))
  working_set = WorkingSet()
  for dist in distributions:
    working_set.add(dist)

  for arg in args:
    arg_req = Requirement.parse(arg)
    found_dist = working_set.find(arg_req)
    if not found_dist:
      # Bug fix: previously fell through and called Distiller(None).distill(),
      # crashing on an unresolvable requirement; now report and skip it.
      print('Could not find %s!' % arg_req)
      continue
    out_zip = Distiller(found_dist).distill()
    print('Dumped %s => %s' % (arg_req, out_zip))
def main():
  """Construct and run the Thermos GC executor with disk metrics enabled."""
  if MesosExecutorDriver is None:
    app.error('Could not load MesosExecutorDriver!')

  # Create executor stub and begin garbage collection.
  gc_executor = ThermosGCExecutor(FixedPathDetector(DEFAULT_CHECKPOINT_ROOT))
  gc_executor.start()

  # Periodically flush executor metrics to disk.
  writer = DiskMetricWriter(gc_executor.metrics, ExecutorDetector.VARS_PATH)
  writer.start()

  driver = MesosExecutorDriver(gc_executor)

  driver.run()

  log.info('MesosExecutorDriver.run() has finished.')
def main(args, options):
  """Sample hsperfdata from a file or live pid and print sorted key/value pairs."""
  if args:
    app.error('Must provide hsperfdata via -f/-p')

  if options.list:
    print_pids()
    return

  # Pick the data source; app.error terminates when neither flag is given.
  if options.filename:
    provider = file_provider()
  elif options.pid:
    provider = pid_provider()
  else:
    app.error('No hsperfdata provider specified!')

  perfdata = PerfData.get(provider)
  perfdata.sample()
  for key in sorted(perfdata):
    print('%s: %s' % (key, perfdata[key]))
def validate_common_options(options):
  """Check required API/cluster flags and prepare the password file directory."""
  required = (
      (options.api_host, "--api_host is required"),
      (options.api_port, "--api_port is required"),
      (options.cluster_name, "--cluster is required"),
      (options.password_file, "--password_file is required"),
  )
  # app.error exits on the first missing flag, matching fail-fast behavior.
  for value, message in required:
    if not value:
      app.error(message)

  log.info("Using --password_file=%s" % options.password_file)
  safe_mkdir(os.path.dirname(options.password_file))
def _really_run(task, root, sandbox, task_id=None, user=None, prebound_ports=None, chroot=None,
                daemon=False):
  """Execute a thermos task via TaskRunner, optionally daemonizing first."""
  portmap = prebound_ports or {}
  unbound = set(task.ports()) - set(portmap.keys())
  if unbound:
    app.error('ERROR! Unbound ports: %s' % ' '.join(port for port in unbound))

  runner = TaskRunner(task.task, root, sandbox, task_id=task_id, user=user,
                      portmap=portmap, chroot=chroot)

  if daemon:
    print('Daemonizing and starting runner.')
    try:
      # Detach from the terminal; stderr logging must stop before the fork.
      log.teardown_stderr_logging()
      daemonize()
    except Exception as e:
      print("Failed to daemonize: %s" % e)
      sys.exit(1)

  try:
    runner.run()
  except KeyboardInterrupt:
    print('Got keyboard interrupt, killing job!')
    runner.close_ckpt()
    runner.kill()
def delete(args, options):
  """Issue an HTTP DELETE for the named cluster and block until it terminates."""
  validate_common_options(options)

  with open(options.password_file, 'r') as f:
    password = f.read().strip()
    if not password:
      app.error("Empty password file")

  url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)

  # urllib2 has no native DELETE; override get_method on a POST-style request.
  request = urllib2.Request(url, urllib.urlencode(dict(password=password)))
  request.get_method = lambda: 'DELETE'

  try:
    response = urllib2.urlopen(request).read()
  except urllib2.HTTPError as e:
    log.error("DELETE request failed: %s, %s, %s" % (
        e.code,
        BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code],
        e.read()))
    app.quit(1)

  try:
    result = json.loads(response)
    if not isinstance(result, dict):
      raise ValueError()
  except ValueError:
    log.error("Invalid response: %s" % response)
    app.quit(1)

  log.info("Cluster deletion result: %s" % result)

  log.info("Waiting for the cluster to terminate...")
  wait_for_termination(result['cluster_url'])

  log.info("Cluster terminated/deleted")
def main(args, options): thermos_runner_provider = DefaultThermosTaskRunnerProvider( dump_runner_pex(), artifact_dir=os.path.realpath('.'), ) # status providers: status_providers = [HealthCheckerProvider()] if options.announcer_enable: if options.announcer_ensemble is None: app.error( 'Must specify --announcer-ensemble if the announcer is enabled.' ) status_providers.append( DefaultAnnouncerCheckerProvider( options.announcer_ensemble, options.announcer_serverset_path)) # Create executor stub thermos_executor = AuroraExecutor( runner_provider=thermos_runner_provider, status_providers=status_providers, ) # Create driver stub driver = MesosExecutorDriver(thermos_executor) # This is an ephemeral executor -- shutdown if we receive no tasks within a certain # time period ExecutorTimeout(thermos_executor.launched, driver).start() # Start executor driver.run() log.info('MesosExecutorDriver.run() has finished.')
def main(args, options):
  """Run the executor driver on a helper thread so the main thread stays
  responsive to SIGINT/SystemExit and can stop the driver cleanly.
  """
  if MesosExecutorDriver is None:
    app.error('Could not load MesosExecutorDriver!')

  thermos_executor = initialize(options)

  # Create driver stub
  driver = MesosExecutorDriver(thermos_executor)

  # This is an ephemeral executor -- shutdown if we receive no tasks within a certain
  # time period
  ExecutorTimeout(thermos_executor.launched, driver).start()

  # Start executor and wait until it is stopped.
  driver_thread = ExecutorDriverThread(driver)
  driver_thread.start()
  try:
    # Join with a timeout in a loop so the main thread remains interruptible.
    # Bug fix: is_alive() replaces the camelCase isAlive() alias, which was
    # deprecated and removed in Python 3.9.
    while driver_thread.is_alive():
      driver_thread.join(5)
  except (KeyboardInterrupt, SystemExit):
    driver.stop()
    raise

  log.info('MesosExecutorDriver.run() has finished.')
def get_task_from_options(opts):
  """Load the JSON thermos config in opts.thermos_json and return its single task."""
  config = ThermosConfigLoader.load_json(opts.thermos_json)
  task_list = config.tasks()

  if len(task_list) == 0:
    app.error("No tasks specified!")
  if len(task_list) > 1:
    app.error("Multiple tasks in config but no task name specified!")

  task = task_list[0]
  # Validate before handing the task to the runner.
  check = task.task.check()
  if not check.ok():
    app.error(check.message())

  return task
def make_zk_auth(zk_auth_config):
  """Load and validate a ZkAuth config file; a None path means no auth."""
  if zk_auth_config is None:
    return None

  try:
    with open(zk_auth_config) as config_file:
      try:
        auth = ZkAuth.json_load(config_file, strict=True)
        check = auth.check()
        if not check.ok():
          app.error('ZK authentication config is invalid %s' % check.message())
        return auth
      except (TypeError, ValueError, AttributeError) as parse_error:
        # Malformed JSON or schema mismatch.
        app.error('Problem parsing ZK authentication config %s' % parse_error)
  except IOError as io_error:
    # Missing or unreadable config file.
    app.error('Failed to open config file %s' % io_error)
def proxy_main(args, opts):
  """Run the thermos task described by opts.thermos_json inside opts.sandbox."""
  assert opts.thermos_json and os.path.exists(opts.thermos_json)
  assert opts.sandbox
  assert opts.checkpoint_root

  thermos_task = get_task_from_options(opts)
  portmap = opts.prebound_ports
  unbound = set(thermos_task.ports()) - set(portmap)
  if unbound:
    app.error('ERROR! Unbound ports: %s' % ' '.join(port for port in unbound))

  runner = TaskRunner(
      thermos_task.task,
      opts.checkpoint_root,
      opts.sandbox,
      task_id=opts.task_id,
      user=opts.setuid,
      portmap=portmap,
      chroot=opts.chroot,
      planner_class=CappedTaskPlanner
  )

  # USR1/USR2 both invoke runner_teardown on the live runner.
  for sig in (signal.SIGUSR1, signal.SIGUSR2):
    signal.signal(sig, functools.partial(runner_teardown, runner))

  try:
    runner.run()
  except TaskRunner.InternalError as err:
    app.error('Internal error: %s' % err)
  except TaskRunner.InvalidTask as err:
    app.error(str(err))
  except TaskRunner.StateError:
    app.error('Task appears to already be in a terminal state.')
  except KeyboardInterrupt:
    runner_teardown(runner)
def main(args, options):
  """Mysos scheduler entry point.

  Validates required options, restores (or bootstraps) persisted scheduler
  state, registers the framework with the Mesos master, and serves the admin
  HTTP API until the scheduler stops.

  NOTE(review): a near-identical definition of main() appears later in this
  file; if both live in the same module the later one shadows this one --
  confirm these chunks belong to separate modules.
  """
  log.info("Options in use: %s", options)

  # Fail fast on any missing required option.
  # NOTE(review): the option is api_port but the message says --port --
  # confirm the user-facing flag name.
  if not options.api_port:
    app.error('Must specify --port')

  if not options.mesos_master:
    app.error('Must specify --mesos_master')

  if not options.framework_user:
    app.error('Must specify --framework_user')

  if not options.executor_uri:
    app.error('Must specify --executor_uri')

  if not options.executor_cmd:
    app.error('Must specify --executor_cmd')

  if not options.zk_url:
    app.error('Must specify --zk_url')

  if not options.admin_keypath:
    app.error('Must specify --admin_keypath')

  try:
    election_timeout = parse_time(options.election_timeout)
    framework_failover_timeout = parse_time(
        options.framework_failover_timeout)
  except InvalidTime as e:
    # NOTE(review): e.message is Python-2-only; str(e) would be portable.
    app.error(e.message)

  try:
    _, zk_servers, zk_root = zookeeper.parse(options.zk_url)
  except Exception as e:
    app.error("Invalid --zk_url: %s" % e.message)

  # Unpack the bundled web assets so the HTTP server can serve them.
  web_assets_dir = os.path.join(options.work_dir, "web")
  pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH)
  log.info("Extracted web assets into %s" % options.work_dir)

  # Optional framework authentication credential, loaded from a YAML file.
  fw_principal = None
  fw_secret = None
  if options.framework_authentication_file:
    try:
      with open(options.framework_authentication_file, "r") as f:
        # NOTE(review): yaml.load without an explicit Loader can execute
        # arbitrary constructors; confirm this file is operator-controlled
        # (yaml.safe_load would be the safer default).
        cred = yaml.load(f)
      fw_principal = cred["principal"]
      fw_secret = cred["secret"]
      log.info(
          "Loaded credential (principal=%s) for framework authentication" % fw_principal)
    except IOError as e:
      app.error(
          "Unable to read the framework authentication key file: %s" % e)
    except (KeyError, yaml.YAMLError) as e:
      app.error(
          "Invalid framework authentication key file format %s" % e)

  log.info("Starting Mysos scheduler")

  kazoo = KazooClient(zk_servers)
  kazoo.start()

  # Choose the state-storage backend: ZooKeeper or local disk.
  if options.state_storage == 'zk':
    log.info("Using ZooKeeper (path: %s) for state storage" % zk_root)
    state_provider = ZooKeeperStateProvider(kazoo, zk_root)
  else:
    log.info("Using local disk for state storage")
    state_provider = LocalStateProvider(options.work_dir)

  try:
    state = state_provider.load_scheduler_state()
  except StateProvider.Error as e:
    app.error(e.message)

  if state:
    # Failover path: reuse the previously registered FrameworkInfo.
    log.info("Successfully restored scheduler state")
    framework_info = state.framework_info
    if framework_info.HasField('id'):
      log.info("Recovered scheduler's FrameworkID is %s" % framework_info.id.value)
  else:
    # First launch: build fresh FrameworkInfo and persist initial state.
    log.info("No scheduler state to restore")
    framework_info = FrameworkInfo(
        user=options.framework_user,
        name=FRAMEWORK_NAME,
        checkpoint=True,
        failover_timeout=framework_failover_timeout.as_(Time.SECONDS),
        role=options.framework_role)
    if fw_principal:
      framework_info.principal = fw_principal
    state = Scheduler(framework_info)
    state_provider.dump_scheduler_state(state)

  scheduler = MysosScheduler(
      state,
      state_provider,
      options.framework_user,
      options.executor_uri,
      options.executor_cmd,
      kazoo,
      options.zk_url,
      election_timeout,
      options.admin_keypath,
      installer_args=options.installer_args,
      backup_store_args=options.backup_store_args,
      executor_environ=options.executor_environ,
      framework_role=options.framework_role)

  # Register with the Mesos master, authenticated if a credential was loaded.
  if fw_principal and fw_secret:
    cred = Credential(principal=fw_principal, secret=fw_secret)
    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, options.mesos_master, cred)
  else:
    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, options.mesos_master)

  scheduler_driver.start()

  # Serve the admin API from a daemon thread so it dies with the process.
  server = HttpServer()
  server.mount_routes(MysosServer(scheduler, web_assets_dir))

  et = ExceptionalThread(
      target=server.run, args=('0.0.0.0', options.api_port, 'cherrypy'))
  et.daemon = True
  et.start()

  try:
    # Wait for the scheduler to stop.
    # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the
    # process with SIGINT.
    while not scheduler.stopped.wait(timeout=0.5):
      pass
  except KeyboardInterrupt:
    log.info('Interrupted, exiting.')
  else:
    log.info('Scheduler exited.')

  app.shutdown(1)  # Mysos scheduler is supposed to be long-running thus the use of exit status 1.
def create(args, options):
  """Create a MySQL cluster through the Mysos API and wait until it is usable.

  Posts the cluster spec, saves the generated cluster password to
  options.password_file, waits for a master to be elected, and verifies the
  master answers a trivial query before declaring success.
  """
  validate_common_options(options)

  if not options.num_nodes:
    app.error("--num_nodes is required")
  if not options.cluster_user:
    app.error("--cluster_user is required")

  api_url = 'http://%s:%s/clusters/%s' % (
      options.api_host, options.api_port, options.cluster_name)

  # 'urlencode()' doesn't accept None, so optional fields fall back to ''.
  form = dict(
      num_nodes=int(options.num_nodes),
      cluster_user=options.cluster_user,
      size=options.size or '',
      backup_id=options.backup_id or '',
      cluster_password=options.cluster_password or '')

  request = urllib2.Request(api_url, urllib.urlencode(form))
  try:
    body = urllib2.urlopen(request).read()
  except urllib2.HTTPError as e:
    log.error("POST request failed: %s, %s, %s" % (
        e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
    app.quit(1)

  try:
    cluster_info = json.loads(body)
    if not isinstance(cluster_info, dict):
      raise ValueError()
  except ValueError:
    log.error("Invalid response: %s" % body)
    app.quit(1)

  log.info("Cluster created. Cluster info: %s" % str(cluster_info))

  # Persist the generated password so later subcommands can authenticate.
  with open(options.password_file, 'w') as f:
    f.write(cluster_info["cluster_password"])

  log.info("Waiting for the master for this cluster to be elected...")
  master_endpoint = wait_for_master(cluster_info['cluster_url']).service_endpoint

  connection_str = "mysql://%s:%s@%s:%d/" % (
      options.cluster_user,
      cluster_info["cluster_password"],
      master_endpoint.host,
      master_endpoint.port)
  log.info("Connecting to the MySQL cluster master: %s" % connection_str)
  engine = create_engine(connection_str)

  # Loop for 5 times/seconds to wait for the master to be promoted.
  for attempt in range(5):
    try:
      # TODO(jyx): Test writing to the master and reading from the slave.
      value = engine.execute("SELECT 1;").scalar()
      assert 1 == int(value), "Expecting result to be 1 but got %s" % value
      break
    except OperationalError:
      if attempt == 4:
        raise
      log.debug("MySQL master not ready yet. Sleep for 1 second...")
      time.sleep(1)

  log.info("Cluster successfully started")
def main(args, options):
  """Mysos scheduler entry point.

  Validates required options, restores (or bootstraps) persisted scheduler
  state, registers the framework with the Mesos master, and serves the admin
  HTTP API until the scheduler stops.

  NOTE(review): this is nearly identical to an earlier main() in this file;
  if both are in one module this definition shadows the earlier one --
  confirm these chunks come from separate modules.
  """
  log.info("Options in use: %s", options)

  # Fail fast on any missing required option.
  # NOTE(review): the option is api_port but the message says --port --
  # confirm the user-facing flag name.
  if not options.api_port:
    app.error('Must specify --port')

  if not options.mesos_master:
    app.error('Must specify --mesos_master')

  if not options.framework_user:
    app.error('Must specify --framework_user')

  if not options.executor_uri:
    app.error('Must specify --executor_uri')

  if not options.executor_cmd:
    app.error('Must specify --executor_cmd')

  if not options.zk_url:
    app.error('Must specify --zk_url')

  if not options.admin_keypath:
    app.error('Must specify --admin_keypath')

  try:
    election_timeout = parse_time(options.election_timeout)
    framework_failover_timeout = parse_time(options.framework_failover_timeout)
  except InvalidTime as e:
    # NOTE(review): e.message is Python-2-only; str(e) would be portable.
    app.error(e.message)

  try:
    _, zk_servers, zk_root = zookeeper.parse(options.zk_url)
  except Exception as e:
    app.error("Invalid --zk_url: %s" % e.message)

  # Unpack the bundled web assets so the HTTP server can serve them.
  web_assets_dir = os.path.join(options.work_dir, "web")
  pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH)
  log.info("Extracted web assets into %s" % options.work_dir)

  # Optional framework authentication credential, loaded from a YAML file.
  fw_principal = None
  fw_secret = None
  if options.framework_authentication_file:
    try:
      with open(options.framework_authentication_file, "r") as f:
        # NOTE(review): yaml.load without an explicit Loader can execute
        # arbitrary constructors; confirm this file is operator-controlled
        # (yaml.safe_load would be the safer default).
        cred = yaml.load(f)
      fw_principal = cred["principal"]
      fw_secret = cred["secret"]
      log.info("Loaded credential (principal=%s) for framework authentication" % fw_principal)
    except IOError as e:
      app.error("Unable to read the framework authentication key file: %s" % e)
    except (KeyError, yaml.YAMLError) as e:
      app.error("Invalid framework authentication key file format %s" % e)

  log.info("Starting Mysos scheduler")

  kazoo = KazooClient(zk_servers)
  kazoo.start()

  # Choose the state-storage backend: ZooKeeper or local disk.
  if options.state_storage == 'zk':
    log.info("Using ZooKeeper (path: %s) for state storage" % zk_root)
    state_provider = ZooKeeperStateProvider(kazoo, zk_root)
  else:
    log.info("Using local disk for state storage")
    state_provider = LocalStateProvider(options.work_dir)

  try:
    state = state_provider.load_scheduler_state()
  except StateProvider.Error as e:
    app.error(e.message)

  if state:
    # Failover path: reuse the previously registered FrameworkInfo.
    log.info("Successfully restored scheduler state")
    framework_info = state.framework_info
    if framework_info.HasField('id'):
      log.info("Recovered scheduler's FrameworkID is %s" % framework_info.id.value)
  else:
    # First launch: build fresh FrameworkInfo and persist initial state.
    log.info("No scheduler state to restore")
    framework_info = FrameworkInfo(
        user=options.framework_user,
        name=FRAMEWORK_NAME,
        checkpoint=True,
        failover_timeout=framework_failover_timeout.as_(Time.SECONDS),
        role=options.framework_role)
    if fw_principal:
      framework_info.principal = fw_principal
    state = Scheduler(framework_info)
    state_provider.dump_scheduler_state(state)

  scheduler = MysosScheduler(
      state,
      state_provider,
      options.framework_user,
      options.executor_uri,
      options.executor_cmd,
      kazoo,
      options.zk_url,
      election_timeout,
      options.admin_keypath,
      installer_args=options.installer_args,
      backup_store_args=options.backup_store_args,
      executor_environ=options.executor_environ,
      framework_role=options.framework_role)

  # Register with the Mesos master, authenticated if a credential was loaded.
  if fw_principal and fw_secret:
    cred = Credential(principal=fw_principal, secret=fw_secret)
    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, options.mesos_master, cred)
  else:
    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, options.mesos_master)

  scheduler_driver.start()

  # Serve the admin API from a daemon thread so it dies with the process.
  server = HttpServer()
  server.mount_routes(MysosServer(scheduler, web_assets_dir))

  et = ExceptionalThread(
      target=server.run, args=('0.0.0.0', options.api_port, 'cherrypy'))
  et.daemon = True
  et.start()

  try:
    # Wait for the scheduler to stop.
    # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the
    # process with SIGINT.
    while not scheduler.stopped.wait(timeout=0.5):
      pass
  except KeyboardInterrupt:
    log.info('Interrupted, exiting.')
  else:
    log.info('Scheduler exited.')

  app.shutdown(1)  # Mysos scheduler is supposed to be long-running thus the use of exit status 1.