def main():
    """Parse job-mode arguments and run the given command with a local job runner.

    Prompts for a JMX password when a JMX username was supplied, checks that
    exactly one of --seed-host, --host and --host-file was given, then sets up
    and runs a ``cstar.job.Job`` with a freshly generated UUID4 job id.
    """
    namespace = parse_job_mode()

    # Only ask for a password interactively when a JMX user was supplied.
    if namespace.jmx_username:
        namespace.jmx_password = getpass.getpass(prompt="JMX Password ")
    else:
        namespace.jmx_password = None

    # Summing the booleans counts how many host sources were given.
    host_source_count = (bool(namespace.seed_host) + bool(namespace.host)
                         + bool(namespace.host_file))
    if host_source_count != 1:
        error(
            "Exactly one of --seed-host, --host and --host-file must be used",
            print_traceback=False)

    hosts = None
    if namespace.host_file:
        with open(namespace.host_file) as f:
            # Strip the line terminators (readlines() would keep them as part
            # of each host name) and skip blank lines.
            hosts = [line.strip() for line in f if line.strip()]
    if namespace.host:
        hosts = namespace.host

    cstar.output.configure(namespace.verbose)

    with cstar.job.Job() as job:
        env = {}
        job_id = str(uuid.uuid4())
        msg("Job id is", emph(job_id))

        cstar.signalhandler.print_message_and_save_on_sigint(job, job_id)

        job.setup(hosts=hosts,
                  seeds=namespace.seed_host,
                  command=namespace.command,
                  job_id=job_id,
                  strategy=cstar.strategy.parse(
                      fallback(namespace.strategy, "topology")),
                  cluster_parallel=fallback(namespace.cluster_parallel, False),
                  dc_parallel=fallback(namespace.dc_parallel, False),
                  max_concurrency=namespace.max_concurrency,
                  timeout=namespace.timeout,
                  env=env,
                  stop_after=namespace.stop_after,
                  job_runner=cstar.jobrunner.LocalJobRunner,
                  key_space=namespace.key_space,
                  output_directory=namespace.output_directory,
                  ignore_down_nodes=False,
                  dc_filter=namespace.dc_filter,
                  sleep_on_new_runner=namespace.ssh_pause_time,
                  sleep_after_done=namespace.node_done_pause_time,
                  ssh_username=namespace.ssh_username,
                  ssh_password=namespace.ssh_password,
                  ssh_identity_file=namespace.ssh_identity_file,
                  ssh_lib=namespace.ssh_lib,
                  jmx_username=namespace.jmx_username,
                  jmx_password=namespace.jmx_password,
                  resolve_hostnames=namespace.resolve_hostnames,
                  hosts_variables=namespace.hosts_variables)
        job.run()
def execute_command(args):
    """Run ``args.command`` against the selected hosts with a remote job runner.

    Checks that exactly one of --seed-host, --host and --host-file was given,
    builds the job environment from the command's declared arguments, picks
    the job id (the enforced one if supplied, otherwise a new UUID4), then
    sets up and runs a ``cstar.job.Job``.

    Raises:
        BadArgument: if ``args.enforced_job_id`` is set but is rejected by
            ``validate_uuid4``.
    """
    cstar.output.debug(args)
    command = args.command

    # Summing the booleans counts how many host sources were given.
    host_source_count = (bool(args.seed_host) + bool(args.host)
                         + bool(args.host_file))
    if host_source_count != 1:
        error(
            "Exactly one of --seed-host, --host and --host-file must be used",
            print_traceback=False)

    hosts = None
    if args.host_file:
        with open(args.host_file) as f:
            # Strip the line terminators (readlines() would keep them as part
            # of each host name) and skip blank lines.
            hosts = [line.strip() for line in f if line.strip()]
    if args.host:
        hosts = args.host

    with cstar.job.Job() as job:
        # One environment entry per argument declared by the command, with the
        # value taken from the parsed command line.
        env = {arg.name: getattr(args, arg.name) for arg in command.arguments}

        if args.enforced_job_id:
            job_id = args.enforced_job_id
            if not validate_uuid4(job_id):
                raise BadArgument("Job id is not a valid UUID v4 value.")
        else:
            job_id = str(uuid.uuid4())

        msg("Job id is", emph(job_id))
        msg("Running", command.file)

        cstar.signalhandler.print_message_and_save_on_sigint(job, job_id)

        # Command-line values take precedence over the command's own defaults
        # for strategy and parallelism (order of the fallback() arguments).
        job.setup(hosts=hosts,
                  seeds=args.seed_host,
                  command=command.file,
                  job_id=job_id,
                  strategy=cstar.strategy.parse(
                      fallback(args.strategy, command.strategy, "topology")),
                  cluster_parallel=fallback(args.cluster_parallel,
                                            command.cluster_parallel, False),
                  dc_parallel=fallback(args.dc_parallel, command.dc_parallel,
                                       False),
                  max_concurrency=args.max_concurrency,
                  timeout=args.timeout,
                  env=env,
                  stop_after=args.stop_after,
                  job_runner=cstar.jobrunner.RemoteJobRunner,
                  key_space=args.key_space,
                  output_directory=args.output_directory,
                  ignore_down_nodes=args.ignore_down_nodes,
                  dc_filter=args.dc_filter,
                  sleep_on_new_runner=args.ssh_pause_time,
                  sleep_after_done=args.node_done_pause_time,
                  ssh_username=args.ssh_username,
                  ssh_password=args.ssh_password,
                  ssh_identity_file=args.ssh_identity_file,
                  ssh_lib=args.ssh_lib,
                  jmx_username=args.jmx_username,
                  jmx_password=args.jmx_password,
                  jmx_passwordfile=args.jmx_passwordfile)
        job.run()
def execute_continue(args):
    """Load a previously saved job by id and resume it.

    The job is read with ``cstar.jobreader.read``; a ``FileTooOld`` or
    ``BadFileFormatVersion`` raised while reading is passed to ``error``.
    """
    with cstar.job.Job() as job:
        saved_job_id = args.job_id
        stop_after = args.stop_after
        max_age_days = args.max_job_age
        output_dir = args.output_directory

        try:
            cstar.jobreader.read(job,
                                 saved_job_id,
                                 stop_after,
                                 max_days=max_age_days,
                                 output_directory=output_dir)
        except (FileTooOld, BadFileFormatVersion) as read_failure:
            error(read_failure)

        msg("Resuming job", job.job_id)
        msg("Running ", job.command)

        # Install the SIGINT handler before resuming.
        cstar.signalhandler.print_message_and_save_on_sigint(job, job.job_id)

        job.resume()
def cleanup(max_days, listdir=os.listdir, jobread=cstar.jobreader.read, delete=shutil.rmtree):
    """Delete every entry under ``~/.cstar/jobs`` that cannot be read as a job.

    Each entry is passed to ``jobread`` with ``max_days``; if that call raises
    any ``Exception`` the entry is reported and removed with ``delete``.

    Args:
        max_days: forwarded to ``jobread`` as ``max_days``.
        listdir: callable listing the entries of the jobs directory.
        jobread: callable used to try loading each job.
        delete: callable that removes a job's directory given its full path.
    """
    jobs_root = os.path.expanduser('~/.cstar/jobs')

    def no_endpoint_mapping(x):
        # Stand-in endpoint mapper passed to jobread; always returns None.
        return None

    for entry in listdir(jobs_root):
        try:
            jobread(cstar.job.Job(),
                    entry,
                    stop_after=None,
                    max_days=max_days,
                    endpoint_mapper=no_endpoint_mapping)
        except Exception:
            # Any failure to load the job is treated as grounds for removal.
            msg("Removing job", entry)
            delete(os.path.join(jobs_root, entry))
def handle_finished_jobs(self, finished_jobs):
    """Record the outcome of each finished job and persist the job state.

    ``finished_jobs`` is iterated as ``(host, result)`` pairs. A non-zero
    ``result.status`` marks the host as failed, records the error and clears
    ``self.do_loop``; otherwise the host is marked as done and, if
    ``self.sleep_after_done`` is set, the method sleeps that many seconds.
    Afterwards the job is written with ``cstar.jobwriter.write`` and every
    host is added to ``self.handled_finished_jobs``.
    """
    debug("Processing ", len(finished_jobs), " finished jobs")

    for host, result in finished_jobs:
        if result.status == 0:
            self.state = self.state.with_done(host)
            info("Host %s finished successfully" % (host.fqdn, ))
            if result.out:
                info("stdout:", result.out, sep="\n")
            if result.err:
                info("stderr:", result.err)
            pause = self.sleep_after_done
            if pause:
                debug("Sleeping %d seconds..." % pause)
                time.sleep(pause)
            continue

        # Non-zero status: record the failure and clear the do_loop flag.
        self.errors.append((host, result))
        self.state = self.state.with_failed(host)
        msg("Failure on host", host.fqdn)
        if result.out:
            msg("stdout:", result.out)
        if result.err:
            msg("stderr:", result.err)
        self.do_loop = False

    cstar.jobwriter.write(self)

    # Signal the jobrunner that it can delete the remote job files and terminate.
    for host, _result in finished_jobs:
        self.handled_finished_jobs.add(host)
def print_outcome(self):
    """Print a summary of how the job ended.

    When the state is done and no errors were recorded, a success message is
    printed; if the number of finished hosts equals ``stop_after`` the job is
    also written out and a hint on how to continue it is printed first.
    Otherwise a breakdown of succeeded, failed and not-started nodes is
    printed.
    """
    finished_cleanly = self.state.is_done() and not self.errors

    if not finished_cleanly:
        progress = self.state.progress
        done_count = len(progress.done)
        failed_count = len(progress.failed)
        # Whatever is neither done nor failed never started executing.
        not_started_count = (len(self.state.original_topology)
                             - done_count - failed_count)
        msg("Job", self.job_id,
            "finished with errors.\n"
            "%s nodes finished successfully\n"
            "%s nodes had errors\n"
            "%s nodes didn't start executing"
            % (done_count, failed_count, not_started_count))
        return

    if len(self.state.progress.done) == self.state.stop_after:
        cstar.jobwriter.write(self)
        continue_hint = emph("cstar continue %s" % (self.job_id,))
        msg("Job", self.job_id, "successfully ran on", self.state.stop_after,
            "hosts.\nTo finish the job, run", continue_hint)
    # NOTE(review): this is also printed after the stop_after hint above;
    # confirm that is intended.
    msg("Job", self.job_id, "finished successfully")
def execute_continue(args):
    """Load a previously saved job by id and resume it, optionally retrying failures.

    ``args.retry_failed`` is forwarded to ``cstar.jobreader.read`` as
    ``retry``. A ``FileTooOld`` or ``BadFileFormatVersion`` raised while
    reading is passed to ``error``. If the loaded job has a JMX username but
    no JMX password file, the password is prompted for before resuming.
    """
    msg("Retry : ", args.retry_failed)

    with cstar.job.Job() as job:
        saved_job_id = args.job_id
        stop_after = args.stop_after
        max_age_days = args.max_job_age
        output_dir = args.output_directory
        retry_failed = args.retry_failed

        try:
            cstar.jobreader.read(job,
                                 saved_job_id,
                                 stop_after,
                                 max_days=max_age_days,
                                 output_directory=output_dir,
                                 retry=retry_failed)
        except (FileTooOld, BadFileFormatVersion) as read_failure:
            error(read_failure)

        msg("Resuming job", job.job_id)

        # Prompt only when a JMX user is set and no password file is set.
        needs_password_prompt = job.jmx_username and not job.jmx_passwordfile
        if needs_password_prompt:
            job.jmx_password = getpass.getpass(prompt="JMX Password ")

        msg("Running ", job.command)

        cstar.signalhandler.print_message_and_save_on_sigint(job, job.job_id)

        job.resume()
def setup(self, hosts, seeds, command, job_id, strategy, cluster_parallel, dc_parallel, job_runner,
          max_concurrency, timeout, env, stop_after, key_space, output_directory, ignore_down_nodes,
          dc_filter, sleep_on_new_runner, sleep_after_done, ssh_username, ssh_password,
          ssh_identity_file):
    """Configure the job, discover the cluster topology and build the initial state.

    Stores the run parameters on ``self``, makes sure the output directory
    exists, loads the topology (from ``seeds`` if given, otherwise from
    ``hosts``), optionally generates the endpoint mapping, and finally
    creates ``self.state``.

    Args:
        hosts: host names to run on; used only when ``seeds`` is falsy.
        seeds: seed hosts from which the cluster topology is discovered.
        command, job_id, timeout, env, job_runner, key_space: stored on
            ``self`` unchanged.
        strategy: when it is ``cstar.strategy.Strategy.TOPOLOGY`` an endpoint
            mapping is generated, otherwise the mapping is ``None``.
        cluster_parallel, dc_parallel, max_concurrency, stop_after,
            ignore_down_nodes: forwarded to ``cstar.state.State``.
        output_directory: where job output goes; defaults to
            ``~/.cstar/jobs/<job_id>`` when falsy.
        dc_filter: when truthy and ``seeds`` is used, the topology to run on
            is narrowed with ``with_dc(dc_filter)``.
        sleep_on_new_runner, sleep_after_done, ssh_username, ssh_password,
            ssh_identity_file: stored on ``self`` unchanged.
    """
    msg("Starting setup")
    msg("Strategy:", cstar.strategy.serialize(strategy))
    msg("DC parallel:", dc_parallel)
    msg("Cluster parallel:", cluster_parallel)

    self.command = command
    self.job_id = job_id
    self.timeout = timeout
    self.env = env
    self.job_runner = job_runner
    self.key_space = key_space
    self.output_directory = output_directory or os.path.expanduser(
        "~/.cstar/jobs/" + job_id)
    self.sleep_on_new_runner = sleep_on_new_runner
    self.sleep_after_done = sleep_after_done
    self.ssh_username = ssh_username
    self.ssh_password = ssh_password
    self.ssh_identity_file = ssh_identity_file

    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(self.output_directory, exist_ok=True)

    msg("Loading cluster topology")
    if seeds:
        current_topology = cstar.topology.Topology([])
        for seed in seeds:
            current_topology = current_topology | self.get_cluster_topology(
                (seed, ))
        original_topology = current_topology
        if dc_filter:
            original_topology = original_topology.with_dc(dc_filter)
    else:
        # NOTE(review): dc_filter is not applied on this path - confirm intended.
        current_topology = cstar.topology.Topology()
        # Resolve every host exactly once so the membership set and the
        # discovery loop are guaranteed to see the same addresses.
        resolved_hosts = [socket.gethostbyname(host) for host in hosts]
        hosts_ip_set = set(resolved_hosts)
        for host_ip in resolved_hosts:
            if host_ip in current_topology:
                continue
            current_topology = current_topology | self.get_cluster_topology(
                (host_ip, ))
        # Run only on the hosts that were explicitly requested.
        original_topology = cstar.topology.Topology(
            host for host in current_topology if host.ip in hosts_ip_set)
    msg("Done loading cluster topology")

    debug("Run on hosts", original_topology)
    debug("in topology", current_topology)

    msg("Generating endpoint mapping")
    if strategy is cstar.strategy.Strategy.TOPOLOGY:
        endpoint_mapping = self.get_endpoint_mapping(current_topology)
        msg("Done generating endpoint mapping")
    else:
        endpoint_mapping = None
        msg("Skipping endpoint mapping because of selected strategy")

    self.state = cstar.state.State(original_topology,
                                   strategy,
                                   endpoint_mapping,
                                   cluster_parallel,
                                   dc_parallel,
                                   max_concurrency,
                                   current_topology=current_topology,
                                   stop_after=stop_after,
                                   ignore_down_nodes=ignore_down_nodes)
    msg("Setup done")
def execute_cleanup(args):
    """Announce the cleanup and run ``cstar.cleanup.cleanup`` with ``args.max_job_age``."""
    msg('Cleaning up old jobs')
    max_age_days = args.max_job_age
    cstar.cleanup.cleanup(max_age_days)