Example #1
def main():
    namespace = parse_job_mode()

    if namespace.jmx_username:
        namespace.jmx_password = getpass.getpass(prompt="JMX Password ")
    else:
        namespace.jmx_password = None

    if bool(namespace.seed_host) + bool(namespace.host) + bool(
            namespace.host_file) != 1:
        error(
            "Exactly one of --seed-host, --host and --host-file must be used",
            print_traceback=False)

    hosts = None

    if namespace.host_file:
        with open(namespace.host_file) as f:
            hosts = f.readlines()

    if namespace.host:
        hosts = namespace.host

    cstar.output.configure(namespace.verbose)

    with cstar.job.Job() as job:
        env = {}
        job_id = str(uuid.uuid4())
        msg("Job id is", emph(job_id))

        cstar.signalhandler.print_message_and_save_on_sigint(job, job_id)

        job.setup(hosts=hosts,
                  seeds=namespace.seed_host,
                  command=namespace.command,
                  job_id=job_id,
                  strategy=cstar.strategy.parse(
                      fallback(namespace.strategy, "topology")),
                  cluster_parallel=fallback(namespace.cluster_parallel, False),
                  dc_parallel=fallback(namespace.dc_parallel, False),
                  max_concurrency=namespace.max_concurrency,
                  timeout=namespace.timeout,
                  env=env,
                  stop_after=namespace.stop_after,
                  job_runner=cstar.jobrunner.LocalJobRunner,
                  key_space=namespace.key_space,
                  output_directory=namespace.output_directory,
                  ignore_down_nodes=False,
                  dc_filter=namespace.dc_filter,
                  sleep_on_new_runner=namespace.ssh_pause_time,
                  sleep_after_done=namespace.node_done_pause_time,
                  ssh_username=namespace.ssh_username,
                  ssh_password=namespace.ssh_password,
                  ssh_identity_file=namespace.ssh_identity_file,
                  ssh_lib=namespace.ssh_lib,
                  jmx_username=namespace.jmx_username,
                  jmx_password=namespace.jmx_password,
                  resolve_hostnames=namespace.resolve_hostnames,
                  hosts_variables=namespace.hosts_variables)
        job.run()
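
The setup call above chains defaults through a fallback() helper. As a rough sketch (an assumption, not necessarily the library's actual source), it can be read as returning the first argument that is not None, so a command-line flag overrides a per-command default before dropping to the hard-coded value:

def fallback(*values):
    # Return the first value that is not None; None means "not specified".
    for value in values:
        if value is not None:
            return value
    return None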
Example #2
def execute_command(args):
    cstar.output.debug(args)
    command = args.command
    if bool(args.seed_host) + bool(args.host) + bool(args.host_file) != 1:
        error(
            "Exactly one of --seed-host, --host and --host-file must be used",
            print_traceback=False)

    hosts = None

    if args.host_file:
        with open(args.host_file) as f:
            hosts = f.readlines()

    if args.host:
        hosts = args.host

    with cstar.job.Job() as job:
        env = dict(
            (arg.name, getattr(args, arg.name)) for arg in command.arguments)
        if bool(args.enforced_job_id) == 1:
            job_id = args.enforced_job_id
            if not (validate_uuid4(job_id)):
                raise BadArgument("Job id is not a valid UUID v4 value.")
        else:
            job_id = str(uuid.uuid4())
        msg("Job id is", emph(job_id))
        msg("Running", command.file)

        cstar.signalhandler.print_message_and_save_on_sigint(job, job_id)

        job.setup(hosts=hosts,
                  seeds=args.seed_host,
                  command=command.file,
                  job_id=job_id,
                  strategy=cstar.strategy.parse(
                      fallback(args.strategy, command.strategy, "topology")),
                  cluster_parallel=fallback(args.cluster_parallel,
                                            command.cluster_parallel, False),
                  dc_parallel=fallback(args.dc_parallel, command.dc_parallel,
                                       False),
                  max_concurrency=args.max_concurrency,
                  timeout=args.timeout,
                  env=env,
                  stop_after=args.stop_after,
                  job_runner=cstar.jobrunner.RemoteJobRunner,
                  key_space=args.key_space,
                  output_directory=args.output_directory,
                  ignore_down_nodes=args.ignore_down_nodes,
                  dc_filter=args.dc_filter,
                  sleep_on_new_runner=args.ssh_pause_time,
                  sleep_after_done=args.node_done_pause_time,
                  ssh_username=args.ssh_username,
                  ssh_password=args.ssh_password,
                  ssh_identity_file=args.ssh_identity_file,
                  ssh_lib=args.ssh_lib,
                  jmx_username=args.jmx_username,
                  jmx_password=args.jmx_password,
                  jmx_passwordfile=args.jmx_passwordfile)
        job.run()
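
Example #2 guards enforced job ids with validate_uuid4(). A minimal sketch of such a check, assuming it only needs to accept strings that parse as a version 4 UUID (hypothetical implementation, not necessarily the one cstar ships):

import uuid

def validate_uuid4(candidate):
    # Accept only values that parse as a UUID and report version 4.
    try:
        return uuid.UUID(candidate).version == 4
    except (TypeError, ValueError, AttributeError):
        return False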
Example #3
def execute_continue(args):
    with cstar.job.Job() as job:
        try:
            cstar.jobreader.read(job, args.job_id, args.stop_after, max_days=args.max_job_age,
                                 output_directory=args.output_directory)
        except (FileTooOld, BadFileFormatVersion) as e:
            error(e)
        msg("Resuming job", job.job_id)
        msg("Running ", job.command)

        cstar.signalhandler.print_message_and_save_on_sigint(job, job.job_id)

        job.resume()
Example #4
def cleanup(max_days,
            listdir=os.listdir,
            jobread=cstar.jobreader.read,
            delete=shutil.rmtree):
    job_dir = os.path.expanduser('~/.cstar/jobs')
    for job_id in listdir(job_dir):
        try:
            jobread(cstar.job.Job(),
                    job_id,
                    stop_after=None,
                    max_days=max_days,
                    endpoint_mapper=lambda x: None)
        except Exception:
            msg("Removing job", job_id)
            full_name = os.path.join(job_dir, job_id)
            delete(full_name)
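
Because cleanup() takes listdir, jobread and delete as injectable parameters, it can be exercised without touching ~/.cstar/jobs. A hedged usage sketch, assuming this is the function exposed as cstar.cleanup.cleanup (the fake job id and the simulated failure are illustrative only):

import cstar.cleanup

def failing_jobread(*args, **kwargs):
    # Simulate a stale or unreadable job file so cleanup() takes the delete path.
    raise IOError("stale job file")

removed = []
cstar.cleanup.cleanup(
    max_days=7,
    listdir=lambda path: ["11111111-1111-4111-8111-111111111111"],
    jobread=failing_jobread,
    delete=removed.append)
# removed now lists the directories cleanup() would have deleted.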
Example #5
    def handle_finished_jobs(self, finished_jobs):
        debug("Processing ", len(finished_jobs), " finished jobs")
        for finished_job in finished_jobs:
            host = finished_job[0]
            result = finished_job[1]
            if result.status != 0:
                self.errors.append((host, result))
                self.state = self.state.with_failed(host)
                msg("Failure on host", host.fqdn)
                if result.out:
                    msg("stdout:", result.out)
                if result.err:
                    msg("stderr:", result.err)
                self.do_loop = False
            else:
                self.state = self.state.with_done(host)
                info("Host %s finished successfully" % (host.fqdn,))
                if result.out:
                    info("stdout:", result.out, sep="\n")
                if result.err:
                    info("stderr:", result.err)
                if self.sleep_after_done:
                    debug("Sleeping %d seconds..." % self.sleep_after_done)
                    time.sleep(self.sleep_after_done)
        cstar.jobwriter.write(self)
        # Signal the jobrunner that it can delete the remote job files and terminate.
        for finished_job in finished_jobs:
            host, result = finished_job
            self.handled_finished_jobs.add(host)
Example #6
    def print_outcome(self):
        if self.state.is_done() and not self.errors:
            if len(self.state.progress.done) == self.state.stop_after:
                cstar.jobwriter.write(self)
                msg("Job", self.job_id, "successfully ran on", self.state.stop_after, "hosts.\nTo finish the job, run",
                    emph("cstar continue %s" % (self.job_id,)))

            msg("Job", self.job_id, "finished successfully")
        else:
            msg("Job", self.job_id, "finished with errors.\n"
                                    "%s nodes finished successfully\n"
                                    "%s nodes had errors\n"
                                    "%s nodes didn't start executing"
                                    % (len(self.state.progress.done),
                                       len(self.state.progress.failed),
                                       len(self.state.original_topology) - len(self.state.progress.done) - len(self.state.progress.failed)))
Example #7
def execute_continue(args):
    msg("Retry : ", args.retry_failed)
    with cstar.job.Job() as job:
        try:
            cstar.jobreader.read(job, args.job_id, args.stop_after, max_days=args.max_job_age,
                                 output_directory=args.output_directory, retry=args.retry_failed)
        except (FileTooOld, BadFileFormatVersion) as e:
            error(e)
        msg("Resuming job", job.job_id)

        if job.jmx_username and not job.jmx_passwordfile:
            job.jmx_password = getpass.getpass(prompt="JMX Password ")

        msg("Running ", job.command)

        cstar.signalhandler.print_message_and_save_on_sigint(job, job.job_id)

        job.resume()
Example #8
    def setup(self, hosts, seeds, command, job_id, strategy, cluster_parallel,
              dc_parallel, job_runner, max_concurrency, timeout, env,
              stop_after, key_space, output_directory, ignore_down_nodes,
              dc_filter, sleep_on_new_runner, sleep_after_done, ssh_username,
              ssh_password, ssh_identity_file):

        msg("Starting setup")

        msg("Strategy:", cstar.strategy.serialize(strategy))
        msg("DC parallel:", dc_parallel)
        msg("Cluster parallel:", cluster_parallel)

        self.command = command
        self.job_id = job_id
        self.timeout = timeout
        self.env = env
        self.job_runner = job_runner
        self.key_space = key_space
        self.output_directory = output_directory or os.path.expanduser(
            "~/.cstar/jobs/" + job_id)
        self.sleep_on_new_runner = sleep_on_new_runner
        self.sleep_after_done = sleep_after_done
        self.ssh_username = ssh_username
        self.ssh_password = ssh_password
        self.ssh_identity_file = ssh_identity_file
        if not os.path.exists(self.output_directory):
            os.makedirs(self.output_directory)

        msg("Loading cluster topology")
        if seeds:
            current_topology = cstar.topology.Topology([])
            for seed in seeds:
                current_topology = current_topology | self.get_cluster_topology(
                    (seed, ))
            original_topology = current_topology
            if dc_filter:
                original_topology = original_topology.with_dc(dc_filter)
        else:
            current_topology = cstar.topology.Topology()
            hosts_ip_set = set(socket.gethostbyname(host) for host in hosts)
            for raw_host in hosts:
                host = socket.gethostbyname(raw_host)
                if host in current_topology:
                    continue
                current_topology = current_topology | self.get_cluster_topology(
                    (host, ))
            original_topology = cstar.topology.Topology(
                host for host in current_topology if host.ip in hosts_ip_set)
        msg("Done loading cluster topology")

        debug("Run on hosts", original_topology)
        debug("in topology", current_topology)

        msg("Generating endpoint mapping")
        if strategy is cstar.strategy.Strategy.TOPOLOGY:
            endpoint_mapping = self.get_endpoint_mapping(current_topology)
            msg("Done generating endpoint mapping")
        else:
            endpoint_mapping = None
            msg("Skipping endpoint mapping because of selected strategy")

        self.state = cstar.state.State(original_topology,
                                       strategy,
                                       endpoint_mapping,
                                       cluster_parallel,
                                       dc_parallel,
                                       max_concurrency,
                                       current_topology=current_topology,
                                       stop_after=stop_after,
                                       ignore_down_nodes=ignore_down_nodes)
        msg("Setup done")
Example #9
def execute_cleanup(args):
    msg('Cleaning up old jobs')
    cstar.cleanup.cleanup(args.max_job_age)
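
All of these entry points consume an argparse namespace, so wiring the cleanup command could look roughly like the sketch below. The sub-command name and defaults are assumptions, not cstar's actual CLI definition:

import argparse

parser = argparse.ArgumentParser(prog="cstar")
subparsers = parser.add_subparsers()
cleanup_parser = subparsers.add_parser("cleanup-jobs", help="Remove expired job directories")
cleanup_parser.add_argument("--max-job-age", type=int, default=7,
                            help="Maximum age, in days, before a saved job is purged")
cleanup_parser.set_defaults(func=execute_cleanup)

args = parser.parse_args(["cleanup-jobs", "--max-job-age", "30"])
args.func(args)  # dispatches to execute_cleanup(args) from Example #9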