def get_endpoint_mapping(self, topology):
    """Build the merged endpoint mapping by querying a single working host.

    Hosts returned by ``topology.get_up()`` are tried in order. On each one,
    ``nodetool describering`` is run for every keyspace except ``system``
    (only ``self.key_space`` when it is set) and the parsed output is turned
    into an endpoint mapping. The first host on which at least one keyspace
    was queried and no command failed provides the result.

    Args:
        topology: object exposing ``get_up()``; also handed to
            ``cstar.endpoint_mapping.parse``.

    Returns:
        The value of ``cstar.endpoint_mapping.merge`` over the mappings
        collected from the successful host.

    Raises:
        HostIsDown: after ``MAX_ATTEMPTS`` hosts failed, or when there is no
            host left to try.
    """
    count = 0
    tried_hosts = []
    for host in topology.get_up():
        tried_hosts.append(host)
        conn = self._connection(host)

        if self.key_space:
            keyspaces = [self.key_space]
        else:
            keyspaces = self.get_keyspaces(conn)

        mappings = []
        # Always bind the flag before the loop: previously it was only set
        # once a non-system keyspace had been queried, so an empty keyspace
        # list raised UnboundLocalError (or reused the previous host's value).
        # A host that yields nothing counts as a failed attempt.
        has_error = True
        for keyspace in keyspaces:
            if keyspace == "system":
                continue
            debug("Fetching endpoint mapping for keyspace", keyspace)
            res = conn.run(("nodetool", "describering", keyspace))
            if res.status != 0:
                has_error = True
                break
            has_error = False
            describering = cstar.nodetoolparser.parse_nodetool_describering(res.out)
            range_mapping = cstar.nodetoolparser.convert_describering_to_range_mapping(describering)
            mappings.append(cstar.endpoint_mapping.parse(range_mapping, topology, lookup=ip_lookup))

        if not has_error:
            return cstar.endpoint_mapping.merge(mappings)

        count += 1
        if count >= MAX_ATTEMPTS:
            break

    raise HostIsDown("Could not find any working host while fetching endpoint mapping. Tried the following hosts:",
                     ", ".join(host.fqdn for host in tried_hosts))
def schedule_job(self, host):
    """Start a job runner for *host* on its own thread.

    The runner is built by calling ``self.job_runner`` with this job, the
    host and the SSH credentials, and is used as the target of a thread named
    after the host's fqdn. The call then sleeps for
    ``self.sleep_on_new_runner`` seconds before returning.
    """
    debug("Running on host", host.fqdn)
    runner = self.job_runner(
        self,
        host,
        self.ssh_username,
        self.ssh_password,
        self.ssh_identity_file,
    )
    worker = threading.Thread(target=runner, name="cstar %s" % host.fqdn)
    worker.start()
    time.sleep(self.sleep_on_new_runner)
def _connect(self):
    """Make sure ``self.client`` holds a usable paramiko SSH client.

    An existing client is probed by executing ``PING_COMMAND``; if that
    raises, the client is dropped. When there is no client, a new one is
    created with an auto-add host key policy and connected to
    ``self.hostname`` with the configured username, password and optional
    RSA identity file.

    Raises:
        BadSSHHost: if the new connection cannot be established. The
            underlying error is attached as ``__cause__``.
    """
    if self.client:
        # Ensure underlying client is still a valid open connection
        try:
            self.client.exec_command(PING_COMMAND)
        except (ConnectionResetError, paramiko.ssh_exception.SSHException):
            # ConnectionResetError is raised when a connection was established but then broken
            # paramiko.ssh_exception.SSHException is raised if the connection was known to be broken
            self.client = None

    if self.client:
        return

    try:
        self.client = paramiko.client.SSHClient()
        pkey = None
        if self.ssh_identity_file is not None:
            pkey = paramiko.RSAKey.from_private_key_file(
                self.ssh_identity_file, None)

        # NOTE(review): the argument of this call was unreadable in the
        # reviewed source; logging the username (never the password) is the
        # assumed intent - confirm.
        debug("Username : ", self.ssh_username)
        debug("Id file: ", self.ssh_identity_file)
        self.client.set_missing_host_key_policy(
            paramiko.client.AutoAddPolicy())
        self.client.connect(self.hostname,
                            compress=True,
                            username=self.ssh_username,
                            password=self.ssh_password,
                            pkey=pkey)
    except Exception as exc:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not reported as an unreachable host.
        self.client = None
        raise BadSSHHost(
            "Could not establish an SSH connection to host %s" %
            (self.hostname, )) from exc
def handle_finished_jobs(self, finished_jobs):
    """Record the outcome of each finished job and persist the job.

    Each entry of *finished_jobs* is a ``(host, result)`` pair. A zero
    ``result.status`` marks the host as done, logs its output and optionally
    sleeps ``self.sleep_after_done`` seconds. Any other status records the
    error, marks the host as failed and clears ``self.do_loop``. Afterwards
    the job is written with ``cstar.jobwriter.write`` and every host is added
    to ``self.handled_finished_jobs``.
    """
    debug("Processing ", len(finished_jobs), " finished jobs")

    for entry in finished_jobs:
        host, result = entry[0], entry[1]

        if result.status == 0:
            self.state = self.state.with_done(host)
            info("Host %s finished successfully" % (host.fqdn, ))
            if result.out:
                info("stdout:", result.out, sep="\n")
            if result.err:
                info("stderr:", result.err)
            if self.sleep_after_done:
                debug("Sleeping %d seconds..." % self.sleep_after_done)
                time.sleep(self.sleep_after_done)
            continue

        # Non-zero exit status: remember the failure and stop the main loop.
        self.errors.append((host, result))
        self.state = self.state.with_failed(host)
        msg("Failure on host", host.fqdn)
        if result.out:
            msg("stdout:", result.out)
        if result.err:
            msg("stderr:", result.err)
        self.do_loop = False

    cstar.jobwriter.write(self)

    # Signal the jobrunner that it can delete the remote job files and terminate.
    for host, _result in finished_jobs:
        self.handled_finished_jobs.add(host)
def resume_on_running_hosts(self):
    """Restart a job runner thread for every host listed as running.

    For each host in ``self.state.progress.running`` a runner is built from
    ``self.job_runner`` (with the SSH credentials, SSH library and host
    variables) and started on a thread named after the host's fqdn. The loop
    sleeps ``self.sleep_on_new_runner`` seconds after each start.
    """
    for host in self.state.progress.running:
        debug("Resume on host", host.fqdn)
        runner = self.job_runner(
            self,
            host,
            self.ssh_username,
            self.ssh_password,
            self.ssh_identity_file,
            self.ssh_lib,
            self.hosts_variables,
        )
        worker = threading.Thread(target=runner, name="cstar %s" % host.fqdn)
        worker.start()
        time.sleep(self.sleep_on_new_runner)
def get_endpoint_mapping(self, topology):
    """Return the merged endpoint mapping, querying one host per cluster.

    A cached value from ``self.maybe_get_data_from_cache`` is returned when
    available. Otherwise the hosts of ``topology.get_up()`` are visited in
    order. Clusters that already produced a mapping are skipped. On each
    remaining host ``nodetool describering`` is run for every keyspace whose
    name does not start with ``system`` (only ``self.key_space`` when set).
    A host succeeds when at least one keyspace was queried and no command
    failed.

    Args:
        topology: object exposing ``get_up()``; also handed to
            ``cstar.endpoint_mapping.parse``.

    Returns:
        The value of ``cstar.endpoint_mapping.merge`` over all collected
        mappings. It is also pickled to the cache file when every visited
        cluster was resolved.

    Raises:
        HostIsDown: when ``MAX_ATTEMPTS`` hosts of the same cluster failed.
    """
    endpoint_mappings = self.maybe_get_data_from_cache("endpoint_mapping")
    if endpoint_mappings is not None:
        return endpoint_mappings

    resolved_clusters = set()
    # cluster -> hosts of that cluster on which the lookup failed
    failures_by_cluster = {}
    failed_hosts = []
    mappings = []

    for host in topology.get_up():
        if host.cluster in resolved_clusters:
            # We need to fetch keyspaces on one node per cluster, no more.
            continue

        conn = self._connection(host)
        if self.key_space:
            keyspaces = [self.key_space]
        else:
            keyspaces = self.get_keyspaces(conn)

        # Stage this host's mappings so that a host failing halfway does not
        # leave partial entries that the next host would then duplicate.
        host_mappings = []
        has_error = True
        for keyspace in keyspaces:
            if keyspace.startswith("system"):
                continue
            debug("Fetching endpoint mapping for keyspace", keyspace)
            res = self.run_nodetool(conn, "describering", keyspace)
            if res.status != 0:
                has_error = True
                break
            has_error = False
            describering = cstar.nodetoolparser.parse_nodetool_describering(
                res.out)
            range_mapping = cstar.nodetoolparser.convert_describering_to_range_mapping(
                describering)
            host_mappings.append(
                cstar.endpoint_mapping.parse(range_mapping,
                                             topology,
                                             lookup=ip_lookup))

        if not has_error:
            mappings.extend(host_mappings)
            resolved_clusters.add(host.cluster)
            continue

        # Failures are counted per cluster. The previous counter was reset on
        # every host, so the MAX_ATTEMPTS limit could never trigger.
        cluster_failures = failures_by_cluster.setdefault(host.cluster, [])
        cluster_failures.append(host)
        if len(cluster_failures) >= MAX_ATTEMPTS:
            failed_hosts = cluster_failures
            break

    if failed_hosts:
        raise HostIsDown("Following hosts couldn't be reached: {}".format(
            ', '.join(host.fqdn for host in failed_hosts)))

    endpoint_mappings = cstar.endpoint_mapping.merge(mappings)

    unresolved = [cluster for cluster in failures_by_cluster
                  if cluster not in resolved_clusters]
    if unresolved:
        # Do not persist a mapping known to be missing whole clusters.
        debug("Not caching endpoint mapping, unresolved clusters:", unresolved)
    else:
        with open(self.get_cache_file_path("endpoint_mapping"), 'wb') as cache_fh:
            pickle.dump(dict(endpoint_mappings), cache_fh)
    return endpoint_mappings
def _parse(input, file, output_directory, job, job_id, stop_after, max_days, endpoint_mapper):
    """Populate *job* from the JSON text of a previously written job file.

    Args:
        input: JSON document as text.
        file: name of the file the text came from (used in error messages).
        output_directory: stored on ``job.output_directory``.
        job: object whose attributes and ``state`` are filled in.
        job_id: stored on ``job.job_id``.
        stop_after: forwarded to ``cstar.state.State``.
        max_days: maximum accepted age of the job, in whole days.
        endpoint_mapper: callable invoked with the original topology when the
            stored strategy is ``Strategy.TOPOLOGY``.

    Raises:
        BadFileFormatVersion: the document has no ``version`` key or its
            version differs from ``cstar.jobwriter.FILE_FORMAT_VERSION``.
        FileTooOld: the job was created more than *max_days* days ago.
    """
    data = json.loads(input)
    wanted_version = cstar.jobwriter.FILE_FORMAT_VERSION

    if 'version' not in data:
        raise BadFileFormatVersion("Incompatible file format version, wanted %d" % (wanted_version,))
    if data['version'] != wanted_version:
        # The file's version is untrusted input: format it with %s so a
        # non-numeric value still yields BadFileFormatVersion, not TypeError.
        raise BadFileFormatVersion("Incompatible file format version, wanted %d but %s is of version %s" %
                                   (wanted_version, file, data['version']))

    # Timezone-aware UTC arithmetic; same day count as the deprecated
    # utcfromtimestamp()/utcnow() pair.
    utc = datetime.timezone.utc
    creation_time = datetime.datetime.fromtimestamp(data["creation_timestamp"], tz=utc)
    age = (datetime.datetime.now(tz=utc) - creation_time).days
    if age > max_days:
        raise FileTooOld(("Job created %d days ago, which is more than the current maximum age of %d. " +
                          "Use --max-job-age %d if you really want to run this job.") % (age, max_days, age + 1))

    state = data['state']

    job.command = data['command']
    job.job_id = job_id
    job.timeout = data['timeout']
    job.env = data['env']
    job.job_runner = getattr(cstar.jobrunner, data["job_runner"])
    job.key_space = data.get('key_space')
    job.output_directory = output_directory
    job.sleep_on_new_runner = data['sleep_on_new_runner']

    strategy = cstar.strategy.parse(state['strategy'])
    cluster_parallel = state['cluster_parallel']
    dc_parallel = state['dc_parallel']
    max_concurrency = state['max_concurrency']

    stored_progress = state['progress']
    progress = cstar.progress.Progress(
        running=[cstar.topology.Host(*arr) for arr in stored_progress['running']],
        done=[cstar.topology.Host(*arr) for arr in stored_progress['done']],
        failed=[cstar.topology.Host(*arr) for arr in stored_progress['failed']])

    original_topology = cstar.topology.Topology(cstar.topology.Host(*arr) for arr in state['original_topology'])
    current_topology = cstar.topology.Topology(cstar.topology.Host(*arr) for arr in state['current_topology'])

    debug("Run on hosts", original_topology)
    debug("in topology", current_topology)

    # The endpoint mapping is only computed for the topology strategy.
    if strategy is cstar.strategy.Strategy.TOPOLOGY:
        endpoint_mapping = endpoint_mapper(original_topology)
    else:
        endpoint_mapping = None

    job.state = cstar.state.State(
        original_topology=original_topology,
        strategy=strategy,
        endpoint_mapping=endpoint_mapping,
        cluster_parallel=cluster_parallel,
        dc_parallel=dc_parallel,
        max_concurrency=max_concurrency,
        current_topology=current_topology,
        stop_after=stop_after,
        progress=progress,
        ignore_down_nodes=state['ignore_down_nodes'])
def get_cache_file_path(self, cache_type):
    """Return the path of the cache file for *cache_type*.

    The file name is the cache type followed by the sorted, dash-joined
    ``self.schema_versions`` and ``self.status_topology_hash`` values. It is
    placed under ``self.cache_directory`` and also logged at debug level.
    """
    schema_part = "-".join(sorted(self.schema_versions))
    topology_part = "-".join(sorted(self.status_topology_hash))

    debug("Cache file: {}-{}-{}".format(cache_type, schema_part, topology_part))

    file_name = "{}-{}-{}".format(cache_type, schema_part, topology_part)
    return os.path.join(self.cache_directory, file_name)
def setup(self, hosts, seeds, command, job_id, strategy, cluster_parallel, dc_parallel, job_runner, max_concurrency,
          timeout, env, stop_after, key_space, output_directory, ignore_down_nodes, dc_filter,
          sleep_on_new_runner, sleep_after_done):
    """Configure the job, load the cluster topology and build the initial state.

    The job parameters are stored on ``self`` and the output directory is
    created (defaulting to ``~/.cstar/jobs/`` plus the job id). The topology
    is then loaded, either from every seed in *seeds* (optionally restricted
    with *dc_filter* for the hosts to run on) or from the resolved addresses
    of *hosts*, in which case only those addresses are kept as hosts to run
    on. An endpoint mapping is generated only for the topology strategy.
    Finally ``self.state`` is created.
    """
    msg("Starting setup")

    msg("Strategy:", cstar.strategy.serialize(strategy))
    msg("DC parallel:", dc_parallel)
    msg("Cluster parallel:", cluster_parallel)

    self.command = command
    self.job_id = job_id
    self.timeout = timeout
    self.env = env
    self.job_runner = job_runner
    self.key_space = key_space
    self.output_directory = output_directory or os.path.expanduser("~/.cstar/jobs/" + job_id)
    self.sleep_on_new_runner = sleep_on_new_runner
    self.sleep_after_done = sleep_after_done
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(self.output_directory, exist_ok=True)

    msg("Loading cluster topology")
    if seeds:
        current_topology = cstar.topology.Topology([])
        for seed in seeds:
            current_topology = current_topology | self.get_cluster_topology((seed,))
        original_topology = current_topology
        if dc_filter:
            original_topology = original_topology.with_dc(dc_filter)
    else:
        current_topology = cstar.topology.Topology()
        # Resolve every host exactly once. The filter set and the lookup loop
        # then always agree, and a one-shot iterable of hosts still works.
        resolved_ips = [socket.gethostbyname(raw_host) for raw_host in hosts]
        hosts_ip_set = set(resolved_ips)
        for host in resolved_ips:
            if host in current_topology:
                continue
            current_topology = current_topology | self.get_cluster_topology((host,))
        original_topology = cstar.topology.Topology(host for host in current_topology if host.ip in hosts_ip_set)
    msg("Done loading cluster topology")

    debug("Run on hosts", original_topology)
    debug("in topology", current_topology)

    msg("Generating endpoint mapping")
    if strategy is cstar.strategy.Strategy.TOPOLOGY:
        endpoint_mapping = self.get_endpoint_mapping(current_topology)
        msg("Done generating endpoint mapping")
    else:
        endpoint_mapping = None
        msg("Skipping endpoint mapping because of selected strategy")

    self.state = cstar.state.State(original_topology, strategy, endpoint_mapping,
                                   cluster_parallel, dc_parallel, max_concurrency,
                                   current_topology=current_topology, stop_after=stop_after,
                                   ignore_down_nodes=ignore_down_nodes)
    msg("Setup done")
def read_file(self, remotepath):
    """Fetch *remotepath* over SCP and return its content decoded as UTF-8.

    As before, all but the last byte of the file are returned, and an empty
    remote file yields ``""``. The SCP channel is closed in every case.
    """
    self._connect()
    debug("Retrieving %s through SCP" % (remotepath))
    channel, info = self.session.scp_recv(remotepath)
    try:
        # NOTE(review): the final byte is left unread exactly as in the
        # previous implementation - presumably a trailing newline; confirm.
        remaining = info.st_size - 1
        chunks = []
        # A single read may return fewer bytes than requested, so keep
        # reading until the expected amount arrived or the channel reports
        # end of data or an error (size <= 0).
        while remaining > 0:
            size, data = channel.read(remaining)
            if size <= 0:
                break
            chunks.append(data)
            remaining -= size
    finally:
        channel.close()
    return b"".join(chunks).decode("utf-8")
def get_host_variables(self, host):
    """Return the variables registered for *host* in ``self.hosts_variables``.

    *host* may be a host name, or an object whose type is named ``Host``, in
    which case its ``fqdn`` is used as the lookup key. An empty dict is
    returned when nothing is registered under that name.
    """
    hostname = host.fqdn if type(host).__name__ == "Host" else host
    host_variables = self.hosts_variables.get(hostname, dict())
    debug("Variables for host {} = {}".format(hostname, host_variables))
    return host_variables
def put_file(self, localpath, remotepath):
    """Copy the local file *localpath* to *remotepath* on the host over SCP.

    The remote file is created with the local file's size and timestamps,
    and with its permission bits limited to ``0o755``.
    """
    self._connect()
    fileinfo = os.stat(localpath)
    # The mask must be octal: decimal 755 is 0o1363, which drops the
    # owner-read bit and keeps unrelated bits.
    chan = self.session.scp_send64(remotepath, fileinfo.st_mode & 0o755,
                                   fileinfo.st_size, fileinfo.st_mtime,
                                   fileinfo.st_atime)
    debug("Starting SCP of local file %s to remote %s:%s" %
          (localpath, self.hostname, remotepath))
    with open(localpath, 'rb') as local_fh:
        for data in local_fh:
            chan.write(data)
def maybe_get_data_from_cache(self, cache_type):
    """Return the unpickled cache content for *cache_type*, or ``None``.

    ``None`` is returned when the cache file does not exist or when reading
    it fails. A failure is reported with ``warn`` and never propagated.
    """
    try:
        cache_file = self.get_cache_file_path(cache_type)
        if os.path.exists(cache_file):
            debug("Getting {} from cache".format(cache_type))
            # NOTE: pickle can execute arbitrary code while loading; the
            # cache directory must only be writable by trusted users.
            with open(cache_file, 'rb') as cache_fh:
                return pickle.load(cache_fh)
    except Exception as exc:
        # Report the exception itself; the previous code formatted the
        # traceback object, which only printed its memory address.
        warn("Failed getting data from cache : {!r}".format(exc))
    debug("Cache miss for {}".format(cache_type))
    return None
def __init__(self, hostname, ssh_username, ssh_password, ssh_identity_file, ssh_lib):
    """Create the remote backend selected by *ssh_lib*.

    ``'paramiko'`` selects ``RemoteParamiko`` and ``'ssh2'`` selects
    ``RemoteSsh2``; the instance is stored on ``self.remote``.

    Raises:
        BadArgument: for any other *ssh_lib* value (``self.remote`` is then
            left as ``None``).
    """
    debug("Using ssh lib : ", ssh_lib)
    self.remote = None

    if ssh_lib != 'paramiko' and ssh_lib != 'ssh2':
        raise BadArgument(
            "ssh-lib should be either 'paramiko' or 'ssh2' but we got '%s' instead."
            % (ssh_lib, ))

    # Only the selected backend name is looked up.
    backend = RemoteParamiko if ssh_lib == 'paramiko' else RemoteSsh2
    self.remote = backend(hostname, ssh_username, ssh_password,
                          ssh_identity_file)
def run(self, argv):
    """Run a command on the remote host and return its outcome.

    Each element of *argv* is passed through ``self.escape`` and the results
    are joined with spaces to form the command line.

    Returns:
        ExecutionResult built from the command line, exit status, stdout and
        stderr, as read from ``self.read_channel()``.

    Raises:
        BadSSHHost: if anything fails while executing the command or reading
            its output. The original error is attached as ``__cause__``.
    """
    try:
        cmd = " ".join(self.escape(s) for s in argv)
        self.exec_command(cmd)
        out, error, status = self.read_channel()
        if status != 0:
            err("Command %s failed with status %d on host %s" %
                (cmd, status, self.hostname))
        else:
            debug("Command %s succeeded on host %s, output was %s and %s" %
                  (cmd, self.hostname, out, error))
        return ExecutionResult(cmd, status, out, error)
    except Exception as exc:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not disguised as a broken SSH connection.
        self.client = None
        raise BadSSHHost("SSH connection to host %s was reset" %
                         (self.hostname, )) from exc
def wait_for_node_to_return(self, node):
    """Poll until the state reports healthy after refreshing the topology via *node*.

    Every round refreshes the current topology through *node* and checks
    ``self.state.is_healthy()``. While the state is not healthy, or the SSH
    connection fails, progress is printed and the loop sleeps 5 seconds.
    """
    healthy = False
    while not healthy:
        try:
            self.update_current_topology((node, ))
            healthy = bool(self.state.is_healthy())
        except BadSSHHost as e:
            # If the instance used to poll cluster health is down it probably means that machine is rebooting
            # State is then NOT healthy, so continue waiting...
            debug("SSH to %s failed, instance down?" % (node, ), e)

        if not healthy:
            down_hosts = self.state.current_topology.get_down()
            cstar.jobprinter.print_progress(
                self.state.original_topology,
                self.state.progress,
                down_hosts)
            time.sleep(5)
def run(self, argv):
    """Run a command on the remote host through paramiko.

    Each element of *argv* is passed through ``self.escape`` and the results
    are joined with spaces to form the command line.

    Returns:
        ExecutionResult built from the command line, exit status and the
        UTF-8 decoded stdout and stderr.

    Raises:
        BadSSHHost: if the connection is reset or paramiko reports an SSH
            error. The original error is attached as ``__cause__``.
    """
    try:
        self._connect()
        cmd = " ".join(self.escape(s) for s in argv)
        _stdin, stdout, stderr = self.client.exec_command(cmd)
        # Drain the output before asking for the exit status: paramiko
        # documents that recv_exit_status() may hang when the remote output
        # exceeds the channel window and has not been read yet.
        out = str(stdout.read(), 'utf-8')
        error = str(stderr.read(), 'utf-8')
        status = stdout.channel.recv_exit_status()
        if status != 0:
            err("Command %s failed with status %d on host %s" % (cmd, status, self.hostname))
        else:
            debug("Command %s succeeded on host %s, output was %s and %s" %
                  (cmd, self.hostname, out, error))
        return ExecutionResult(cmd, status, out, error)
    except (ConnectionResetError, paramiko.ssh_exception.SSHException) as exc:
        self.client = None
        raise BadSSHHost("SSH connection to host %s was reset" % (self.hostname,)) from exc
def schedule_job(self, host):
    """Start a job runner for *host* on its own thread.

    The runner is built by calling ``self.job_runner`` with this job and the
    host, and is used as the target of a thread named after the host's fqdn.
    The call then sleeps for ``self.sleep_on_new_runner`` seconds.
    """
    debug("Running on host", host.fqdn)
    runner = self.job_runner(self, host)
    worker = threading.Thread(target=runner, name="cstar %s" % host.fqdn)
    worker.start()
    time.sleep(self.sleep_on_new_runner)
def resume_on_running_hosts(self):
    """Restart a job runner thread for every host listed as running.

    For each host in ``self.state.progress.running`` a runner is built from
    ``self.job_runner`` and started on a thread named after the host's fqdn.
    The loop sleeps ``self.sleep_on_new_runner`` seconds after each start.
    """
    for host in self.state.progress.running:
        debug("Resume on host", host.fqdn)
        runner = self.job_runner(self, host)
        worker = threading.Thread(target=runner, name="cstar %s" % host.fqdn)
        worker.start()
        time.sleep(self.sleep_on_new_runner)
def __init__(self, hostname, ssh_username, ssh_password, ssh_identity_file, ssh_lib):
    """Create the paramiko remote backend for *hostname*.

    *ssh_lib* is only logged; the backend stored on ``self.remote`` is always
    a ``RemoteParamiko`` built from the host name and SSH credentials.
    """
    debug("Using ssh lib : ", ssh_lib)
    backend = RemoteParamiko(
        hostname,
        ssh_username,
        ssh_password,
        ssh_identity_file,
    )
    self.remote = backend