Ejemplo n.º 1
0
Archivo: job.py Proyecto: kant/cstar
    def get_endpoint_mapping(self, topology):
        """Fetch and merge the endpoint mappings of the cluster's keyspaces.

        The hosts returned by ``topology.get_up()`` are tried one after the
        other. On each host ``nodetool describering`` is run for every keyspace
        (only ``self.key_space`` when it is set), skipping the ``system``
        keyspace. The first host on which no command fails provides the result.

        Returns:
            The result of ``cstar.endpoint_mapping.merge`` applied to the
            per-keyspace mappings gathered from the first working host.

        Raises:
            HostIsDown: if every tried host failed, or ``MAX_ATTEMPTS`` hosts
                have been tried without success.
        """
        tried_hosts = []
        for host in topology.get_up():
            tried_hosts.append(host)
            conn = self._connection(host)

            if self.key_space:
                keyspaces = [self.key_space]
            else:
                keyspaces = self.get_keyspaces(conn)

            mappings = []
            # Initialised before the loop so the flag is defined even when no
            # keyspace is processed (empty list, or only the "system" keyspace).
            has_error = False
            for keyspace in keyspaces:
                if keyspace == "system":
                    continue
                debug("Fetching endpoint mapping for keyspace", keyspace)
                res = conn.run(("nodetool", "describering", keyspace))
                if res.status != 0:
                    # Give up on this host; the next one gets a fresh attempt.
                    has_error = True
                    break
                describering = cstar.nodetoolparser.parse_nodetool_describering(res.out)
                range_mapping = cstar.nodetoolparser.convert_describering_to_range_mapping(describering)
                mappings.append(cstar.endpoint_mapping.parse(range_mapping, topology, lookup=ip_lookup))

            if not has_error:
                return cstar.endpoint_mapping.merge(mappings)

            # Every host that reaches this point is one failed attempt.
            if len(tried_hosts) >= MAX_ATTEMPTS:
                break
        raise HostIsDown(
            "Could not find any working host while fetching endpoint mapping. Tried the following hosts: "
            + ", ".join(host.fqdn for host in tried_hosts))
Ejemplo n.º 2
0
 def schedule_job(self, host):
     """Start a job runner for ``host`` in a new thread, then pause.

     The object built by ``self.job_runner`` (given this job, the host and the
     SSH credentials) is used as the thread target. After starting the thread
     the caller sleeps for ``self.sleep_on_new_runner`` seconds.
     """
     debug("Running on host", host.fqdn)
     runner = self.job_runner(self, host, self.ssh_username,
                              self.ssh_password, self.ssh_identity_file)
     worker = threading.Thread(target=runner, name="cstar %s" % host.fqdn)
     worker.start()
     time.sleep(self.sleep_on_new_runner)
Ejemplo n.º 3
0
    def _connect(self):
        """Make sure ``self.client`` holds a usable paramiko SSH client.

        An existing client is probed by executing ``PING_COMMAND``; when that
        raises a connection error the client is discarded. If there is no
        client afterwards, a new one is created and connected to
        ``self.hostname`` with the configured user name, password and
        (optional) private key file.

        Raises:
            BadSSHHost: if the SSH connection cannot be established.
        """
        if self.client:
            # Ensure underlying client is still a valid open connection
            try:
                self.client.exec_command(PING_COMMAND)
            except (ConnectionResetError, paramiko.ssh_exception.SSHException):
                # ConnectionResetError is raised when a connection was established but then broken
                # paramiko.ssh_exception.SSHException is raised if the connection was known to be broken
                self.client = None

        if self.client:
            return

        try:
            self.client = paramiko.client.SSHClient()
            pkey = None
            if self.ssh_identity_file is not None:
                pkey = paramiko.RSAKey.from_private_key_file(
                    self.ssh_identity_file, None)
            # The password is deliberately never logged.
            debug("Username : ", self.ssh_username)
            debug("Id file: ", self.ssh_identity_file)
            self.client.set_missing_host_key_policy(
                paramiko.client.AutoAddPolicy())
            self.client.connect(self.hostname,
                                compress=True,
                                username=self.ssh_username,
                                password=self.ssh_password,
                                pkey=pkey)
        except Exception as exc:
            # Drop the half-initialised client so the next call starts afresh,
            # and keep the original error attached as the cause.
            self.client = None
            raise BadSSHHost(
                "Could not establish an SSH connection to host %s" %
                (self.hostname, )) from exc
Ejemplo n.º 4
0
 def handle_finished_jobs(self, finished_jobs):
     """Record the outcome of every finished job and write the job out.

     Each entry of ``finished_jobs`` is a ``(host, result)`` pair. A non-zero
     ``result.status`` marks the host as failed, stores the error and sets
     ``self.do_loop`` to False; a zero status marks the host as done and,
     when ``self.sleep_after_done`` is set, sleeps that many seconds.
     Afterwards the job is written with ``cstar.jobwriter.write`` and every
     host is added to ``self.handled_finished_jobs``.
     """
     debug("Processing ", len(finished_jobs), " finished jobs")
     for entry in finished_jobs:
         host, result = entry[0], entry[1]

         if result.status == 0:
             self.state = self.state.with_done(host)
             info("Host %s finished successfully" % (host.fqdn, ))
             if result.out:
                 info("stdout:", result.out, sep="\n")
             if result.err:
                 info("stderr:", result.err)
             if self.sleep_after_done:
                 debug("Sleeping %d seconds..." % self.sleep_after_done)
                 time.sleep(self.sleep_after_done)
             continue

         # Failure path: remember the error and report the captured output.
         self.errors.append((host, result))
         self.state = self.state.with_failed(host)
         msg("Failure on host", host.fqdn)
         if result.out:
             msg("stdout:", result.out)
         if result.err:
             msg("stderr:", result.err)
         self.do_loop = False

     cstar.jobwriter.write(self)
     # Signal the jobrunner that it can delete the remote job files and terminate.
     for host, _result in finished_jobs:
         self.handled_finished_jobs.add(host)
Ejemplo n.º 5
0
 def resume_on_running_hosts(self):
     """Start a job runner thread for every host listed as running.

     For each host in ``self.state.progress.running`` a runner is built with
     ``self.job_runner`` and started in its own thread, followed by a sleep
     of ``self.sleep_on_new_runner`` seconds.
     """
     for host in self.state.progress.running:
         debug("Resume on host", host.fqdn)
         runner = self.job_runner(self, host, self.ssh_username,
                                  self.ssh_password, self.ssh_identity_file,
                                  self.ssh_lib, self.hosts_variables)
         worker = threading.Thread(target=runner,
                                   name="cstar %s" % host.fqdn)
         worker.start()
         time.sleep(self.sleep_on_new_runner)
Ejemplo n.º 6
0
    def get_endpoint_mapping(self, topology):
        """Return the merged endpoint mapping, using the cache when possible.

        A cached "endpoint_mapping" entry is returned as is. Otherwise one up
        host per cluster is queried with ``nodetool describering`` for every
        keyspace whose name does not start with "system" (only
        ``self.key_space`` when it is set). A host fails when a command
        returns a non-zero status or when it has no such keyspace; the next up
        host is then tried. The merged result is pickled to the cache file
        before being returned.

        Raises:
            HostIsDown: when ``MAX_ATTEMPTS`` hosts in a row failed.
        """
        cached_mappings = self.maybe_get_data_from_cache("endpoint_mapping")
        if cached_mappings is not None:
            return cached_mappings

        clusters = []
        failed_hosts = []
        mappings = []
        # Number of consecutive failing hosts. It must survive loop iterations,
        # otherwise the MAX_ATTEMPTS limit can never be reached.
        count = 0

        for host in topology.get_up():
            if host.cluster in clusters:
                # We need to fetch keyspaces on one node per cluster, no more.
                continue

            conn = self._connection(host)

            if self.key_space:
                keyspaces = [self.key_space]
            else:
                keyspaces = self.get_keyspaces(conn)

            # Stays True when no non-system keyspace is queried at all.
            has_error = True
            # Collected separately so a host failing half-way does not leave
            # partial mappings in the final result.
            host_mappings = []
            for keyspace in keyspaces:
                if keyspace.startswith("system"):
                    continue
                debug("Fetching endpoint mapping for keyspace", keyspace)
                res = self.run_nodetool(conn, "describering", keyspace)
                has_error = False

                if res.status != 0:
                    has_error = True
                    break
                describering = cstar.nodetoolparser.parse_nodetool_describering(
                    res.out)
                range_mapping = cstar.nodetoolparser.convert_describering_to_range_mapping(
                    describering)
                host_mappings.append(
                    cstar.endpoint_mapping.parse(range_mapping,
                                                 topology,
                                                 lookup=ip_lookup))

            if has_error:
                count += 1
                if count >= MAX_ATTEMPTS:
                    # append(), not +=: the host itself must be stored, not
                    # the elements it is made of.
                    failed_hosts.append(host)
                    break
            else:
                mappings.extend(host_mappings)
                clusters.append(host.cluster)
                count = 0

        if failed_hosts:
            raise HostIsDown("Following hosts couldn't be reached: {}".format(
                ', '.join(host.fqdn for host in failed_hosts)))

        endpoint_mappings = cstar.endpoint_mapping.merge(mappings)
        # Context manager so the cache file is flushed and closed right away.
        with open(self.get_cache_file_path("endpoint_mapping"), 'wb') as cache_file:
            pickle.dump(dict(endpoint_mappings), cache_file)
        return endpoint_mappings
Ejemplo n.º 7
0
def _parse(input, file, output_directory, job, job_id, stop_after, max_days, endpoint_mapper):
    data = json.loads(input)

    if 'version' not in data:
        raise BadFileFormatVersion("Incompatible file format version, wanted %d" %
                                   (cstar.jobwriter.FILE_FORMAT_VERSION,))
    if data['version'] != cstar.jobwriter.FILE_FORMAT_VERSION:
        raise BadFileFormatVersion("Incompatible file format version, wanted %d but %s is of version %d" %
                                   (cstar.jobwriter.FILE_FORMAT_VERSION, file, data['version']))

    creation_time = datetime.datetime.utcfromtimestamp(data["creation_timestamp"])
    age = (datetime.datetime.utcnow() - creation_time).days
    if age > max_days:
        raise FileTooOld(("Job created %d days ago, which is more than the current maximum age of %d. " +
                          "Use --max-job-age %d if you really want to run this job.") % (age, max_days, age + 1))

    state = data['state']
    job.command = data['command']
    job.job_id = job_id
    job.timeout = data['timeout']
    job.env = data['env']
    job.job_runner = getattr(cstar.jobrunner, data["job_runner"])
    job.key_space = data['key_space'] if 'key_space' in data else None
    job.output_directory = output_directory
    job.sleep_on_new_runner = data['sleep_on_new_runner']

    strategy = cstar.strategy.parse(state['strategy'])
    cluster_parallel = state['cluster_parallel']
    dc_parallel = state['dc_parallel']
    max_concurrency = state['max_concurrency']

    progress = cstar.progress.Progress(
        running=[cstar.topology.Host(*arr) for arr in state['progress']['running']],
        done=[cstar.topology.Host(*arr) for arr in state['progress']['done']],
        failed=[cstar.topology.Host(*arr) for arr in state['progress']['failed']])

    original_topology = cstar.topology.Topology(cstar.topology.Host(*arr) for arr in state['original_topology'])
    current_topology = cstar.topology.Topology(cstar.topology.Host(*arr) for arr in state['current_topology'])

    debug("Run on hosts", original_topology)
    debug("in topology", current_topology)

    if strategy is cstar.strategy.Strategy.TOPOLOGY:
        endpoint_mapping = endpoint_mapper(original_topology)
    else:
        endpoint_mapping = None

    job.state = cstar.state.State(
        original_topology=original_topology,
        strategy=strategy,
        endpoint_mapping=endpoint_mapping,
        cluster_parallel=cluster_parallel,
        dc_parallel=dc_parallel,
        max_concurrency=max_concurrency,
        current_topology=current_topology,
        stop_after=stop_after,
        progress=progress,
        ignore_down_nodes=state['ignore_down_nodes'])
Ejemplo n.º 8
0
 def get_cache_file_path(self, cache_type):
     """Return the path of the cache file for ``cache_type``.

     The file name is ``cache_type`` followed by the sorted, dash-joined
     items of ``self.schema_versions`` and ``self.status_topology_hash``;
     it lives in ``self.cache_directory``. The name is also logged.
     """
     schema_part = "-".join(sorted(self.schema_versions))
     topology_part = "-".join(sorted(self.status_topology_hash))
     file_name = "{}-{}-{}".format(cache_type, schema_part, topology_part)
     debug("Cache file: {}".format(file_name))
     return os.path.join(self.cache_directory, file_name)
Ejemplo n.º 9
0
Archivo: job.py Proyecto: kant/cstar
    def setup(self, hosts, seeds, command, job_id, strategy, cluster_parallel, dc_parallel, job_runner,
              max_concurrency, timeout, env, stop_after, key_space, output_directory,
              ignore_down_nodes, dc_filter,
              sleep_on_new_runner, sleep_after_done):
        """Initialise the job: store its settings, load the topology, build the state.

        The settings are copied onto the instance and the output directory
        (``~/.cstar/jobs/<job_id>`` unless ``output_directory`` is given) is
        created. The cluster topology is then loaded either from ``seeds``
        (optionally narrowed to ``dc_filter``) or from ``hosts``, in which
        case only the listed hosts are kept as the topology to run on.
        With the TOPOLOGY strategy an endpoint mapping is fetched as well.
        Finally ``self.state`` is created.
        """
        msg("Starting setup")

        msg("Strategy:", cstar.strategy.serialize(strategy))
        msg("DC parallel:", dc_parallel)
        msg("Cluster parallel:", cluster_parallel)

        self.command = command
        self.job_id = job_id
        self.timeout = timeout
        self.env = env
        self.job_runner = job_runner
        self.key_space = key_space
        self.output_directory = output_directory or os.path.expanduser("~/.cstar/jobs/" + job_id)
        self.sleep_on_new_runner = sleep_on_new_runner
        self.sleep_after_done = sleep_after_done
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.output_directory, exist_ok=True)

        msg("Loading cluster topology")
        if seeds:
            current_topology = cstar.topology.Topology([])
            for seed in seeds:
                current_topology = current_topology | self.get_cluster_topology((seed,))
            original_topology = current_topology
            if dc_filter:
                original_topology = original_topology.with_dc(dc_filter)
        else:
            current_topology = cstar.topology.Topology()
            # Resolve every host name exactly once; the addresses are needed
            # both for the topology lookups and for the final filter.
            host_ips = [socket.gethostbyname(host) for host in hosts]
            hosts_ip_set = set(host_ips)
            for host_ip in host_ips:
                if host_ip in current_topology:
                    # Already covered by a topology fetched through another host.
                    continue
                current_topology = current_topology | self.get_cluster_topology((host_ip,))
            original_topology = cstar.topology.Topology(host for host in current_topology if host.ip in hosts_ip_set)
        msg("Done loading cluster topology")

        debug("Run on hosts", original_topology)
        debug("in topology", current_topology)

        msg("Generating endpoint mapping")
        if strategy is cstar.strategy.Strategy.TOPOLOGY:
            endpoint_mapping = self.get_endpoint_mapping(current_topology)
            msg("Done generating endpoint mapping")
        else:
            endpoint_mapping = None
            msg("Skipping endpoint mapping because of selected strategy")

        self.state = cstar.state.State(original_topology, strategy, endpoint_mapping, cluster_parallel, dc_parallel,
                                       max_concurrency, current_topology=current_topology, stop_after=stop_after,
                                       ignore_down_nodes=ignore_down_nodes)
        msg("Setup done")
Ejemplo n.º 10
0
 def read_file(self, remotepath):
     """Fetch ``remotepath`` through SCP and return its content as text.

     Returns:
         The data read from the SCP channel decoded as UTF-8, or an empty
         string when the remote file is reported as empty.
     """
     self._connect()
     debug("Retrieving %s through SCP" % (remotepath))
     channel, info = self.session.scp_recv(remotepath)
     try:
         if info.st_size == 0:
             return ""
         # NOTE(review): one byte less than the reported size is requested,
         # as in the original code; confirm this is intentional.
         size, content = channel.read(info.st_size - 1)
     finally:
         # Close the channel on every path, including the empty-file return
         # and a failing read, so it is never leaked.
         channel.close()
     return content.decode("utf-8")
Ejemplo n.º 11
0
Archivo: job.py Proyecto: yakirgb/cstar
    def get_host_variables(self, host):
        """Return the variables registered for ``host`` in ``self.hosts_variables``.

        ``host`` is either a key of ``self.hosts_variables`` or an object whose
        type is named ``Host``, in which case its ``fqdn`` is used as the key.
        An empty dict is returned when nothing is registered for the host.
        """
        is_host_object = type(host).__name__ == "Host"
        hostname = host.fqdn if is_host_object else host

        host_variables = self.hosts_variables.get(hostname, dict())
        debug("Variables for host {} = {}".format(hostname, host_variables))
        return host_variables
Ejemplo n.º 12
0
 def put_file(self, localpath, remotepath):
     """Copy the local file ``localpath`` to ``remotepath`` through SCP.

     The remote file is opened with the local file's size and timestamps and
     with its permission bits limited to ``rwxr-xr-x``.
     """
     self._connect()
     fileinfo = os.stat(localpath)
     # The mask must be octal: decimal 755 selects unrelated mode bits.
     chan = self.session.scp_send64(remotepath, fileinfo.st_mode & 0o755,
                                    fileinfo.st_size, fileinfo.st_mtime,
                                    fileinfo.st_atime)
     debug("Starting SCP of local file %s to remote %s:%s" %
           (localpath, self.hostname, remotepath))
     with open(localpath, 'rb') as local_fh:
         # NOTE(review): the return value of write() is ignored; confirm the
         # channel never performs partial writes.
         for data in local_fh:
             chan.write(data)
Ejemplo n.º 13
0
Archivo: job.py Proyecto: yakirgb/cstar
 def maybe_get_data_from_cache(self, cache_type):
     """Return the cached data for ``cache_type``, or None on a cache miss.

     The cache file path comes from ``self.get_cache_file_path``. Any error
     while locating or loading the file is logged as a warning and treated
     as a cache miss.
     """
     try:
         cache_file = self.get_cache_file_path(cache_type)
         if os.path.exists(cache_file):
             debug("Getting {} from cache".format(cache_type))
             # NOTE: pickle can execute arbitrary code while loading; the
             # cache directory must only contain files written by this tool.
             with open(cache_file, 'rb') as cache_fh:
                 return pickle.load(cache_fh)
     except Exception as exc:
         # Report the exception itself rather than the bare traceback object.
         warn("Failed getting data from cache : {!r}".format(exc))
     debug("Cache miss for {}".format(cache_type))
     return None
Ejemplo n.º 14
0
 def __init__(self, hostname, ssh_username, ssh_password, ssh_identity_file,
              ssh_lib):
     """Create the remote backend selected by ``ssh_lib``.

     ``'paramiko'`` selects ``RemoteParamiko`` and ``'ssh2'`` selects
     ``RemoteSsh2``; the instance is stored in ``self.remote``.

     Raises:
         BadArgument: for any other ``ssh_lib`` value.
     """
     debug("Using ssh lib : ", ssh_lib)
     self.remote = None
     if ssh_lib not in ('paramiko', 'ssh2'):
         raise BadArgument(
             "ssh-lib should be either 'paramiko' or 'ssh2' but we got '%s' instead."
             % (ssh_lib, ))
     backend = RemoteParamiko if ssh_lib == 'paramiko' else RemoteSsh2
     self.remote = backend(hostname, ssh_username, ssh_password,
                           ssh_identity_file)
Ejemplo n.º 15
0
 def run(self, argv):
     """Run the command ``argv`` on the remote host and return its result.

     Every item of ``argv`` is escaped with ``self.escape`` and the items are
     joined with spaces to form the command line.

     Returns:
         An ``ExecutionResult`` built from the command line, the exit status
         and the two outputs returned by ``self.read_channel``.

     Raises:
         BadSSHHost: if anything fails while executing or reading; the
             original exception is attached as the cause.
     """
     try:
         cmd = " ".join(self.escape(s) for s in argv)
         self.exec_command(cmd)
         out, error, status = self.read_channel()
         if status != 0:
             err("Command %s failed with status %d on host %s" %
                 (cmd, status, self.hostname))
         else:
             debug("Command %s succeeded on host %s, output was %s and %s" %
                   (cmd, self.hostname, out, error))
         return ExecutionResult(cmd, status, out, error)
     except Exception as exc:
         # Exception rather than a bare except, so KeyboardInterrupt and
         # SystemExit are not reported as a broken SSH connection.
         self.client = None
         raise BadSSHHost("SSH connection to host %s was reset" %
                          (self.hostname, )) from exc
Ejemplo n.º 16
0
    def wait_for_node_to_return(self, node):
        """Block until the job state is healthy again.

        Each round refreshes the current topology through ``node`` and checks
        ``self.state.is_healthy()``. While the state is not healthy, or the
        SSH connection fails, progress is printed and the loop sleeps for
        five seconds before trying again.
        """
        while True:
            try:
                self.update_current_topology((node, ))
                recovered = self.state.is_healthy()
            except BadSSHHost as e:
                # If the instance used to poll cluster health is down it probably means that machine is rebooting
                # State is then NOT healthy, so continue waiting...
                debug("SSH to %s failed, instance down?" % (node, ), e)
                recovered = False

            if recovered:
                return

            cstar.jobprinter.print_progress(
                self.state.original_topology, self.state.progress,
                self.state.current_topology.get_down())
            time.sleep(5)
Ejemplo n.º 17
0
    def run(self, argv):
        """Run the command ``argv`` over SSH and return its result.

        The items of ``argv`` are escaped with ``self.escape`` and joined with
        spaces. The exit status and both output streams are collected; the
        streams are decoded as UTF-8.

        Returns:
            An ``ExecutionResult`` holding the command line, the exit status
            and the decoded stdout and stderr.

        Raises:
            BadSSHHost: if the connection is reset or paramiko reports an SSH
                error; ``self.client`` is cleared in that case.
        """
        try:
            self._connect()
            cmd = " ".join(self.escape(s) for s in argv)

            _, remote_out, remote_err = self.client.exec_command(cmd)
            status = remote_out.channel.recv_exit_status()
            raw_out = remote_out.read()
            raw_err = remote_err.read()

            failed = status != 0
            if failed:
                err("Command %s failed with status %d on host %s" % (cmd, status, self.hostname))

            out_text = raw_out.decode('utf-8')
            err_text = raw_err.decode('utf-8')
            if not failed:
                debug("Command %s succeeded on host %s, output was %s and %s" %
                      (cmd, self.hostname, out_text, err_text))
            return ExecutionResult(cmd, status, out_text, err_text)
        except (ConnectionResetError, paramiko.ssh_exception.SSHException):
            self.client = None
            raise BadSSHHost("SSH connection to host %s was reset" % (self.hostname,))
Ejemplo n.º 18
0
Archivo: job.py Proyecto: kant/cstar
 def schedule_job(self, host):
     """Start a job runner for ``host`` in a new thread, then pause.

     The object built by ``self.job_runner(self, host)`` is the thread
     target; afterwards the caller sleeps ``self.sleep_on_new_runner``
     seconds.
     """
     debug("Running on host", host.fqdn)
     runner = self.job_runner(self, host)
     worker = threading.Thread(target=runner, name="cstar %s" % host.fqdn)
     worker.start()
     time.sleep(self.sleep_on_new_runner)
Ejemplo n.º 19
0
Archivo: job.py Proyecto: kant/cstar
 def resume_on_running_hosts(self):
     """Start a job runner thread for every host listed as running.

     Each host of ``self.state.progress.running`` gets a runner built by
     ``self.job_runner(self, host)`` and started in its own thread, followed
     by a sleep of ``self.sleep_on_new_runner`` seconds.
     """
     for host in self.state.progress.running:
         debug("Resume on host", host.fqdn)
         runner = self.job_runner(self, host)
         worker = threading.Thread(target=runner,
                                   name="cstar %s" % host.fqdn)
         worker.start()
         time.sleep(self.sleep_on_new_runner)
Ejemplo n.º 20
0
 def __init__(self, hostname, ssh_username, ssh_password, ssh_identity_file,
              ssh_lib):
     """Create the paramiko-based remote and store it in ``self.remote``.

     ``ssh_lib`` is only logged here; the backend is always
     ``RemoteParamiko``.
     """
     debug("Using ssh lib : ", ssh_lib)
     connection_args = (hostname, ssh_username, ssh_password,
                        ssh_identity_file)
     self.remote = RemoteParamiko(*connection_args)