def _eval_remove_node(self):
    """
    This function uses the sge stats to decide whether or not to remove a
    node from the cluster.

    A node is only removed when the queue is empty, the cluster has been
    stable for a while, and the cluster is above its minimum size. Nodes
    selected for removal are re-checked to still be running before being
    removed.
    """
    # Never shrink while jobs are still waiting in the queue.
    qlen = len(self.stat.get_queued_jobs())
    if qlen != 0:
        return
    # Avoid thrashing: wait until no recent add/remove activity.
    if not self.has_cluster_stabilized():
        return
    num_nodes = len(self._cluster.nodes)
    if num_nodes <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)"
                 % self.min_nodes)
        return
    # Never remove more nodes than would drop us below min_nodes.
    max_remove = num_nodes - self.min_nodes
    log.info("Looking for nodes to remove...")
    remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
    if not remove_nodes:
        log.info("No nodes can be removed at this time")
    # If remove_nodes is empty the loop below is a no-op.
    for node in remove_nodes:
        # node.update() refreshes and returns the instance state; a node
        # that is no longer "running" is already gone - skip it.
        if node.update() != "running":
            log.error("Node %s is already dead - not removing" % node.alias)
            continue
        log.warn("Removing %s: %s (%s)" % (node.alias, node.id,
                                           node.dns_name))
        try:
            self._cluster.remove_node(node)
            # Record the modification time so has_cluster_stabilized()
            # holds off further changes for a while.
            self.__last_cluster_mod_time = utils.get_utc_now()
        except Exception:
            # Best-effort: log and move on to the next candidate node.
            log.error("Failed to remove node %s" % node.alias, exc_info=True)
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of nodes
    to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)"
                 % self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    # A cluster with zero slots is allowed to grow even if it has not
    # stabilized yet (otherwise nothing could ever run).
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        # NOTE(review): timedelta.seconds wraps at one day; a job queued
        # for >24h would look "young" again. total_seconds() would be the
        # robust form -- confirm before changing behavior.
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                # NOTE(review): Python 2 floor division -- e.g. 5 queued
                # slots / 4 slots-per-host requests only 1 node; verify
                # whether rounding up was intended.
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    # Cap the request by the per-iteration limit and by how many nodes
    # max_nodes still allows on top of the currently running ones.
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add)
            # Record the modification time so has_cluster_stabilized()
            # holds off further changes for a while.
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        except Exception:
            log.error("Failed to add new host", exc_info=True)
def setup_tmuxcc(self, client=None, nodes=None, user='******',
                 layout='tiled'):
    """
    Create a TMUX "control center" session on *client* for *user*.

    Builds one window per chunk of up to 8 nodes (each window split into a
    pane per node with an ssh session to it) plus one dedicated window per
    node. Defaults: client = the master node, nodes = all cluster nodes.
    """
    log.info("Creating TMUX Control Center for user '%s'" % user)
    client = client or self._master
    nodes = nodes or self._nodes
    envname = self._envname
    # Temporarily reconnect as the target user if needed; the original
    # connection is restored at the end.
    orig_user = client.ssh._username
    if orig_user != user:
        client.ssh.connect(username=user)
    # Group nodes into chunks of 8 -- one "all" window per chunk.
    chunks = [chunk for chunk in utils.chunk_list(nodes, items=8)]
    num_windows = len(chunks) + len(nodes)
    if len(nodes) == 0:
        log.error("Cluster has no nodes, exiting...")
        return
    self.create_session(client, envname, num_windows=num_windows)
    # Single-node cluster where client is that node: nothing to wire up.
    if len(nodes) == 1 and client == nodes[0]:
        return
    if not self._supports_layout(client, envname, layout, window=0):
        log.warn("failed to select layout '%s', defaulting to "
                 "'main-vertical'" % layout)
        layout = "main-vertical"
        status = self._select_layout(client, envname, layout, window=0)
        if status != 0:
            raise exception.PluginError("failed to set a layout")
    # One window per chunk, one pane per node in the chunk.
    for i, chunk in enumerate(chunks):
        self._rename_window(client, envname, i, 'all%s' % i)
        for j, node in enumerate(chunk):
            if j != 0:
                # First pane exists already; split for each extra node.
                self._split_window(client, envname, i)
            self._select_layout(client, envname, window=i, layout=layout)
            if node.alias != client.alias:
                self._send_keys(client, envname,
                                cmd='ssh %s' % node.alias,
                                window="%d.%d" % (i, j))
    # One dedicated window per node, after the chunk windows.
    for i, node in enumerate(nodes):
        window = i + len(chunks)
        self._rename_window(client, envname, window, node.alias)
        if node.alias != client.alias:
            self._send_keys(client, envname, cmd='ssh %s' % node.alias,
                            window=window)
    # Leave the session focused on the first window/pane.
    self._select_window(client, envname, window=0)
    self._select_pane(client, envname, window=0, pane=0)
    if orig_user != user:
        client.ssh.connect(username=orig_user)
def get_settings_from_env(self, settings):
    """
    Return AWS credentials defined in the user's shell environment.

    For every name in *settings* the upper-cased environment variable is
    checked first, then the name as-is; the first match wins. Returns a
    dict mapping the original setting name to the environment value.
    """
    found = {}
    for name in settings:
        # Prefer the conventional UPPER_CASE variable, fall back to the
        # exact name.
        for env_key in (name.upper(), name):
            if env_key in os.environ:
                log.warn("Setting '%s' from environment..." % env_key)
                found[name] = os.environ.get(env_key)
                break
    return found
def execute(self, args):
    """
    Stop each cluster named in *args* after validating that it can be
    stopped and (unless --confirm was passed) prompting the user.

    With no args, errors out listing the available cluster tags.
    """
    if not args:
        # Build a helpful error message listing known cluster tags.
        cls = [
            c.cluster_tag for c in
            self.cm.get_clusters(load_plugins=False, load_receipt=False)
        ]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)
    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            raise
        except Exception, e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                # --force: retry without the receipt or key validation.
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name, load_receipt=False,
                                         require_keys=False)
            else:
                # IncompatibleCluster errors carry their own guidance;
                # for anything else suggest -f before re-raising.
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise
        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            # Mixed clusters need --terminate-unstoppable; clusters with
            # no stoppable nodes at all can only be terminated.
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." % cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)
        if not self.opts.confirm:
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def run(self, nodes, master, user, shell, volumes):
    """
    Install AWS credentials as ~/.boto for *user* on the master node.

    If self.boto_cfg is set, that local file is copied up; otherwise a
    config is rendered from BOTO_CFG_TEMPLATE using the master's ec2
    connection attributes. Existing ~/.boto files are left untouched.
    """
    mssh = master.ssh
    # All remote operations below run as the target user.
    mssh.switch_user(user)
    botocfg = '/home/%s/.boto' % user
    if not mssh.path_exists(botocfg):
        log.info("Installing AWS credentials for user: %s" % user)
        if self.boto_cfg:
            # User supplied a boto config file - upload it verbatim.
            log.info("Copying %s to %s" % (self.boto_cfg, botocfg))
            mssh.put(self.boto_cfg, botocfg)
        else:
            # Render credentials from the master's ec2 connection state.
            log.info("Installing current credentials to: %s" % botocfg)
            f = mssh.remote_file(botocfg, 'w')
            f.write(BOTO_CFG_TEMPLATE % master.ec2.__dict__)
            f.close()
        # Credentials file: owner read-only.
        mssh.chmod(0400, botocfg)
    else:
        log.warn("AWS credentials already present - skipping install")
def terminate(self, cluster_name, force=False):
    """
    Terminate the cluster named *cluster_name*.

    With force=True the cluster's receipt and keys are not loaded or
    validated and the nodes are torn down manually. ClusterDoesNotExist
    propagates untouched; any other failure is logged and re-raised.
    """
    if force:
        log.warn("Ignoring cluster settings due to --force option")
    try:
        validate = not force
        cluster = self.cm.get_cluster(cluster_name,
                                      load_receipt=validate,
                                      require_keys=validate)
        if force:
            terminator = self._terminate_manually
        else:
            terminator = self._terminate_cluster
        terminator(cluster)
    except exception.ClusterDoesNotExist:
        raise
    except Exception:
        log.error("Failed to terminate cluster!", exc_info=True)
        if not force:
            log.error("Use -f to forcefully terminate the cluster")
        raise
def _start_notebook(self, master, user, profile_dir):
    """
    Configure and launch an IPython notebook server for *user* on master.

    Creates a self-signed SSL certificate in *profile_dir* if one does not
    exist, writes ipython_notebook_config.py (SSL, hashed password, port
    8888), starts the notebook asynchronously, and opens the port in the
    cluster security group.
    """
    log.info("Setting up IPython web notebook for user: %s" % user)
    # NOTE(review): user_cert and ssl_cert are built from the same path;
    # one of the two variables appears redundant -- confirm intent.
    user_cert = posixpath.join(profile_dir, '%s.pem' % user)
    ssl_cert = posixpath.join(profile_dir, '%s.pem' % user)
    if not master.ssh.isfile(user_cert):
        log.info("Creating SSL certificate for user %s" % user)
        ssl_subj = "/C=US/ST=SC/L=STAR/O=Dis/CN=%s" % master.dns_name
        # Key and certificate are written to the same .pem file.
        master.ssh.execute("openssl req -new -newkey rsa:4096 -days 365 "
                           '-nodes -x509 -subj %s -keyout %s -out %s' %
                           (ssl_subj, ssl_cert, ssl_cert))
    else:
        log.info("Using existing SSL certificate...")
    f = master.ssh.remote_file('%s/ipython_notebook_config.py'
                               % profile_dir)
    notebook_port = 8888
    # Hash the plaintext password remotely with IPython's passwd().
    sha1py = 'from IPython.lib import passwd; print passwd("%s")'
    sha1cmd = "python -c '%s'" % sha1py
    sha1pass = master.ssh.execute(sha1cmd % self.notebook_passwd)[0]
    f.write('\n'.join([
        "c = get_config()",
        "c.IPKernelApp.pylab = 'inline'",
        "c.NotebookApp.certfile = u'%s'" % ssl_cert,
        "c.NotebookApp.ip = '*'",
        "c.NotebookApp.open_browser = False",
        "c.NotebookApp.password = u'%s'" % sha1pass,
        "c.NotebookApp.port = %d" % notebook_port,
    ]))
    f.close()
    if self.notebook_directory is not None:
        if not master.ssh.path_exists(self.notebook_directory):
            master.ssh.makedirs(self.notebook_directory)
        master.ssh.execute_async(
            "ipython notebook --no-browser --notebook-dir='%s'" %
            self.notebook_directory)
    else:
        master.ssh.execute_async("ipython notebook --no-browser")
    # Open the notebook port in the cluster's security group.
    self._authorize_port(master, notebook_port, 'notebook')
    log.info("IPython notebook URL: https://%s:%s"
             % (master.dns_name, notebook_port))
    log.info("The notebook password is: %s" % self.notebook_passwd)
    log.warn(
        "Please check your local firewall settings if you're having "
        "issues connecting to the IPython notebook",
        extra=dict(__textwrap__=True))
def _add_to_known_hosts(self, node):
    """
    Append *node*'s SSH host key to the local ~/.ssh/known_hosts file.

    The entry is written as "dns_name,ip_address keytype base64key". The
    update is skipped if the known_hosts file does not exist or if the
    node's DNS name already appears in it.
    """
    log.info("Configuring local known_hosts file")
    user_home = os.path.expanduser('~')
    khosts = os.path.join(user_home, '.ssh', 'known_hosts')
    if not os.path.isfile(khosts):
        log.warn("Unable to configure known_hosts: file does not exist")
        return
    # Read the whole file so we can check for an existing entry and
    # whether it ends with a newline. Close the handle explicitly
    # (the original leaked the read handle).
    khostsf = open(khosts)
    try:
        contents = khostsf.read()
    finally:
        khostsf.close()
    if node.dns_name in contents:
        return
    server_pkey = node.ssh.get_server_public_key()
    khostsf = open(khosts, 'a')
    try:
        # BUG FIX: guard against an empty known_hosts file -- indexing
        # contents[-1] on an empty string raised IndexError.
        if contents and contents[-1] != '\n':
            khostsf.write('\n')
        name_entry = '%s,%s' % (node.dns_name, node.ip_address)
        khostsf.write(' '.join([
            name_entry, server_pkey.get_name(),
            base64.b64encode(str(server_pkey)), '\n'
        ]))
    finally:
        khostsf.close()
def get_stats(self):
    """
    This method will ssh to the SGE master and get load & queue stats.
    It will feed these stats to SGEStats, which parses the XML.
    It will return two arrays: one of hosts, each host has a hash with its
    host information inside. The job array contains a hash for every job,
    containing statistics about the job name, priority, etc.

    Up to five attempts are made, sleeping self.polling_interval seconds
    between failures, before raising BaseException.
    """
    log.debug("starting get_stats")
    max_attempts = 5
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        try:
            return self._get_stats()
        except Exception:
            log.warn("Failed to retrieve stats (%d/%d):" %
                     (attempt, max_attempts), exc_info=True)
            log.warn("Retrying in %ds" % self.polling_interval)
            time.sleep(self.polling_interval)
    raise exception.BaseException(
        "Failed to retrieve SGE stats after trying %d times, exiting..."
        % max_attempts)
def _create_image_from_ebs(self, size=15):
    """
    Create a new EBS-backed AMI from self.host and return its image id.

    Waits for the root device's snapshot to complete (when the root
    device can be identified) and then for the image itself to leave the
    "pending" state.

    NOTE(review): the *size* parameter is unused in this body -- confirm
    whether it is still needed by callers.
    """
    log.info("Creating new EBS AMI...")
    imgid = self.ec2.create_image(self.host.id, self.name,
                                  self.description)
    img = self.ec2.get_image(imgid)
    log.info("New EBS AMI created: %s" % imgid)
    root_dev = self.host.root_device_name
    if root_dev in self.host.block_device_mapping:
        log.info("Fetching block device mapping for %s" % imgid,
                 extra=dict(__nonewline__=True))
        s = Spinner()
        try:
            s.start()
            # Poll until the new image reports a mapping for the root
            # device (it appears once the snapshot has been registered).
            while root_dev not in img.block_device_mapping:
                img = self.ec2.get_image(imgid)
                time.sleep(5)
        finally:
            s.stop()
        snapshot_id = img.block_device_mapping[root_dev].snapshot_id
        snap = self.ec2.get_snapshot(snapshot_id)
        self.ec2.wait_for_snapshot(snap)
    else:
        log.warn("Unable to find root device - cant wait for snapshot")
    log.info("Waiting for %s to become available..." % imgid,
             extra=dict(__nonewline__=True))
    s = Spinner()
    try:
        s.start()
        # Poll the image state; a "failed" state aborts with AWSError.
        while img.state == "pending":
            time.sleep(15)
            if img.update() == "failed":
                raise exception.AWSError(
                    "EBS image creation failed for %s" % imgid)
    finally:
        s.stop()
    return imgid
def warn_experimental(self, msg, num_secs=10):
    """
    Warn user that an experimental feature is being used
    Counts down from num_secs before continuing
    """
    banner = '*' * 60
    log.warn('\n'.join([banner, msg, banner]),
             extra=dict(__textwrap__=True))
    print
    log.warn("Waiting %d seconds before continuing..." % num_secs)
    log.warn("Press CTRL-C to cancel...")
    # Count down num_secs, num_secs-1, ..., 1 with one-second pauses.
    for remaining in range(num_secs, 0, -1):
        sys.stdout.write('%d...' % remaining)
        sys.stdout.flush()
        time.sleep(1)
    print
def warn_debug_file_moved():
    """
    Warn the user when a debug log still exists at the old default
    location, pointing them at the new location (static.DEBUG_FILE).
    """
    legacy_log = os.path.join(
        static.TMP_DIR,
        'tethyscluster-debug-%s.log' % static.CURRENT_USER)
    if not os.path.exists(legacy_log):
        return
    border = '*' * 50
    for message in (border,
                    "The default log file location is now:",
                    "",
                    static.DEBUG_FILE,
                    "",
                    "Please delete or move the old log file located at:",
                    "",
                    legacy_log,
                    border):
        log.warn(message)
def run(self, nodes, master, user, user_shell, volumes):
    """
    Install and configure a MySQL Cluster across all nodes.

    Installs mysql-cluster-server everywhere, partitions the nodes into
    management (master), data, and query roles, writes ndb_mgmd.cnf and
    my.cnf, restarts the cluster daemons in dependency order, imports an
    optional SQL dump, and installs a periodic mysqldump cronjob.
    """
    log.info("Installing mysql-cluster-server on all nodes...")
    for node in nodes:
        # NOTE(review): (node) is not a tuple -- simple_job receives the
        # node itself as its args value; this pattern repeats below, so
        # it presumably matches simple_job's expected signature. Confirm.
        self.pool.simple_job(self._install_mysql_cluster, (node),
                             jobid=node.alias)
    self.pool.wait(len(nodes))
    mconn = master.ssh
    mconn.execute('rm -f /usr/mysql-cluster/*')
    # Get IPs for all nodes
    self.mgm_ip = master.private_ip_address
    if not self._dedicated_query:
        # Shared mode: non-master nodes store data; every node (including
        # master) answers queries.
        self.storage_ips = [x.private_ip_address for x in nodes[1:]]
        self.query_ips = self.storage_ips
        self.data_nodes = nodes[1:]
        self.query_nodes = nodes
    else:
        # Dedicated mode: first _num_data_nodes non-master nodes store
        # data; the remainder plus the master answer queries.
        self.data_nodes = nodes[1:self._num_data_nodes + 1]
        self.query_nodes = nodes[self._num_data_nodes + 1:]
        self.query_nodes.append(master)
        self.storage_ips = [x.private_ip_address for x in self.data_nodes]
        self.query_ips = [x.private_ip_address for x in self.query_nodes]
    # Create backup dir and change ownership of mysql-cluster dir
    log.info('Backing up and stopping all mysql processes on all nodes')
    for node in nodes:
        self.pool.simple_job(self._backup_and_reset, (node),
                             jobid=node.alias)
    self.pool.wait(len(nodes))
    # Generate and place ndb_mgmd configuration file
    log.info('Generating ndb_mgmd.cnf...')
    ndb_mgmd = mconn.remote_file('/etc/mysql/ndb_mgmd.cnf')
    ndb_mgmd.write(self.generate_ndb_mgmd())
    ndb_mgmd.close()
    # Generate and place my.cnf configuration file on each data node
    log.info('Generating my.cnf on all nodes')
    for node in nodes:
        self.pool.simple_job(self._write_my_cnf, (node), jobid=node.alias)
    self.pool.wait(len(nodes))
    # Restart mysql-ndb-mgm on master
    log.info('Restarting mysql-ndb-mgm on master node...')
    mconn.execute('/etc/init.d/mysql-ndb-mgm restart')
    # Start mysqld-ndb on data nodes
    log.info('Restarting mysql-ndb on all data nodes...')
    for node in self.data_nodes:
        self.pool.simple_job(node.ssh.execute,
                             ('/etc/init.d/mysql-ndb restart'),
                             jobid=node.alias)
    self.pool.wait(len(self.data_nodes))
    # Start mysql on query nodes
    log.info('Starting mysql on all query nodes')
    for node in self.query_nodes:
        self.pool.simple_job(node.ssh.execute,
                             ('/etc/init.d/mysql restart'),
                             dict(ignore_exit_status=True),
                             jobid=node.alias)
    self.pool.wait(len(self.query_nodes))
    # Import sql dump
    dump_file = self._dump_file
    dump_dir = '/mnt/mysql-cluster-backup'
    if posixpath.isabs(self._dump_file):
        dump_dir, dump_file = posixpath.split(self._dump_file)
    else:
        log.warn("%s is not an absolute path, defaulting to %s"
                 % (self._dump_file, posixpath.join(dump_dir, dump_file)))
    # The '.sc' variant of the dump is preferred when present; it is also
    # the file the cronjob below will keep updated.
    name, ext = posixpath.splitext(dump_file)
    sc_path = posixpath.join(dump_dir, name + '.sc' + ext)
    orig_path = posixpath.join(dump_dir, dump_file)
    if not mconn.isdir(dump_dir):
        log.info("Directory %s does not exist, creating..." % dump_dir)
        mconn.makedirs(dump_dir)
    if mconn.isfile(sc_path):
        mconn.execute('mysql < %s' % sc_path)
    elif mconn.isfile(orig_path):
        mconn.execute('mysql < %s' % orig_path)
    else:
        log.info('No dump file found, not importing.')
    log.info('Adding MySQL dump cronjob to master node')
    cronjob = self.generate_mysqldump_crontab(sc_path)
    # Remove any previous tethyscluster-managed entry before appending.
    mconn.remove_lines_from_file('/etc/crontab', '#tethyscluster-mysql')
    crontab_file = mconn.remote_file('/etc/crontab', 'a')
    crontab_file.write(cronjob)
    crontab_file.close()
    log.info('Management Node: %s' % master.alias)
    log.info('Data Nodes: \n%s' % '\n'.join([x.alias
                                             for x in self.data_nodes]))
    log.info('Query Nodes: \n%s' % '\n'.join([x.alias
                                              for x in self.query_nodes]))
def _setup_ebs_volumes(self):
    """
    Mount EBS volumes, if specified in ~/.tethyscluster/config to /home

    For each configured volume: resolve the attached device (accounting
    for the /dev/sd* -> /dev/xvd* rename on unpatched kernels), pick the
    partition to mount, skip volumes that are already mounted, and mount
    the rest on the master node.
    """
    # setup /etc/fstab on master to use block device if specified
    master = self._master
    devices = master.get_device_map()
    for vol in self._volumes:
        # self._volumes maps names to per-volume config dicts.
        vol = self._volumes[vol]
        vol_id = vol.get("volume_id")
        mount_path = vol.get('mount_path')
        device = vol.get("device")
        volume_partition = vol.get('partition')
        if not (vol_id and device and mount_path):
            log.error("missing required settings for vol %s" % vol)
            continue
        if device not in devices and device.startswith('/dev/sd'):
            # check for "correct" device in unpatched kernels
            device = device.replace('/dev/sd', '/dev/xvd')
        if device not in devices:
            log.warn("Cannot find device %s for volume %s" %
                     (device, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This usually means there was a problem "
                     "attaching the EBS volume to the master node")
            continue
        partitions = master.get_partition_map(device=device)
        if not volume_partition:
            # No partition configured: mount the raw device if it has no
            # partitions, or the single partition if there is exactly one.
            if len(partitions) == 0:
                volume_partition = device
            elif len(partitions) == 1:
                volume_partition = partitions.popitem()[0]
            else:
                log.error(
                    "volume has more than one partition, please specify "
                    "which partition to use (e.g. partition=0, "
                    "partition=1, etc.) in the volume's config")
                continue
        elif volume_partition not in partitions:
            log.warn("Cannot find partition %s on volume %s" %
                     (volume_partition, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This either means that the volume has not "
                     "been partitioned or that the partition "
                     "specified does not exist on the volume")
            continue
        log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
        mount_map = master.get_mount_map()
        if volume_partition in mount_map:
            # Already mounted: complain if it is on the wrong path,
            # otherwise skip quietly.
            path, fstype, options = mount_map.get(volume_partition)
            if path != mount_path:
                log.error("Volume %s is mounted on %s, not on %s" %
                          (vol_id, path, mount_path))
            else:
                log.info("Volume %s already mounted on %s...skipping" %
                         (vol_id, mount_path))
            continue
        master.mount_device(volume_partition, mount_path)