def _eval_remove_node(self):
    """
    This function uses the sge stats to decide whether or not to
    remove a node from the cluster.
    """
    qlen = len(self.stat.get_queued_jobs())
    if qlen != 0:
        return
    if not self.has_cluster_stabilized():
        return
    num_nodes = len(self._cluster.nodes)
    if num_nodes <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)"
                 % self.min_nodes)
        return
    max_remove = num_nodes - self.min_nodes
    log.info("Looking for nodes to remove...")
    remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
    if not remove_nodes:
        log.info("No nodes can be removed at this time")
    for node in remove_nodes:
        if node.update() != "running":
            log.error("Node %s is already dead - not removing" % node.alias)
            continue
        log.warn("Removing %s: %s (%s)" %
                 (node.alias, node.id, node.dns_name))
        try:
            self._cluster.remove_node(node)
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
        except Exception:
            log.error("Failed to remove node %s" % node.alias,
                      exc_info=True)
def _create_image_from_ebs(self, size=15):
    log.info("Creating new EBS AMI...")
    imgid = self.ec2.create_image(self.host.id, self.name, self.description)
    img = self.ec2.get_image(imgid)
    log.info("New EBS AMI created: %s" % imgid)
    root_dev = self.host.root_device_name
    if root_dev in self.host.block_device_mapping:
        log.info("Fetching block device mapping for %s" % imgid,
                 extra=dict(__nonewline__=True))
        s = Spinner()
        try:
            s.start()
            while root_dev not in img.block_device_mapping:
                img = self.ec2.get_image(imgid)
                time.sleep(5)
        finally:
            s.stop()
        snapshot_id = img.block_device_mapping[root_dev].snapshot_id
        snap = self.ec2.get_snapshot(snapshot_id)
        self.ec2.wait_for_snapshot(snap)
    else:
        log.warn("Unable to find root device - can't wait for snapshot")
    log.info("Waiting for %s to become available..." % imgid,
             extra=dict(__nonewline__=True))
    s = Spinner()
    try:
        s.start()
        while img.state == "pending":
            time.sleep(15)
            if img.update() == "failed":
                raise exception.AWSError("EBS image creation failed for %s"
                                         % imgid)
    finally:
        s.stop()
    return imgid
def copy_remote_file_to_nodes(self, remote_file, nodes, dest=None):
    """
    Copies a remote file from this Node instance to another Node instance
    without requiring passwordless ssh between the two.

    dest - path to store the data in on the node (defaults to remote_file)
    """
    if not dest:
        dest = remote_file
    rf = self.ssh.remote_file(remote_file, 'r')
    contents = rf.read()
    sts = rf.stat()
    mode = stat.S_IMODE(sts.st_mode)
    uid = sts.st_uid
    gid = sts.st_gid
    rf.close()
    for node in nodes:
        if self.id == node.id and remote_file == dest:
            log.warn("src and destination are the same: %s, skipping" %
                     remote_file)
            continue
        nrf = node.ssh.remote_file(dest, 'w')
        nrf.write(contents)
        nrf.chown(uid, gid)
        nrf.chmod(mode)
        nrf.close()
def execute(self, args):
    if not args:
        self.parser.error("please specify a cluster")
    for cluster_name in args:
        cl = self.cm.get_cluster(cluster_name)
        is_ebs = cl.is_ebs_cluster()
        if not self.opts.confirm:
            action = "Terminate"
            if is_ebs:
                action = "Stop EBS"
                if cl.spot_bid:
                    action = "Terminate Spot EBS"
            resp = raw_input("%s cluster %s (y/n)? " %
                             (action, cluster_name))
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster()
        if is_ebs and cl._nodes:
            log.warn("All EBS-backed nodes in '%s' are now in a 'stopped' "
                     "state" % cluster_name)
            log.warn("You can restart this cluster by passing -x to the "
                     "'start' command")
            log.warn("Use the 'terminate' command to *completely* "
                     "terminate this cluster")
            log.warn("NOTE: Unless EBS-backed nodes are in a 'running' or "
                     "'terminated' state, you are charged for the EBS "
                     "volumes backing the nodes.")
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        except Exception:
            log.error("Failed to add new host", exc_info=True)
def run(self, nodes, master, user, user_shell, volumes):
    self._master = master
    self._new_security_group = master.cluster_groups[0].id
    log.info("Configuring RAID")
    # TODO: do a suitable check for lvm2 instead of assuming it's needed
    needs_lvm2 = True
    if needs_lvm2:
        for node in nodes:
            # Disable the periodic apt tasks so they don't hold the dpkg
            # lock while we work
            try:
                node.ssh.execute("echo 'APT::Periodic::Enable \"0\";' "
                                 ">> /etc/apt/apt.conf.d/10periodic")
            except Exception, e:
                log.warn(e)
            # Ubuntu 16.04 enables unattended upgrades by default:
            # https://github.com/geerlingguy/packer-ubuntu-1604/issues/3#issue-154560190
            try:
                log.info("killing any running apt-get")
                node.ssh.execute("killall apt-get")
                node.ssh.execute("dpkg --configure -a")
                node.ssh.execute("apt-get update")
                node.ssh.execute("apt-get upgrade")
                log.info("clean kill")
            except Exception, e:
                log.info("not a clean kill")
                log.warn(e)
def load(self):
    """
    Populate this config object from the StarCluster config
    """
    log.debug('Loading config')
    try:
        self.globals = self._load_section('global', self.global_settings)
    except exception.ConfigSectionMissing:
        pass
    try:
        self.aws = self._load_section('aws info', self.aws_settings)
    except exception.ConfigSectionMissing:
        log.warn("No [aws info] section found in the config!")
    self.aws.update(self.get_settings_from_env(self.aws_settings))
    self.keys = self._load_sections('key', self.key_settings)
    self.vols = self._load_sections('volume', self.volume_settings)
    self.vols.update(self._load_sections('vol', self.volume_settings))
    self.plugins = self._load_sections('plugin', self.plugin_settings,
                                       filter_settings=False)
    self.permissions = self._load_sections('permission',
                                           self.permission_settings)
    sections = self._get_sections('cluster')
    self.clusters = self._load_cluster_sections(sections)
    return self
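# Hypothetical sketch of the config file that load() above consumes. The
# section names ([global], [aws info], [key ...], [cluster ...]) come from
# the loader calls above; the exact option names shown are illustrative.
SAMPLE_CONFIG = """
[global]
default_template = smallcluster

[aws info]
aws_access_key_id = <your access key id>
aws_secret_access_key = <your secret key>

[key mykey]
key_location = ~/.ssh/mykey.rsa

[cluster smallcluster]
keyname = mykey
cluster_size = 2
"""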
def copy_remote_file_to_nodes(self, remote_file, nodes, dest=None):
    """
    Copies a remote file from this Node instance to another Node instance
    without requiring passwordless ssh between the two.

    dest - path to store the data in on the node (defaults to remote_file)
    """
    if not dest:
        dest = remote_file
    rf = self.ssh.remote_file(remote_file, 'r')
    sts = rf.stat()
    mode = stat.S_IMODE(sts.st_mode)
    uid = sts.st_uid
    gid = sts.st_gid
    rf.close()
    with tempfile.NamedTemporaryFile(
            prefix=os.path.basename(remote_file) + "_") as f:
        self.ssh.get(remote_file, f.name)
        for node in nodes:
            if self.id == node.id and remote_file == dest:
                log.warn("src and destination are the same: %s, skipping" %
                         remote_file)
                continue
            node.ssh.put(f.name, dest)
            nrf = node.ssh.remote_file(dest, 'a')
            nrf.chown(uid, gid)
            nrf.chmod(mode)
            nrf.close()
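# Hypothetical usage sketch for copy_remote_file_to_nodes(): push a file
# that exists only on the master out to the workers, preserving ownership
# and mode. `cluster` and `master` are assumed to come from a loaded
# StarCluster cluster object; the paths are illustrative.
workers = [n for n in cluster.nodes if n.alias != 'master']
master.copy_remote_file_to_nodes('/etc/exports', workers)
# or land it at a different path on the workers:
master.copy_remote_file_to_nodes('/etc/exports', workers,
                                 dest='/tmp/exports.bak')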
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(datetime.datetime.utcnow())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
            log.info("Done adding nodes at %s" %
                     str(datetime.datetime.utcnow()))
        except Exception:
            log.error("Failed to add new host", exc_info=True)
def _eval_remove_node(self):
    """
    This function uses the sge stats to decide whether or not to
    remove a node from the cluster.
    """
    qlen = len(self.stat.get_queued_jobs())
    if qlen != 0:
        return
    if not self.has_cluster_stabilized():
        return
    num_nodes = len(self._cluster.nodes)
    if num_nodes <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)"
                 % self.min_nodes)
        return
    max_remove = num_nodes - self.min_nodes
    log.info("Looking for nodes to remove...")
    remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
    if not remove_nodes:
        log.info("No nodes can be removed at this time")
    for node in remove_nodes:
        if node.update() != "running":
            log.error("Node %s is already dead - not removing" % node.alias)
            continue
        log.warn("Removing %s: %s (%s)" %
                 (node.alias, node.id, node.dns_name))
        try:
            self._cluster.remove_node(node)
            self.__last_cluster_mod_time = utils.get_utc_now()
        except Exception:
            log.error("Failed to remove node %s" % node.alias,
                      exc_info=True)
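# Hypothetical sketch (not from the original source) of the poll loop that
# drives the two _eval_* methods above: refresh the SGE stats, try to grow,
# then try to shrink, once per polling interval. `balancer` is assumed to
# expose the attributes and methods shown in this file.
import time


def run_balancer_loop(balancer):
    while True:
        balancer.stat = balancer.get_stats()  # refresh queue/host stats
        balancer._eval_add_node()             # grow if jobs are backed up
        balancer._eval_remove_node()          # shrink if the queue is empty
        time.sleep(balancer.polling_interval)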
def _validate_zone(self, zone):
    z = self.ec2.get_zone_or_none(zone)
    if not z:
        raise exception.ValidationError('zone %s does not exist' % zone)
    if z.state != 'available':
        log.warn('zone %s is not available at this time' % zone)
    return True
def _get_ipcluster_plugin(self, node):
    ipyversion = self._get_ipy_version(node)
    if ipyversion < '0.11':
        if not ipyversion.startswith('0.10'):
            log.warn("Trying unsupported IPython version %s" % ipyversion)
        return IPCluster10()
    else:
        return IPCluster11(self.enable_notebook, self.notebook_passwd)
def _validate_zone(self):
    availability_zone = self.availability_zone
    if availability_zone:
        zone = self.ec2.get_zone(availability_zone)
        if not zone:
            raise exception.ClusterValidationError(
                "availability_zone = %s does not exist" % availability_zone)
        if zone.state != "available":
            log.warn("The availability_zone = %s "
                     "is not available at this time" % zone)
    return True
def _has_all_required_settings(self):
    has_all_required = True
    for opt in self.__cluster_settings:
        requirements = self.__cluster_settings[opt]
        name = opt
        required = requirements[1]
        if required and self.get(name.lower()) is None:
            log.warn("Missing required setting %s" % name)
            has_all_required = False
    return has_all_required
def __init__(self, host, username=None, password=None, private_key=None,
             private_key_pass=None, port=22, timeout=30):
    self._timeout = timeout
    self._sftp_live = False
    self._sftp = None
    if not username:
        username = os.environ['LOGNAME']
    # Log to a temporary file.
    templog = tempfile.mkstemp('.txt', 'ssh-')[1]
    paramiko.util.log_to_file(templog)
    # Begin the SSH transport.
    self._transport_live = False
    try:
        sock = self._get_socket(host, port)
        self._transport = paramiko.Transport(sock)
        self._transport.banner_timeout = self._timeout
    except socket.error:
        raise exception.SSHConnectionError(host, port)
    self._transport_live = True
    # Authenticate the transport.
    if password:
        # Using password.
        try:
            self._transport.connect(username=username, password=password)
        except paramiko.AuthenticationException:
            raise exception.SSHAuthException(username, host)
    elif private_key:
        # Use private key.
        pkey = None
        log.debug('private key specified')
        if private_key.endswith('rsa') or private_key.count('rsa'):
            pkey = self._load_rsa_key(private_key, private_key_pass)
        elif private_key.endswith('dsa') or private_key.count('dsa'):
            pkey = self._load_dsa_key(private_key, private_key_pass)
        else:
            log.warn("specified key does not end in either rsa or dsa, "
                     "trying both")
            pkey = self._load_rsa_key(private_key, private_key_pass)
            if pkey is None:
                pkey = self._load_dsa_key(private_key, private_key_pass)
        try:
            self._transport.connect(username=username, pkey=pkey)
        except paramiko.AuthenticationException:
            raise exception.SSHAuthException(username, host)
        except paramiko.SSHException, e:
            msg = e.args[0]
            raise exception.SSHError(msg)
def _eval_add_node(self):
    """
    This function uses the metrics available to it to decide whether to
    add a new node to the cluster or not. It isn't able to add a node yet.
    TODO: See if the recent jobs have taken more than 5 minutes (how long
    it takes to start an instance)
    """
    if len(self.stat.hosts) >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    need_to_add = 0
    total_slots_required = 0
    qjobs = self.stat.get_queued_jobs()
    for q in qjobs:
        total_slots_required = total_slots_required + int(q['slots'])
    qlen = len(self.stat.get_queued_jobs())
    sph = self.stat.slots_per_host()
    ts = self.stat.count_total_slots()
    num_exec_hosts = len(self.stat.hosts)
    # calculate estimated time to completion
    ettc = 0
    if num_exec_hosts > 0:
        # calculate job duration
        avg_duration = self.stat.avg_job_duration()
        ettc = avg_duration * total_slots_required / num_exec_hosts
    if total_slots_required > ts:
        if not self.has_cluster_stabilized():
            return
        # there are more jobs queued than will be consumed with one
        # cycle of job processing from all nodes
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d sec, longer than "
                     "max %d" % (age_delta.seconds,
                                 self.longest_allowed_queue_time))
            need_to_add = total_slots_required / sph if sph != 0 else 1
            if 0 < ettc < 600 and not self.stat.on_first_job():
                log.warn("There is a possibility that the job queue is "
                         "shorter than 10 minutes in duration")
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(datetime.datetime.utcnow())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
            log.info("Done adding nodes at %s" %
                     str(datetime.datetime.utcnow()))
        except Exception:
            log.error("Failed to add new host")
            log.debug(traceback.format_exc())
def _eval_add_node(self):
    """
    This function uses the metrics available to it to decide whether to
    add a new node to the cluster or not. It isn't able to add a node yet.
    TODO: See if the recent jobs have taken more than 5 minutes (how long
    it takes to start an instance)
    """
    need_to_add = 0
    if len(self.stat.hosts) >= self.max_nodes:
        log.info("Won't add another host, currently at max (%d)." %
                 self.max_nodes)
        return 0
    qlen = len(self.stat.get_queued_jobs())
    sph = self.stat.slots_per_host()
    ts = self.stat.count_total_slots()
    # calculate job duration
    avg_duration = self.stat.avg_job_duration()
    # calculate estimated time to completion
    ettc = avg_duration * qlen / len(self.stat.hosts)
    if qlen > ts:
        now = datetime.datetime.utcnow()
        if (now - self.__last_cluster_mod_time).seconds < \
                self.stabilization_time:
            log.info("Cluster change made less than %d seconds ago (%s)." %
                     (self.stabilization_time,
                      self.__last_cluster_mod_time))
            log.info("Not changing cluster size until cluster stabilizes.")
            return 0
        # there are more jobs queued than will be consumed with one
        # cycle of job processing from all nodes
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d sec, longer than "
                     "max %d." % (age_delta.seconds,
                                  self.longest_allowed_queue_time))
            need_to_add = qlen / sph
            if ettc < 600 and not self.stat.on_first_job():
                log.warn("There is a possibility that the job queue is "
                         "shorter than 10 minutes in duration.")
                # need_to_add = 0
    if need_to_add > 0:
        need_to_add = min(self.add_nodes_per_iteration, need_to_add)
        log.info("*** ADDING %d NODES." % need_to_add)
        try:
            self._cluster.add_nodes(need_to_add)
        except Exception:
            log.error("Failed to add new host.")
            log.debug(traceback.format_exc())
            return -1
        self.__last_cluster_mod_time = datetime.datetime.utcnow()
        log.info("Done adding nodes.")
    return need_to_add
def shell(self, user=None, forward_x11=False, forward_agent=False,
          command=None):
    """
    Attempts to launch an interactive shell by first trying the system's
    ssh client. If the system does not have the ssh command it falls back
    to a pure-python ssh shell.
    """
    if self.update() != 'running':
        try:
            alias = self.alias
        except exception.BaseException:
            alias = None
        label = 'instance'
        if alias == "master":
            label = "master"
            alias = "node"
        elif alias:
            label = "node"
        instance_id = alias or self.id
        raise exception.InstanceNotRunning(instance_id, self.state,
                                           label=label)
    user = user or self.user
    if utils.has_required(['ssh']):
        log.debug("Using native OpenSSH client")
        sshopts = '-i %s' % self.key_location
        if forward_x11:
            sshopts += ' -Y'
        if forward_agent:
            sshopts += ' -A'
        ssh_cmd = static.SSH_TEMPLATE % dict(opts=sshopts, user=user,
                                             host=self.dns_name)
        if command:
            command = "'source /etc/profile && %s'" % command
            ssh_cmd = ' '.join([ssh_cmd, command])
        log.debug("ssh_cmd: %s" % ssh_cmd)
        return subprocess.call(ssh_cmd, shell=True)
    else:
        log.debug("Using Pure-Python SSH client")
        if forward_x11:
            log.warn("X11 Forwarding not available in Python SSH client")
        if forward_agent:
            log.warn("Authentication agent forwarding not available in "
                     "Python SSH client")
        if command:
            orig_user = self.ssh.get_current_user()
            self.ssh.switch_user(user)
            self.ssh.execute(command, silent=False)
            self.ssh.switch_user(orig_user)
            return self.ssh.get_last_status()
        self.ssh.interactive_shell(user=user)
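# Hypothetical sketch of the static.SSH_TEMPLATE used by shell() above; the
# exact template string is an assumption, shown only to make the command
# construction concrete.
SSH_TEMPLATE = 'ssh %(opts)s %(user)s@%(host)s'
print SSH_TEMPLATE % dict(opts='-i mykey.rsa -Y', user='sgeadmin',
                          host='ec2-1-2-3-4.compute-1.amazonaws.com')
# -> ssh -i mykey.rsa -Y sgeadmin@ec2-1-2-3-4.compute-1.amazonaws.com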
def get_qatime(self, now):
    """
    This function takes the lookback window and creates a string
    representation of the past few hours, to feed to qacct to limit the
    data set qacct returns.
    """
    if self.lookback_window > 24 or self.lookback_window < 1:
        log.warn("Lookback window %d out of range (1-24). Not recommended."
                 % self.lookback_window)
    now = now - datetime.timedelta(hours=self.lookback_window)
    return now.strftime("%Y%m%d%H%M")
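# Hypothetical usage sketch: the timestamp get_qatime() returns is in
# qacct's [[CC]YY]MMDDhhmm begin-time format, so it can bound the
# accounting query to the lookback window. The exact qacct invocation is an
# assumption about the caller, not code from this file.
import datetime


def build_qacct_command(stat, now=None):
    now = now or datetime.datetime.utcnow()
    return 'qacct -j -b %s' % stat.get_qatime(now)  # e.g. 'qacct -j -b 201701011230'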
def get_settings_from_env(self, settings):
    """
    Returns AWS credentials defined in the user's shell environment.
    """
    found = {}
    for key in settings:
        if key.upper() in os.environ:
            log.warn("Setting '%s' from environment..." % key.upper())
            found[key] = os.environ.get(key.upper())
        elif key in os.environ:
            log.warn("Setting '%s' from environment..." % key)
            found[key] = os.environ.get(key)
    return found
def root_device_name(self):
    root_dev = self.instance.root_device_name
    bmap = self.block_device_mapping
    if bmap and root_dev not in bmap and self.is_ebs_backed():
        # Hack for misconfigured AMIs (e.g. CentOS 6.3 Marketplace). These
        # AMIs have their root device name set to /dev/sda1 but no
        # /dev/sda1 in the block device map - only /dev/sda. These AMIs
        # somehow magically work, so check if /dev/sda exists and return
        # that instead to prevent detach_external_volumes() from trying to
        # detach the root volume on these AMIs.
        log.warn("Root device %s is not in the block device map" %
                 root_dev)
        log.warn("This means the AMI was registered with either "
                 "an incorrect root device name or an incorrect block "
                 "device mapping")
        sda, sda1 = '/dev/sda', '/dev/sda1'
        if root_dev == sda1:
            log.info("Searching for possible root device: %s" % sda)
            if sda in self.block_device_mapping:
                log.warn("Found '%s' - assuming it's the real root device" %
                         sda)
                root_dev = sda
            else:
                log.warn("Device %s isn't in the block device map either" %
                         sda)
    return root_dev
def setup_tmuxcc(self, client=None, nodes=None, user='******',
                 layout='tiled'):
    log.info("Creating TMUX Control Center for user '%s'" % user)
    client = client or self._master
    nodes = nodes or self._nodes
    envname = self._envname
    orig_user = client.ssh._username
    if orig_user != user:
        client.ssh.connect(username=user)
    chunks = [chunk for chunk in utils.chunk_list(nodes, items=8)]
    num_windows = len(chunks) + len(nodes)
    if len(nodes) == 0:
        log.error("Cluster has no nodes, exiting...")
        return
    self.create_session(client, envname, num_windows=num_windows)
    if len(nodes) == 1 and client == nodes[0]:
        return
    if not self._supports_layout(client, envname, layout, window=0):
        log.warn("failed to select layout '%s', defaulting to "
                 "'main-vertical'" % layout)
        layout = "main-vertical"
        status = self._select_layout(client, envname, layout, window=0)
        if status != 0:
            raise exception.PluginError("failed to set a layout")
    for i, chunk in enumerate(chunks):
        self._rename_window(client, envname, i, 'all%s' % i)
        for j, node in enumerate(chunk):
            if j != 0:
                self._split_window(client, envname, i)
            self._select_layout(client, envname, window=i, layout=layout)
            if node.alias != client.alias:
                self._send_keys(client, envname,
                                cmd='ssh %s' % node.alias,
                                window="%d.%d" % (i, j))
    for i, node in enumerate(nodes):
        window = i + len(chunks)
        self._rename_window(client, envname, window, node.alias)
        if node.alias != client.alias:
            self._send_keys(client, envname, cmd='ssh %s' % node.alias,
                            window=window)
    self._select_window(client, envname, window=0)
    self._select_pane(client, envname, window=0, pane=0)
    if orig_user != user:
        client.ssh.connect(username=orig_user)
def execute(self, args):
    if not args:
        cls = [c.cluster_tag for c in
               self.cm.get_clusters(load_plugins=False,
                                    load_receipt=False)]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)
    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            raise
        except Exception, e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name, load_receipt=False,
                                         require_keys=False)
            else:
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise
        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." % cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)
        if not self.opts.confirm:
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def _warn_about_volume_hosts(self):
    sg = self.cluster_group
    vol_hosts = filter(lambda x: x.state in ['running', 'pending'],
                       sg.instances())
    vol_hosts = map(lambda x: x.id, vol_hosts)
    if vol_hosts:
        log.warn("There are still volume hosts running: %s" %
                 ', '.join(vol_hosts))
        log.warn("Run 'starcluster terminate %s' to terminate *all* "
                 "volume host instances once they're no longer needed" %
                 static.VOLUME_GROUP_NAME)
    else:
        log.info("No active volume hosts found. Run 'starcluster "
                 "terminate %(g)s' to remove the '%(g)s' group" %
                 {'g': static.VOLUME_GROUP_NAME})
def _add_to_known_hosts(self, node):
    log.info("Configuring local known_hosts file")
    user_home = os.path.expanduser("~")
    khosts = os.path.join(user_home, ".ssh", "known_hosts")
    if not os.path.isfile(khosts):
        log.warn("Unable to configure known_hosts: file does not exist")
        return
    contents = open(khosts).read()
    if node.dns_name not in contents:
        server_pkey = node.ssh.get_server_public_key()
        khostsf = open(khosts, "a")
        if contents[-1] != "\n":
            khostsf.write("\n")
        name_entry = "%s,%s" % (node.dns_name, node.ip_address)
        khostsf.write(" ".join([name_entry, server_pkey.get_name(),
                                base64.b64encode(str(server_pkey)), "\n"]))
        khostsf.close()
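# Hypothetical sketch of the known_hosts line the function above appends,
# with illustrative values: "<dns>,<ip> <key type> <base64 key blob>".
import base64

dns_name = 'ec2-1-2-3-4.compute-1.amazonaws.com'
ip_address = '1.2.3.4'
key_blob = base64.b64encode('raw-public-key-bytes')  # stands in for str(pkey)
print ' '.join(['%s,%s' % (dns_name, ip_address), 'ssh-rsa', key_blob])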
def terminate(self, cluster_name, force=False):
    if force:
        log.warn("Ignoring cluster settings due to --force option")
    try:
        cl = self.cm.get_cluster(cluster_name, load_receipt=not force,
                                 require_keys=not force)
        if force:
            self._terminate_manually(cl)
        else:
            self._terminate_cluster(cl)
    except exception.ClusterDoesNotExist:
        raise
    except Exception:
        log.error("Failed to terminate cluster!", exc_info=True)
        if not force:
            log.error("Use -f to forcefully terminate the cluster")
        raise
def remove_image_files(self, image_name, pretend=True):
    if pretend:
        log.info("Pretending to remove image files...")
    else:
        log.info('Removing image files...')
    files = self.get_image_files(image_name)
    for f in files:
        if pretend:
            log.info("Would remove file: %s" % f.name)
        else:
            log.info('Removing file %s' % f.name)
            f.delete()
    if not pretend:
        files = self.get_image_files(image_name)
        if len(files) != 0:
            log.warn('Not all files deleted, recursing...')
            self.remove_image_files(image_name, pretend)
def __init__(self, interval=60, plot=False, max_nodes=5, wait_time=900,
             add_pi=1, kill_after=45, stab=180, lookback_win=3,
             min_nodes=1):
    self._cluster = None
    self.polling_interval = interval
    self._visualizer_on = plot
    self.max_nodes = max_nodes
    self.longest_allowed_queue_time = wait_time
    self.add_nodes_per_iteration = add_pi
    self.kill_after = kill_after
    self.stabilization_time = stab
    self.lookback_window = lookback_win
    self.min_nodes = min_nodes
    self.allow_master_kill = False
    if self.longest_allowed_queue_time < 300:
        log.warn("wait_time should be >= 300 seconds "
                 "(it takes ~5 min to launch a new EC2 node)")
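# Hypothetical usage sketch: a balancer capped at 10 nodes that adds two
# nodes per iteration once a job has queued for 15 minutes. The class name
# SGELoadBalancer is an assumption; this file only shows the __init__ body.
lb = SGELoadBalancer(interval=60, plot=False, max_nodes=10, wait_time=900,
                     add_pi=2, stab=180, min_nodes=1)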
def run(self, nodes, master, user, shell, volumes):
    mssh = master.ssh
    mssh.switch_user(user)
    botocfg = '/home/%s/.boto' % user
    if not mssh.path_exists(botocfg):
        log.info("Installing AWS credentials for user: %s" % user)
        if self.boto_cfg:
            log.info("Copying %s to %s" % (self.boto_cfg, botocfg))
            mssh.put(self.boto_cfg, botocfg)
        else:
            log.info("Installing current credentials to: %s" % botocfg)
            f = mssh.remote_file(botocfg, 'w')
            f.write(BOTO_CFG_TEMPLATE % master.ec2.__dict__)
            f.close()
        mssh.chmod(0400, botocfg)
    else:
        log.warn("AWS credentials already present - skipping install")
def _warn_about_volume_hosts(self):
    sg = self.ec2.get_group_or_none(static.VOLUME_GROUP)
    if not sg:
        return
    vol_hosts = filter(lambda x: x.state in ['running', 'pending'],
                       sg.instances())
    vol_hosts = map(lambda x: x.id, vol_hosts)
    if vol_hosts:
        log.warn("There are still volume hosts running: %s" %
                 ', '.join(vol_hosts))
        log.warn("Run 'starcluster terminate %s' to terminate *all* "
                 "volume host instances once they're no longer needed" %
                 static.VOLUME_GROUP_NAME)
    else:
        log.info("No active volume hosts found. Run 'starcluster "
                 "terminate %(g)s' to remove the '%(g)s' group" %
                 {'g': static.VOLUME_GROUP_NAME})
def _install_efs_on_node(self, node):
    if not node.ssh.path_exists(self.mount_point):
        node.ssh.makedirs(self.mount_point, mode=0777)
    zone = node.ssh.execute('ec2metadata --availability-zone')[0]
    region = zone[:-1]
    name_parts = [zone, self.fs_id, 'efs', region, 'amazonaws', 'com']
    efs_dns = '.'.join(name_parts)
    mount_info = node.ssh.execute('grep %s /proc/mounts' %
                                  self.mount_point,
                                  raise_on_failure=False,
                                  ignore_exit_status=True)
    cmd = 'mount -t nfs4 -ominorversion=1 %s:/ %s' % (efs_dns,
                                                      self.mount_point)
    if mount_info:
        log.warn('%s is already a mount point' % self.mount_point)
        log.info(mount_info[0])
    else:
        node.ssh.execute(cmd)
def get_stats(self):
    """
    This method will ssh to the SGE master and get load & queue stats.
    It will feed these stats to SGEStats, which parses the XML. It will
    return two arrays: one of hosts, each host has a hash with its host
    information inside. The job array contains a hash for every job,
    containing statistics about the job name, priority, etc.
    """
    log.debug("starting get_stats")
    retries = 5
    for i in range(retries):
        try:
            return self._get_stats()
        except Exception:
            log.warn("Failed to retrieve stats (%d/%d):" %
                     (i + 1, retries), exc_info=True)
            log.warn("Retrying in %ds" % self.polling_interval)
            time.sleep(self.polling_interval)
    raise exception.BaseException(
        "Failed to retrieve SGE stats after trying %d times, exiting..." %
        retries)
def _start_notebook(self, master, user, profile_dir):
    log.info("Setting up IPython web notebook for user: %s" % user)
    user_cert = posixpath.join(profile_dir, '%s.pem' % user)
    ssl_cert = posixpath.join(profile_dir, '%s.pem' % user)
    if not master.ssh.isfile(user_cert):
        log.info("Creating SSL certificate for user %s" % user)
        ssl_subj = "/C=US/ST=SC/L=STAR/O=Dis/CN=%s" % master.dns_name
        master.ssh.execute(
            "openssl req -new -newkey rsa:4096 -days 365 "
            "-nodes -x509 -subj %s -keyout %s -out %s" %
            (ssl_subj, ssl_cert, ssl_cert))
    else:
        log.info("Using existing SSL certificate...")
    f = master.ssh.remote_file('%s/ipython_notebook_config.py' %
                               profile_dir)
    notebook_port = 8888
    sha1py = 'from IPython.lib import passwd; print passwd("%s")'
    sha1cmd = "python -c '%s'" % sha1py
    sha1pass = master.ssh.execute(sha1cmd % self.notebook_passwd)[0]
    f.write('\n'.join([
        "c = get_config()",
        "c.IPKernelApp.pylab = 'inline'",
        "c.NotebookApp.certfile = u'%s'" % ssl_cert,
        "c.NotebookApp.ip = '*'",
        "c.NotebookApp.open_browser = False",
        "c.NotebookApp.password = u'%s'" % sha1pass,
        "c.NotebookApp.port = %d" % notebook_port,
    ]))
    f.close()
    if self.notebook_directory is not None:
        if not master.ssh.path_exists(self.notebook_directory):
            master.ssh.makedirs(self.notebook_directory)
        master.ssh.execute_async(
            "ipython notebook --no-browser --notebook-dir='%s'" %
            self.notebook_directory)
    else:
        master.ssh.execute_async("ipython notebook --no-browser")
    self._authorize_port(master, notebook_port, 'notebook')
    log.info("IPython notebook URL: https://%s:%s" %
             (master.dns_name, notebook_port))
    log.info("The notebook password is: %s" % self.notebook_passwd)
    log.warn("Please check your local firewall settings if you're having "
             "issues connecting to the IPython notebook",
             extra=dict(__textwrap__=True))
def run(self, nodes, master, user, shell, volumes):
    self.config_dict["aws_access_key_id"] = master.ec2.aws_access_key_id
    self.config_dict["aws_secret_access_key"] = \
        master.ec2.aws_secret_access_key
    mssh = master.ssh
    mssh.switch_user(user)
    s3cmd_cfg = "/home/%s/.s3cfg" % user
    if not mssh.path_exists(s3cmd_cfg):
        log.info("Configuring s3cmd for user: %s" % user)
        if self.s3cmd_cfg:
            log.info("Copying %s to %s" % (self.s3cmd_cfg, s3cmd_cfg))
            mssh.put(self.s3cmd_cfg, s3cmd_cfg)
        else:
            log.info("Installing new .s3cfg to: %s" % s3cmd_cfg)
            f = mssh.remote_file(s3cmd_cfg, "w")
            f.write(s3cmd_cfg_TEMPLATE % self.config_dict)
            f.close()
        mssh.chmod(0400, s3cmd_cfg)
    else:
        log.warn("~/.s3cfg file already present - skipping install")
def _warn_about_volume_hosts(self):
    sg = self.ec2.get_group_or_none(static.VOLUME_GROUP)
    vol_hosts = []
    if sg:
        vol_hosts = filter(lambda x: x.state in ['running', 'pending'],
                           sg.instances())
    if self._instance:
        vol_hosts.append(self._instance)
    vol_hosts = list(set([h.id for h in vol_hosts]))
    if vol_hosts:
        log.warn("There are still volume hosts running: %s" %
                 ', '.join(vol_hosts))
        if not self._instance:
            log.warn("Run 'starcluster terminate -f %s' to terminate all "
                     "volume host instances" % static.VOLUME_GROUP_NAME,
                     extra=dict(__textwrap__=True))
    elif sg:
        log.info("No active volume hosts found. Run 'starcluster "
                 "terminate -f %(g)s' to remove the '%(g)s' group" %
                 {'g': static.VOLUME_GROUP_NAME},
                 extra=dict(__textwrap__=True))
def on_add_node(self, node, nodes, master, user, user_shell, volumes):
    log.warn("starting ssh tunnel")
    node.ssh.execute(
        "sudo -u sgeadmin ssh -L8999:localhost:8999 master -fN "
        "> /tmp/tunnel.stdout 2> /tmp/tunnel.stderr")
    log.warn("setting up /etc/hosts")
    update_etc_hosts(node)
    log.warn("done")
def _install_efs_on_node(self, node):
    if not node.ssh.path_exists(self.mount_point):
        node.ssh.makedirs(self.mount_point, mode=0777)
    zone = node.ssh.execute('ec2metadata --availability-zone')[0]
    region = zone[:-1]
    name_parts = [zone, self.fs_id, 'efs', region, 'amazonaws', 'com']
    efs_dns = '.'.join(name_parts)
    mount_info = node.ssh.execute('grep %s /proc/mounts' %
                                  self.mount_point,
                                  raise_on_failure=False,
                                  ignore_exit_status=True)
    if self.mount_options is None:
        mount_options = ('minorversion=1,rsize=1048576,wsize=1048576'
                         ',hard,timeo=600,retrans=2')
    else:
        mount_options = self.mount_options
    cmd = 'mount -t nfs4 -o %s %s:/ %s' % (mount_options, efs_dns,
                                           self.mount_point)
    if mount_info:
        log.warn('%s is already a mount point' % self.mount_point)
        log.info(mount_info[0])
    else:
        node.ssh.execute(cmd)
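# Worked sketch (illustrative values only) of the strings
# _install_efs_on_node() builds: the per-AZ EFS DNS name and the final
# mount command using the default NFS options above.
zone = 'us-east-1a'
fs_id = 'fs-12345678'
mount_point = '/mnt/efs'
region = zone[:-1]  # strip the AZ letter -> 'us-east-1'
efs_dns = '.'.join([zone, fs_id, 'efs', region, 'amazonaws', 'com'])
opts = ('minorversion=1,rsize=1048576,wsize=1048576,'
        'hard,timeo=600,retrans=2')
print 'mount -t nfs4 -o %s %s:/ %s' % (opts, efs_dns, mount_point)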
def load(self):
    """
    Populate this config object from the StarCluster config
    """
    log.debug('Loading config')
    try:
        self.globals = self._load_section('global', self.global_settings)
    except exception.ConfigSectionMissing:
        pass
    try:
        self.aws = self._load_section('aws info', self.aws_settings)
    except exception.ConfigSectionMissing:
        log.warn("no [aws info] section found in config")
        log.warn("attempting to load credentials from environment...")
        self.aws.update(self.get_aws_from_environ())
    self.keys = self._load_sections('key', self.key_settings)
    self.vols = self._load_sections('volume', self.volume_settings)
    self.plugins = self._load_sections('plugin', self.plugin_settings)
    self.permissions = self._load_sections('permission',
                                           self.permission_settings)
    sections = self._get_sections('cluster')
    self.clusters = self._load_cluster_sections(sections)
    return self
def setup_crontab(self, master, nodes):
    cluster_name = master.parent_cluster.name
    assert cluster_name.startswith(SECURITY_GROUP_PREFIX)
    cluster_name = cluster_name[len(SECURITY_GROUP_PREFIX):]
    domain = '%s-heartbeats' % cluster_name
    # make sure that the domain exists and the user can access it
    sdbc = boto.sdb.connect_to_region(self.region,
                                      aws_access_key_id=self.key,
                                      aws_secret_access_key=self.secret)
    assert sdbc is not None
    try:
        dom = sdbc.get_domain(domain)
    except boto.exception.SDBResponseError:
        log.warn("Creating new domain %s for heartbeats", domain)
        dom = sdbc.create_domain(domain)
    log.warn("Verifying that domain %s is accessible with non-admin "
             "credentials", domain)
    item = dom.get_item('heartbeat')
    script_template_name = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "deadmanswitch-check.template")
    script_template = open(script_template_name).read()
    # apply config settings to template
    script_body = script_template.format(**dict(
        key=self.key, secret=self.secret, topic=self.topic, domain=domain,
        region=self.region, cluster_name=cluster_name))
    script = tempfile.NamedTemporaryFile("w")
    script.write(script_body)
    script.flush()
    for node in nodes:
        node.ssh.put(script.name,
                     "/tmp/cluster_scripts/deadmanswitch-check.py")
        node.ssh.execute(
            "chmod a+xr /tmp/cluster_scripts/deadmanswitch-check.py")
        log.warn("Adding cronjob for checking deadmans switch on %s",
                 str(node))
        command = ("echo '0,10,20,30,40,50 * * * * ubuntu /usr/bin/python "
                   "/tmp/cluster_scripts/deadmanswitch-check.py "
                   "> /tmp/deadmanswitch-check.log 2>&1' "
                   "> /etc/cron.d/cluster-deadmans-switch "
                   "&& service cron reload")
        node.ssh.execute(command)
def warn_experimental(self, msg, num_secs=10):
    """
    Warn user that an experimental feature is being used.
    Counts down from num_secs before continuing.
    """
    sep = '*' * 60
    log.warn('\n'.join([sep, msg, sep]), extra=dict(__textwrap__=True))
    r = range(1, num_secs + 1)
    r.reverse()
    print
    log.warn("Waiting %d seconds before continuing..." % num_secs)
    log.warn("Press CTRL-C to cancel...")
    for i in r:
        sys.stdout.write('%d...' % i)
        sys.stdout.flush()
        time.sleep(1)
    print