def on_add_node(self, new_node, nodes, master, user, user_shell, volumes):
    log.info("Adding %s to MPICH2 hosts file" % new_node.alias)
    mpich2_hosts = master.ssh.remote_file(self.MPICH2_HOSTS, 'a')
    mpich2_hosts.write(new_node.alias + '\n')
    mpich2_hosts.close()
    log.info("Setting MPICH2 as default MPI on %s" % new_node.alias)
    self._update_alternatives(new_node)
def execute(self, args):
    if len(args) != 2:
        self.parser.error(
            'you must specify an <image_id> and <destination_directory>')
    image_id, destdir = args
    self.ec2.download_image_files(image_id, destdir)
    log.info("Finished downloading AMI: %s" % image_id)
def execute(self, args):
    if not args:
        cls = [c.cluster_tag for c in
               self.cm.get_clusters(load_plugins=False, load_receipt=False)]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)
    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            raise
        except Exception, e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name, load_receipt=False,
                                         require_keys=False)
            else:
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise
        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." % cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)
        if not self.opts.confirm:
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def execute(self, args): if "createimage" in sys.argv: warnings.warn( "createimage is deprecated and will go away in the " "next release. please use the s3image/ebsimage " "commands instead", DeprecationWarning) if len(args) != 3: self.parser.error( 'you must specify an instance-id, image name, and bucket') bucket = None instanceid, image_name, bucket = args self.bucket = bucket self.image_name = image_name i = self.ec2.get_instance(instanceid) key_location = self.cfg.get_key(i.key_name).get('key_location') aws_user_id = self.cfg.aws.get('aws_user_id') ec2_cert = self.cfg.aws.get('ec2_cert') ec2_private_key = self.cfg.aws.get('ec2_private_key') try: ami_id = self.ec2.create_s3_image(instanceid, key_location, aws_user_id, ec2_cert, ec2_private_key, bucket, image_name=image_name, **self.specified_options_dict) log.info("Your new AMI id is: %s" % ami_id) except KeyboardInterrupt: raise exception.CancelledS3ImageCreation(self.bucket, self.image_name)
def run(self, nodes, master, user, user_shell, volumes):
    self._check_ipython_installed(master)
    user_home = master.getpwnam(user).pw_dir
    profile_dir = posixpath.join(user_home, '.ipython', 'profile_default')
    master.ssh.switch_user(user)
    self._write_config(master, user, profile_dir)
    # Start the cluster and some engines on the master (leave 1
    # processor free to handle cluster house keeping)
    cfile, n_engines_master = self._start_cluster(master, profile_dir)
    # Start engines on each of the non-master nodes
    non_master_nodes = [node for node in nodes if not node.is_master()]
    for node in non_master_nodes:
        self.pool.simple_job(_start_engines,
                             (node, user, node.num_processors),
                             jobid=node.alias)
    n_engines_non_master = sum(node.num_processors
                               for node in non_master_nodes)
    if len(non_master_nodes) > 0:
        log.info("Adding %d engines on %d nodes",
                 n_engines_non_master, len(non_master_nodes))
        self.pool.wait(len(non_master_nodes))
    if self.enable_notebook:
        self._start_notebook(master, user, profile_dir)
    n_engines_total = n_engines_master + n_engines_non_master
    log.info(STARTED_MSG % dict(cluster=master.parent_cluster,
                                user=user,
                                connector_file=cfile,
                                key_location=master.key_location,
                                n_engines=n_engines_total,
                                n_nodes=len(nodes)))
    master.ssh.switch_user('root')
def cancel_command(self, signum, frame):
    """
    Exits program with return value of 1
    """
    print
    log.info("Exiting...")
    sys.exit(1)
def run(self, nodes, master, user, user_shell, volumes): self._nodes = nodes self._master = master self._user = user self._user_shell = user_shell self._volumes = volumes log.info("Creating %d cluster users" % self._num_users) newusers = self._get_newusers_batch_file(master, self._usernames, user_shell) for node in nodes: self.pool.simple_job(node.ssh.execute, ("echo -n '%s' | newusers" % newusers), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring passwordless ssh for %d cluster users" % self._num_users) pbar = self.pool.progress_bar.reset() pbar.maxval = self._num_users for i, user in enumerate(self._usernames): master.generate_key_for_user(user, auth_new_key=True, auth_conn_key=True) master.add_to_known_hosts(user, nodes) pbar.update(i + 1) pbar.finish() self._setup_scratch(nodes, self._usernames) if self._download_keys: self._download_user_keys(master, self._usernames)
def _create_sge_pe(self, name="orte", nodes=None, queue="all.q"): """ Create or update an SGE parallel environment name - name of parallel environment nodes - list of nodes to include in the parallel environment (default: all) queue - configure queue to use the new parallel environment """ mssh = self._master.ssh pe_exists = mssh.get_status('qconf -sp %s' % name) == 0 verb = 'Updating' if pe_exists else 'Creating' log.info("%s SGE parallel environment '%s'" % (verb, name)) if not nodes: nodes = self._nodes if self.master_is_exec_host else self.nodes if self.slots_per_host is None: pe_slots = sum( self.pool.map(lambda n: n.num_processors, nodes, jobid_fn=lambda n: n.alias)) else: pe_slots = self.slots_per_host * len(nodes) if not pe_exists: penv = mssh.remote_file("/tmp/pe.txt", "w") penv.write(sge.sge_pe_template % (name, pe_slots)) penv.close() mssh.execute("qconf -Ap %s" % penv.name) else: mssh.execute("qconf -mattr pe slots %s %s" % (pe_slots, name)) if queue: log.info("Adding parallel environment '%s' to queue '%s'" % (name, queue)) mssh.execute('qconf -mattr queue pe_list "%s" %s' % (name, queue))
def on_add_node(self, node, nodes, master, user, user_shell, volumes): self._nodes = nodes self._master = master self._user = user self._user_shell = user_shell self._volumes = volumes log.info("Adding %s to Condor" % node.alias) self._add_condor_node(node)
def _setup_etc_hosts(self, nodes=None):
    """ Configure /etc/hosts on all TethysCluster nodes """
    log.info("Configuring /etc/hosts on each node")
    nodes = nodes or self._nodes
    for node in nodes:
        self.pool.simple_job(node.add_to_etc_hosts, (nodes, ),
                             jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
def _upload_image(self):
    log.info('Uploading bundled image: (please be patient)')
    conn = self.host_ssh
    config_dict = self.config_dict
    conn.execute('ec2-upload-bundle -b %(bucket)s '
                 '-m /mnt/%(prefix)s.manifest.xml -a %(access_key)s '
                 '-s %(secret_key)s' % config_dict, silent=False)
def _setup_ebs_volumes(self):
    """
    Mount EBS volumes, if specified in ~/.tethyscluster/config, to /home
    """
    # setup /etc/fstab on master to use block device if specified
    master = self._master
    devices = master.get_device_map()
    for vol in self._volumes:
        vol = self._volumes[vol]
        vol_id = vol.get("volume_id")
        mount_path = vol.get('mount_path')
        device = vol.get("device")
        volume_partition = vol.get('partition')
        if not (vol_id and device and mount_path):
            log.error("missing required settings for vol %s" % vol)
            continue
        if device not in devices and device.startswith('/dev/sd'):
            # check for "correct" device in unpatched kernels
            device = device.replace('/dev/sd', '/dev/xvd')
        if device not in devices:
            log.warn("Cannot find device %s for volume %s" %
                     (device, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This usually means there was a problem "
                     "attaching the EBS volume to the master node")
            continue
        partitions = master.get_partition_map(device=device)
        if not volume_partition:
            if len(partitions) == 0:
                volume_partition = device
            elif len(partitions) == 1:
                volume_partition = partitions.popitem()[0]
            else:
                log.error(
                    "volume has more than one partition, please specify "
                    "which partition to use (e.g. partition=0, "
                    "partition=1, etc.) in the volume's config")
                continue
        elif volume_partition not in partitions:
            log.warn("Cannot find partition %s on volume %s" %
                     (volume_partition, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This either means that the volume has not "
                     "been partitioned or that the partition "
                     "specified does not exist on the volume")
            continue
        log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
        mount_map = master.get_mount_map()
        if volume_partition in mount_map:
            path, fstype, options = mount_map.get(volume_partition)
            if path != mount_path:
                log.error("Volume %s is mounted on %s, not on %s" %
                          (vol_id, path, mount_path))
            else:
                log.info("Volume %s already mounted on %s...skipping" %
                         (vol_id, mount_path))
            continue
        master.mount_device(volume_partition, mount_path)
def _load_dsa_key(self, private_key, private_key_pass=None):
    private_key_file = os.path.expanduser(private_key)
    try:
        dsa_key = get_dsa_key(key_location=private_key_file,
                              passphrase=private_key_pass)
        log.info("Using private key %s (DSA)" % private_key)
        return dsa_key
    except (paramiko.SSHException, exception.SSHError):
        log.error('invalid dsa key or passphrase specified')
def _setup_hostnames(self, nodes=None):
    """
    Set each node's hostname to their alias.
    """
    nodes = nodes or self._nodes
    log.info("Configuring hostnames...")
    for node in nodes:
        self.pool.simple_job(node.set_hostname, (), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
def on_remove_node(self, node, nodes, master, user, user_shell, volumes): self._nodes = nodes self._master = master self._user = user self._user_shell = user_shell self._volumes = volumes log.info("Removing %s from Condor peacefully..." % node.alias) master.ssh.execute("condor_off -peaceful %s" % node.alias) node.ssh.execute("pkill condor", ignore_exit_status=True)
def install_packages(self, nodes, dest='all nodes'):
    log.info("Installing Python packages on %s:" % dest)
    commands = [self.install_command % p for p in self.packages]
    for command in commands:
        log.info("$ " + command)
    cmd = "\n".join(commands)
    for node in nodes:
        self.pool.simple_job(node.ssh.execute, (cmd, ), jobid=node.alias)
    self.pool.wait(len(nodes))
def run(self, nodes, master, user, user_shell, volumes): log.info("Installing Xvfb on all nodes") for node in nodes: self.pool.simple_job(self._install_xvfb, (node), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Launching Xvfb Server on all nodes") for node in nodes: self.pool.simple_job(self._launch_xvfb, (node), jobid=node.alias) self.pool.wait(numtasks=len(nodes))
def on_remove_node(self, node, nodes, master, user, user_shell, volumes): self._nodes = nodes self._master = master self._user = user self._user_shell = user_shell self._volumes = volumes log.info("Removing %s from SGE" % node.alias) self._remove_from_sge(node) self._remove_nfs_exports(node)
def has_cluster_stabilized(self):
    now = utils.get_utc_now()
    elapsed = (now - self.__last_cluster_mod_time).seconds
    is_stabilized = not (elapsed < self.stabilization_time)
    if not is_stabilized:
        log.info("Cluster was modified less than %d seconds ago" %
                 self.stabilization_time)
        log.info("Waiting for cluster to stabilize...")
    return is_stabilized
def run(self, nodes, master, user, user_shell, volumes): log.info("Starting TMUX Control Center...") self._nodes = nodes self._master = master self._user = user self._user_shell = user_shell self._volumes = volumes self.add_to_utmp_group(master, user) self.setup_tmuxcc(user='******') self.setup_tmuxcc(user=user)
def run(self, nodes, master, user, user_shell, volumes):
    if not master.ssh.isdir(self.SGE_FRESH):
        log.error("SGE is not installed on this AMI, skipping...")
        return
    log.info("Configuring SGE...")
    self._nodes = nodes
    self._master = master
    self._user = user
    self._user_shell = user_shell
    self._volumes = volumes
    self._setup_sge()
def run(self, nodes, master, user, user_shell, volumes):
    n_total = 0
    for node in nodes:
        n_engines = node.num_processors
        if node.is_master() and n_engines > 2:
            n_engines -= 1
        self.pool.simple_job(_start_engines,
                             (node, user, n_engines, True),
                             jobid=node.alias)
        n_total += n_engines
    log.info("Restarting %d engines on %d nodes", n_total, len(nodes))
    self.pool.wait(len(nodes))
def _setup_scratch(self, nodes=None, users=None):
    """ Configure scratch space on all TethysCluster nodes """
    users = users or [self._user]
    log.info("Configuring scratch space for user(s): %s" %
             ', '.join(users), extra=dict(__textwrap__=True))
    nodes = nodes or self._nodes
    for node in nodes:
        self.pool.simple_job(self._setup_scratch_on_node, (node, users),
                             jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
def interactive_shell(self, user='******'):
    orig_user = self.get_current_user()
    self.switch_user(user)
    chan = self._invoke_shell()
    log.info('Starting Pure-Python SSH shell...')
    if HAS_TERMIOS:
        self._posix_shell(chan)
    else:
        self._windows_shell(chan)
    chan.close()
    self.switch_user(orig_user)
def _terminate_cluster(self, cl):
    if not self.opts.confirm:
        action = 'Terminate'
        if cl.is_ebs_cluster():
            action = 'Terminate EBS'
        resp = raw_input("%s cluster %s (y/n)? " %
                         (action, cl.cluster_tag))
        if resp not in ['y', 'Y', 'yes']:
            log.info("Aborting...")
            return
    cl.terminate_cluster()
def create_image(self):
    log.info("Checking for EC2 API tools...")
    self.host_ssh.check_required(['ec2-upload-bundle', 'ec2-bundle-vol'])
    self.ec2.s3.get_or_create_bucket(self.bucket)
    self._remove_image_files()
    self._bundle_image()
    self._upload_image()
    ami_id = self._register_image()
    if self.remove_image_files:
        self._remove_image_files()
    return ami_id
def _terminate_manually(self, cl):
    if not self.opts.confirm:
        resp = raw_input("Terminate cluster %s (y/n)? " % cl.cluster_tag)
        if resp not in ['y', 'Y', 'yes']:
            log.info("Aborting...")
            return
    insts = cl.cluster_group.instances()
    for inst in insts:
        log.info("Terminating %s" % (inst.id, ))
        inst.terminate()
    cl.terminate_cluster(force=True)
def _bundle_image(self):
    # run script to prepare the host
    conn = self.host_ssh
    config_dict = self.config_dict
    self._transfer_pem_files()
    self.clean_private_data()
    log.info('Creating the bundled image: (please be patient)')
    conn.execute('ec2-bundle-vol -d /mnt -k /mnt/%(private_key)s '
                 '-c /mnt/%(cert)s -p %(prefix)s -u %(userid)s '
                 '-r %(arch)s -e /root/.ssh -B %(bmap)s' % config_dict,
                 silent=False)
    self._cleanup_pem_files()
def graph_all(self):
    self.read()
    vals = {'queued': self.records.queued_jobs,
            'running': self.records.running_jobs,
            'num_hosts': self.records.hosts,
            # 'slots': self.records.slots,
            'avg_duration': self.records.avg_duration,
            'avg_wait': self.records.avg_wait,
            'avg_load': self.records.avg_load}
    for sub in vals:
        self.graph(vals[sub], sub)
    log.info("Done making graphs.")
def _mkdir(self, directory, makedirs=False):
    if not os.path.isdir(directory):
        if os.path.isfile(directory):
            raise exception.BaseException(
                "'%s' is a file not a directory" % directory)
        try:
            if makedirs:
                os.makedirs(directory)
                log.info("Created directories %s" % directory)
            else:
                os.mkdir(directory)
                log.info("Created single directory %s" % directory)
        except IOError, e:
            raise exception.BaseException(str(e))