Example #1
 def on_add_node(self, new_node, nodes, master, user, user_shell, volumes):
     log.info("Adding %s to MPICH2 hosts file" % new_node.alias)
     mpich2_hosts = master.ssh.remote_file(self.MPICH2_HOSTS, 'a')
     mpich2_hosts.write(new_node.alias + '\n')
     mpich2_hosts.close()
     log.info("Setting MPICH2 as default MPI on %s" % new_node.alias)
     self._update_alternatives(new_node)
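The on_add_node hook above, like the run and on_remove_node hooks in later examples, follows the StarCluster-style plugin interface that TethysCluster inherits. A minimal skeleton, assuming the base class lives at tethyscluster.clustersetup.ClusterSetup (the module path is an assumption):

 from tethyscluster import clustersetup

 class ExamplePlugin(clustersetup.ClusterSetup):
     """Hypothetical skeleton; hook signatures match the examples here"""
     def run(self, nodes, master, user, user_shell, volumes):
         pass  # runs once after all nodes are up

     def on_add_node(self, node, nodes, master, user, user_shell, volumes):
         pass  # runs each time a node joins a running cluster

     def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
         pass  # runs each time a node is removed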
Example #2
 def execute(self, args):
     if len(args) != 2:
         self.parser.error(
             'you must specify an <image_id> and <destination_directory>')
     image_id, destdir = args
     self.ec2.download_image_files(image_id, destdir)
     log.info("Finished downloading AMI: %s" % image_id)
Example #3
 def execute(self, args):
     if not args:
         cls = [
             c.cluster_tag for c in self.cm.get_clusters(load_plugins=False,
                                                         load_receipt=False)
         ]
         msg = "please specify a cluster"
         if cls:
             opts = ', '.join(cls)
             msg = " ".join([msg, '(options:', opts, ')'])
         self.parser.error(msg)
     for cluster_name in args:
         try:
             cl = self.cm.get_cluster(cluster_name)
         except exception.ClusterDoesNotExist:
             # an unknown cluster tag should propagate to the user as-is
             raise
         except Exception as e:
             log.debug("Failed to load cluster settings!", exc_info=True)
             log.error("Failed to load cluster settings!")
             if self.opts.force:
                 log.warn("Ignoring cluster settings due to --force option")
                 cl = self.cm.get_cluster(cluster_name,
                                          load_receipt=False,
                                          require_keys=False)
             else:
                 if not isinstance(e, exception.IncompatibleCluster):
                     log.error("Use -f to forcefully stop the cluster")
                 raise
         is_stoppable = cl.is_stoppable()
         if not is_stoppable:
             has_stoppable_nodes = cl.has_stoppable_nodes()
             if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                     "nodes. Your options are:\n\n"
                     "1. Use the --terminate-unstoppable option to "
                     "stop all 'stoppable' nodes and terminate all "
                     "'unstoppable' nodes\n\n"
                     "2. Use the 'terminate' command to destroy the "
                     "cluster.\n\nPass --help for more info." %
                     cluster_name)
             if not has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' does not contain any 'stoppable' nodes "
                     "and can only be terminated. Please use the "
                     "'terminate' command instead to destroy the cluster."
                     "\n\nPass --help for more info" % cluster_name)
         if not self.opts.confirm:
             resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
             if resp not in ['y', 'Y', 'yes']:
                 log.info("Aborting...")
                 continue
         cl.stop_cluster(self.opts.terminate_unstoppable,
                         force=self.opts.force)
         log.warn("All non-spot, EBS-backed nodes are now in a "
                  "'stopped' state")
         log.warn("You can restart this cluster by passing -x "
                  "to the 'start' command")
         log.warn("Use the 'terminate' command to *completely* "
                  "terminate this cluster")
Example #4
 def execute(self, args):
     if "createimage" in sys.argv:
         warnings.warn(
             "createimage is deprecated and will go away in the "
             "next release. please use the s3image/ebsimage "
             "commands instead", DeprecationWarning)
     if len(args) != 3:
         self.parser.error(
             'you must specify an instance-id, image name, and bucket')
     instanceid, image_name, bucket = args
     self.bucket = bucket
     self.image_name = image_name
     i = self.ec2.get_instance(instanceid)
     key_location = self.cfg.get_key(i.key_name).get('key_location')
     aws_user_id = self.cfg.aws.get('aws_user_id')
     ec2_cert = self.cfg.aws.get('ec2_cert')
     ec2_private_key = self.cfg.aws.get('ec2_private_key')
     try:
         ami_id = self.ec2.create_s3_image(instanceid,
                                           key_location,
                                           aws_user_id,
                                           ec2_cert,
                                           ec2_private_key,
                                           bucket,
                                           image_name=image_name,
                                           **self.specified_options_dict)
         log.info("Your new AMI id is: %s" % ami_id)
     except KeyboardInterrupt:
         raise exception.CancelledS3ImageCreation(self.bucket,
                                                  self.image_name)
Example #5
 def run(self, nodes, master, user, user_shell, volumes):
     self._check_ipython_installed(master)
     user_home = master.getpwnam(user).pw_dir
     profile_dir = posixpath.join(user_home, '.ipython', 'profile_default')
     master.ssh.switch_user(user)
     self._write_config(master, user, profile_dir)
     # Start the cluster and some engines on the master (leave 1
     # processor free to handle cluster house keeping)
     cfile, n_engines_master = self._start_cluster(master, profile_dir)
     # Start engines on each of the non-master nodes
     non_master_nodes = [node for node in nodes if not node.is_master()]
     for node in non_master_nodes:
         self.pool.simple_job(_start_engines,
                              (node, user, node.num_processors),
                              jobid=node.alias)
     n_engines_non_master = sum(node.num_processors
                                for node in non_master_nodes)
     if non_master_nodes:
         log.info("Adding %d engines on %d nodes", n_engines_non_master,
                  len(non_master_nodes))
         self.pool.wait(len(non_master_nodes))
     if self.enable_notebook:
         self._start_notebook(master, user, profile_dir)
     n_engines_total = n_engines_master + n_engines_non_master
     log.info(STARTED_MSG % dict(cluster=master.parent_cluster,
                                 user=user,
                                 connector_file=cfile,
                                 key_location=master.key_location,
                                 n_engines=n_engines_total,
                                 n_nodes=len(nodes)))
     master.ssh.switch_user('root')
Example #6
 def cancel_command(self, signum, frame):
     """
     Exits program with return value of 1
     """
     print  # move to a fresh line after the echoed ^C
     log.info("Exiting...")
     sys.exit(1)
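A handler like this is typically installed with the standard signal module so that Ctrl-C exits cleanly. A self-contained sketch (Python 2, matching the bare print above):

 import signal
 import sys

 def cancel_command(signum, frame):
     """Exits program with return value of 1"""
     print  # move to a fresh line after the echoed ^C
     sys.exit(1)

 # route SIGINT (Ctrl-C) to the handler
 signal.signal(signal.SIGINT, cancel_command)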
Example #7
 def run(self, nodes, master, user, user_shell, volumes):
     self._nodes = nodes
     self._master = master
     self._user = user
     self._user_shell = user_shell
     self._volumes = volumes
     log.info("Creating %d cluster users" % self._num_users)
     newusers = self._get_newusers_batch_file(master, self._usernames,
                                              user_shell)
     for node in nodes:
         self.pool.simple_job(node.ssh.execute,
                              ("echo -n '%s' | newusers" % newusers),
                              jobid=node.alias)
     self.pool.wait(numtasks=len(nodes))
     log.info("Configuring passwordless ssh for %d cluster users" %
              self._num_users)
     pbar = self.pool.progress_bar.reset()
     pbar.maxval = self._num_users
     for i, user in enumerate(self._usernames):
         master.generate_key_for_user(user, auth_new_key=True,
                                      auth_conn_key=True)
         master.add_to_known_hosts(user, nodes)
         pbar.update(i + 1)
     pbar.finish()
     self._setup_scratch(nodes, self._usernames)
     if self._download_keys:
         self._download_user_keys(master, self._usernames)
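_get_newusers_batch_file is not shown here; newusers(8) consumes one passwd-style record per line, so a hypothetical reimplementation could look like this (the uid range and password field are placeholders):

 def get_newusers_batch_file(usernames, shell, uid_start=1001):
     # hypothetical sketch: newusers(8) expects
     # name:passwd:uid:gid:gecos:homedir:shell per line
     lines = []
     for uid, name in enumerate(usernames, uid_start):
         lines.append('%s:x:%d:%d::/home/%s:%s' % (name, uid, uid, name, shell))
     return '\n'.join(lines)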
Example #8
    def _create_sge_pe(self, name="orte", nodes=None, queue="all.q"):
        """
        Create or update an SGE parallel environment

        name - name of parallel environment
        nodes - list of nodes to include in the parallel environment
                (default: all)
        queue - configure queue to use the new parallel environment
        """
        mssh = self._master.ssh
        pe_exists = mssh.get_status('qconf -sp %s' % name) == 0
        verb = 'Updating' if pe_exists else 'Creating'
        log.info("%s SGE parallel environment '%s'" % (verb, name))
        if not nodes:
            nodes = self._nodes if self.master_is_exec_host else self.nodes
        if self.slots_per_host is None:
            pe_slots = sum(
                self.pool.map(lambda n: n.num_processors,
                              nodes,
                              jobid_fn=lambda n: n.alias))
        else:
            pe_slots = self.slots_per_host * len(nodes)
        if not pe_exists:
            penv = mssh.remote_file("/tmp/pe.txt", "w")
            penv.write(sge.sge_pe_template % (name, pe_slots))
            penv.close()
            mssh.execute("qconf -Ap %s" % penv.name)
        else:
            mssh.execute("qconf -mattr pe slots %s %s" % (pe_slots, name))
        if queue:
            log.info("Adding parallel environment '%s' to queue '%s'" %
                     (name, queue))
            mssh.execute('qconf -mattr queue pe_list "%s" %s' % (name, queue))
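sge.sge_pe_template is not shown above; based on the qconf parallel-environment format, a plausible definition with the two %s fields the write call fills in (allocation_rule in particular is a guess):

sge_pe_template = """pe_name %s
slots %s
user_lists NONE
xuser_lists NONE
start_proc_args /bin/true
stop_proc_args /bin/true
allocation_rule $round_robin
control_slaves TRUE
job_is_first_task FALSE
urgency_slots min
accounting_summary FALSE"""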
Example #9
 def on_add_node(self, node, nodes, master, user, user_shell, volumes):
     self._nodes = nodes
     self._master = master
     self._user = user
     self._user_shell = user_shell
     self._volumes = volumes
     log.info("Adding %s to Condor" % node.alias)
     self._add_condor_node(node)
Example #10
 def _setup_etc_hosts(self, nodes=None):
     """ Configure /etc/hosts on all TethysCluster nodes"""
     log.info("Configuring /etc/hosts on each node")
     nodes = nodes or self._nodes
     for node in nodes:
         self.pool.simple_job(node.add_to_etc_hosts, (nodes, ),
                              jobid=node.alias)
     self.pool.wait(numtasks=len(nodes))
Example #11
 def _upload_image(self):
     log.info('Uploading bundled image: (please be patient)')
     conn = self.host_ssh
     config_dict = self.config_dict
     conn.execute('ec2-upload-bundle -b %(bucket)s '
                  '-m /mnt/%(prefix)s.manifest.xml -a %(access_key)s '
                  '-s %(secret_key)s' % config_dict,
                  silent=False)
Example #12
 def _setup_ebs_volumes(self):
     """
     Mount any EBS volumes specified in ~/.tethyscluster/config at their
     configured mount paths
     """
     # setup /etc/fstab on master to use block device if specified
     master = self._master
     devices = master.get_device_map()
     for vol_name in self._volumes:
         vol = self._volumes[vol_name]
         vol_id = vol.get("volume_id")
         mount_path = vol.get('mount_path')
         device = vol.get("device")
         volume_partition = vol.get('partition')
         if not (vol_id and device and mount_path):
             log.error("missing required settings for vol %s" % vol)
             continue
         if device not in devices and device.startswith('/dev/sd'):
             # check for "correct" device in unpatched kernels
             device = device.replace('/dev/sd', '/dev/xvd')
             if device not in devices:
                 log.warn("Cannot find device %s for volume %s" %
                          (device, vol_id))
                 log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                 log.warn("This usually means there was a problem "
                          "attaching the EBS volume to the master node")
                 continue
         partitions = master.get_partition_map(device=device)
         if not volume_partition:
             if len(partitions) == 0:
                 volume_partition = device
             elif len(partitions) == 1:
                 volume_partition = partitions.popitem()[0]
             else:
                 log.error(
                     "volume has more than one partition, please specify "
                     "which partition to use (e.g. partition=0, "
                     "partition=1, etc.) in the volume's config")
                 continue
         elif volume_partition not in partitions:
             log.warn("Cannot find partition %s on volume %s" %
                      (volume_partition, vol_id))
             log.warn("Not mounting %s on %s" % (vol_id, mount_path))
             log.warn("This either means that the volume has not "
                      "been partitioned or that the partition "
                      "specified does not exist on the volume")
             continue
         log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
         mount_map = master.get_mount_map()
         if volume_partition in mount_map:
             path, fstype, options = mount_map.get(volume_partition)
             if path != mount_path:
                 log.error("Volume %s is mounted on %s, not on %s" %
                           (vol_id, path, mount_path))
             else:
                 log.info("Volume %s already mounted on %s...skipping" %
                          (vol_id, mount_path))
             continue
         master.mount_device(volume_partition, mount_path)
Example #13
 def _load_dsa_key(self, private_key, private_key_pass=None):
     private_key_file = os.path.expanduser(private_key)
     try:
         dsa_key = get_dsa_key(key_location=private_key_file,
                               passphrase=private_key_pass)
         log.info("Using private key %s (DSA)" % private_key)
         return dsa_key
     except (paramiko.SSHException, exception.SSHError):
         log.error('invalid dsa key or passphrase specified')
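get_dsa_key is presumably a thin wrapper over paramiko's DSS key loader, which raises SSHException on a bad key or passphrase, matching the except clause above. An assumed sketch:

 import paramiko

 def get_dsa_key(key_location, passphrase=None):
     # assumed implementation: load a DSA private key from disk,
     # decrypting with the passphrase if one is given
     return paramiko.DSSKey.from_private_key_file(key_location,
                                                  password=passphrase)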
Example #14
 def _setup_hostnames(self, nodes=None):
     """
     Set each node's hostname to their alias.
     """
     nodes = nodes or self._nodes
     log.info("Configuring hostnames...")
     for node in nodes:
         self.pool.simple_job(node.set_hostname, (), jobid=node.alias)
     self.pool.wait(numtasks=len(nodes))
Example #15
 def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
     self._nodes = nodes
     self._master = master
     self._user = user
     self._user_shell = user_shell
     self._volumes = volumes
     log.info("Removing %s from Condor peacefully..." % node.alias)
     master.ssh.execute("condor_off -peaceful %s" % node.alias)
     node.ssh.execute("pkill condor", ignore_exit_status=True)
Example #16
 def install_packages(self, nodes, dest='all nodes'):
     log.info("Installing Python packages on %s:" % dest)
     commands = [self.install_command % p for p in self.packages]
     for command in commands:
         log.info("$ " + command)
     cmd = "\n".join(commands)
     for node in nodes:
         self.pool.simple_job(node.ssh.execute, (cmd, ), jobid=node.alias)
     self.pool.wait(len(nodes))
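install_command and packages are attributes defined elsewhere on the plugin; illustrative values consistent with the loop above:

 # hypothetical attribute values: one shell command is rendered per package
 install_command = 'pip install %s'
 packages = ['numpy', 'scipy']
 # install_packages joins the rendered commands into one remote script:
 print "\n".join(install_command % p for p in packages)
 # pip install numpy
 # pip install scipy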
Example #17
 def run(self, nodes, master, user, user_shell, volumes):
     log.info("Installing Xvfb on all nodes")
     for node in nodes:
         self.pool.simple_job(self._install_xvfb, (node, ), jobid=node.alias)
     self.pool.wait(numtasks=len(nodes))
     log.info("Launching Xvfb Server on all nodes")
     for node in nodes:
         self.pool.simple_job(self._launch_xvfb, (node, ), jobid=node.alias)
     self.pool.wait(numtasks=len(nodes))
Example #18
 def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
     self._nodes = nodes
     self._master = master
     self._user = user
     self._user_shell = user_shell
     self._volumes = volumes
     log.info("Removing %s from SGE" % node.alias)
     self._remove_from_sge(node)
     self._remove_nfs_exports(node)
Example #19
 def has_cluster_stabilized(self):
     now = utils.get_utc_now()
     elapsed = (now - self.__last_cluster_mod_time).total_seconds()
     is_stabilized = not (elapsed < self.stabilization_time)
     if not is_stabilized:
         log.info("Cluster was modified less than %d seconds ago" %
                  self.stabilization_time)
         log.info("Waiting for cluster to stabilize...")
     return is_stabilized
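A hedged usage sketch: callers would typically poll this check before acting, e.g. a load balancer waiting out the stabilization window (the balancer argument is hypothetical; only has_cluster_stabilized comes from the example):

 import time

 def wait_until_stabilized(balancer, interval=30):
     # poll until the cluster has settled; the interval is illustrative
     while not balancer.has_cluster_stabilized():
         time.sleep(interval)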
Example #20
 def run(self, nodes, master, user, user_shell, volumes):
     log.info("Starting TMUX Control Center...")
     self._nodes = nodes
     self._master = master
     self._user = user
     self._user_shell = user_shell
     self._volumes = volumes
     self.add_to_utmp_group(master, user)
     self.setup_tmuxcc(user='******')
     self.setup_tmuxcc(user=user)
Example #21
 def run(self, nodes, master, user, user_shell, volumes):
     if not master.ssh.isdir(self.SGE_FRESH):
         log.error("SGE is not installed on this AMI, skipping...")
         return
     log.info("Configuring SGE...")
     self._nodes = nodes
     self._master = master
     self._user = user
     self._user_shell = user_shell
     self._volumes = volumes
     self._setup_sge()
Example #22
 def run(self, nodes, master, user, user_shell, volumes):
     n_total = 0
     for node in nodes:
         n_engines = node.num_processors
         if node.is_master() and n_engines > 2:
             n_engines -= 1
         self.pool.simple_job(_start_engines, (node, user, n_engines, True),
                              jobid=node.alias)
         n_total += n_engines
     log.info("Restarting %d engines on %d nodes", n_total, len(nodes))
     self.pool.wait(len(nodes))
Example #23
 def _setup_scratch(self, nodes=None, users=None):
     """ Configure scratch space on all TethysCluster nodes """
     users = users or [self._user]
     log.info("Configuring scratch space for user(s): %s" %
              ', '.join(users),
              extra=dict(__textwrap__=True))
     nodes = nodes or self._nodes
     for node in nodes:
         self.pool.simple_job(self._setup_scratch_on_node, (node, users),
                              jobid=node.alias)
     self.pool.wait(numtasks=len(nodes))
Example #24
 def interactive_shell(self, user='******'):
     orig_user = self.get_current_user()
     self.switch_user(user)
     chan = self._invoke_shell()
     log.info('Starting Pure-Python SSH shell...')
     if HAS_TERMIOS:
         self._posix_shell(chan)
     else:
         self._windows_shell(chan)
     chan.close()
     self.switch_user(orig_user)
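HAS_TERMIOS is presumably set by an import guard, as in paramiko's interactive-shell demo, so POSIX hosts get the termios-based shell and Windows falls back to the msvcrt one. A sketch:

 try:
     import termios
     import tty
     HAS_TERMIOS = True
 except ImportError:
     # Windows has no termios; _windows_shell polls with msvcrt instead
     HAS_TERMIOS = False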
Example #25
 def _terminate_cluster(self, cl):
     if not self.opts.confirm:
         action = 'Terminate'
         if cl.is_ebs_cluster():
             action = 'Terminate EBS'
         resp = raw_input("%s cluster %s (y/n)? " %
                          (action, cl.cluster_tag))
         if resp not in ['y', 'Y', 'yes']:
             log.info("Aborting...")
             return
     cl.terminate_cluster()
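Examples #3 and #25 (and #27 below) repeat the same y/n prompt; a hedged refactoring sketch of a shared helper (Python 2, hence raw_input):

 def confirm(prompt):
     # only an explicit yes proceeds; anything else aborts
     return raw_input(prompt) in ('y', 'Y', 'yes')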
Example #26
 def create_image(self):
     log.info("Checking for EC2 API tools...")
     self.host_ssh.check_required(['ec2-upload-bundle', 'ec2-bundle-vol'])
     self.ec2.s3.get_or_create_bucket(self.bucket)
     self._remove_image_files()
     self._bundle_image()
     self._upload_image()
     ami_id = self._register_image()
     if self.remove_image_files:
         self._remove_image_files()
     return ami_id
Example #27
 def _terminate_manually(self, cl):
     if not self.opts.confirm:
         resp = raw_input("Terminate cluster %s (y/n)? " % cl.cluster_tag)
         if resp not in ['y', 'Y', 'yes']:
             log.info("Aborting...")
             return
     insts = cl.cluster_group.instances()
     for inst in insts:
         log.info("Terminating %s" % (inst.id, ))
         inst.terminate()
     cl.terminate_cluster(force=True)
Example #28
 def _bundle_image(self):
     # run script to prepare the host
     conn = self.host_ssh
     config_dict = self.config_dict
     self._transfer_pem_files()
     self.clean_private_data()
     log.info('Creating the bundled image: (please be patient)')
     conn.execute('ec2-bundle-vol -d /mnt -k /mnt/%(private_key)s '
                  '-c /mnt/%(cert)s -p %(prefix)s -u %(userid)s '
                  '-r %(arch)s -e /root/.ssh -B %(bmap)s' % config_dict,
                  silent=False)
     self._cleanup_pem_files()
Example #29
 def graph_all(self):
     self.read()
     vals = {'queued': self.records.queued_jobs,
             'running': self.records.running_jobs,
             'num_hosts': self.records.hosts,
             # 'slots': self.records.slots,
             'avg_duration': self.records.avg_duration,
             'avg_wait': self.records.avg_wait,
             'avg_load': self.records.avg_load}
     for sub in vals:
         self.graph(vals[sub], sub)
     log.info("Done making graphs.")
Example #30
 def _mkdir(self, directory, makedirs=False):
     if not os.path.isdir(directory):
         if os.path.isfile(directory):
             raise exception.BaseException("'%s' is a file not a directory")
         try:
             if makedirs:
                 os.makedirs(directory)
                 log.info("Created directories %s" % directory)
             else:
                 os.mkdir(directory)
                 log.info("Created single directory %s" % directory)
         except (IOError, OSError) as e:
             raise exception.BaseException(str(e))