Example #1
 def __init__(self, num_users=None, usernames=None, download_keys=None,
              download_keys_dir=None):
     if usernames:
         usernames = [user.strip() for user in usernames.split(',')]
     if num_users:
         try:
             num_users = int(num_users)
         except ValueError:
             raise exception.BaseException("num_users must be an integer")
     elif usernames:
         num_users = len(usernames)
     else:
         raise exception.BaseException(
             "you must provide num_users or usernames or both")
     if usernames and num_users and len(usernames) != num_users:
         raise exception.BaseException(
             "only %d usernames provided - %d required" %
             (len(usernames), num_users))
     self._num_users = num_users
     if not usernames:
         usernames = ['user%.3d' % i for i in range(1, num_users + 1)]
     self._usernames = usernames
     self._download_keys = str(download_keys).lower() == "true"
     self._download_keys_dir = download_keys_dir or self.DOWNLOAD_KEYS_DIR
     super(CreateUsers, self).__init__()
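
The constructor above accepts an explicit user count, a comma-separated list of usernames, or both, and falls back to generated names of the form user001, user002, and so on. A minimal standalone sketch of that reconciliation logic (validation and error handling omitted; normalize_users is our name, not TethysCluster's):

 def normalize_users(num_users=None, usernames=None):
     # Mirror of the logic in __init__ above, without the error checks.
     if usernames:
         usernames = [u.strip() for u in usernames.split(',')]
     if num_users is None and usernames:
         num_users = len(usernames)
     if not usernames:
         # default names: user001, user002, ...
         usernames = ['user%.3d' % i for i in range(1, num_users + 1)]
     return num_users, usernames

 print(normalize_users(num_users=3))
 # -> (3, ['user001', 'user002', 'user003'])
 print(normalize_users(usernames='alice, bob'))
 # -> (2, ['alice', 'bob'])
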
Example #2
 def execute(self, args):
     if not args:
         cls = [
             c.cluster_tag for c in self.cm.get_clusters(load_plugins=False,
                                                         load_receipt=False)
         ]
         msg = "please specify a cluster"
         if cls:
             opts = ', '.join(cls)
             msg = " ".join([msg, '(options:', opts, ')'])
         self.parser.error(msg)
     for cluster_name in args:
         try:
             cl = self.cm.get_cluster(cluster_name)
         except exception.ClusterDoesNotExist:
             raise
         except Exception as e:
             log.debug("Failed to load cluster settings!", exc_info=True)
             log.error("Failed to load cluster settings!")
             if self.opts.force:
                 log.warn("Ignoring cluster settings due to --force option")
                 cl = self.cm.get_cluster(cluster_name,
                                          load_receipt=False,
                                          require_keys=False)
             else:
                 if not isinstance(e, exception.IncompatibleCluster):
                     log.error("Use -f to forcefully stop the cluster")
                 raise
         is_stoppable = cl.is_stoppable()
         if not is_stoppable:
             has_stoppable_nodes = cl.has_stoppable_nodes()
             if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                     "nodes. Your options are:\n\n"
                     "1. Use the --terminate-unstoppable option to "
                     "stop all 'stoppable' nodes and terminate all "
                     "'unstoppable' nodes\n\n"
                     "2. Use the 'terminate' command to destroy the "
                     "cluster.\n\nPass --help for more info." %
                     cluster_name)
             if not has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' does not contain any 'stoppable' nodes "
                     "and can only be terminated. Please use the "
                     "'terminate' command instead to destroy the cluster."
                     "\n\nPass --help for more info" % cluster_name)
         if not self.opts.confirm:
             resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
             if resp not in ['y', 'Y', 'yes']:
                 log.info("Aborting...")
                 continue
         cl.stop_cluster(self.opts.terminate_unstoppable,
                         force=self.opts.force)
         log.warn("All non-spot, EBS-backed nodes are now in a "
                  "'stopped' state")
         log.warn("You can restart this cluster by passing -x "
                  "to the 'start' command")
         log.warn("Use the 'terminate' command to *completely* "
                  "terminate this cluster")
Example #3
 def _mkdir(self, directory, makedirs=False):
     if not os.path.isdir(directory):
         if os.path.isfile(directory):
             raise exception.BaseException("'%s' is a file not a directory")
         try:
             if makedirs:
                 os.makedirs(directory)
                 log.info("Created directories %s" % directory)
             else:
                 os.mkdir(directory)
                 log.info("Created single directory %s" % directory)
         except (IOError, OSError) as e:
             raise exception.BaseException(str(e))
Example #4
 def get(self, remotepaths, localpath=''):
     """
     Copies one or more files from the remote host to the local host.
     """
     remotepaths = self._make_list(remotepaths)
     localpath = localpath or os.getcwd()
     globs = []
     noglobs = []
     for rpath in remotepaths:
         if glob.has_magic(rpath):
             globs.append(rpath)
         else:
             noglobs.append(rpath)
     globresults = [self.glob(g) for g in globs]
     remotepaths = noglobs
     for globresult in globresults:
         remotepaths.extend(globresult)
     recursive = False
     for rpath in remotepaths:
         if not self.path_exists(rpath):
             raise exception.BaseException(
                 "Remote file or directory does not exist: %s" % rpath)
     for rpath in remotepaths:
         if self.isdir(rpath):
             recursive = True
             break
     try:
         self.scp.get(remotepaths, local_path=localpath,
                      recursive=recursive)
     except Exception as e:
         log.debug("get failed: remotepaths=%s, localpath=%s",
                   str(remotepaths), localpath)
         raise exception.SCPException(str(e))
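
The get() method above separates remote paths into glob patterns (expanded remotely via self.glob()) and literal paths before handing everything to scp. glob.has_magic() is a long-standing helper in the standard library's glob module; the example paths below are made up:

 import glob

 remotepaths = ['/data/run1.log', '/data/*.csv', '/etc/hosts']
 patterns = [p for p in remotepaths if glob.has_magic(p)]
 literals = [p for p in remotepaths if not glob.has_magic(p)]
 print(patterns)   # ['/data/*.csv']  -- expanded remotely
 print(literals)   # ['/data/run1.log', '/etc/hosts'] -- used as-is
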
Example #5
 def _validate_dir(self, dirname, msg_prefix=""):
     if not os.path.isdir(dirname):
         msg = "'%s' is not a directory"
         if not os.path.exists(dirname):
             msg = "'%s' does not exist"
         if msg_prefix:
             msg = ' '.join([msg_prefix, msg])
         msg = msg % dirname
         raise exception.BaseException(msg)
Example #6
 def execute(self, args):
     if len(args) < 3:
         self.parser.error("please specify a cluster, local files or " +
                           "directories, and a remote destination path")
     ctag = args[0]
     rpath = args[-1]
     lpaths = args[1:-1]
     for lpath in lpaths:
         if not os.path.exists(lpath):
             raise exception.BaseException(
                 "Local file or directory does not exist: %s" % lpath)
     cl = self.cm.get_cluster(ctag, load_receipt=False)
     node = cl.get_node(self.opts.node)
     if self.opts.user:
         node.ssh.switch_user(self.opts.user)
     if len(lpaths) > 1 and not node.ssh.isdir(rpath):
         raise exception.BaseException(
             "Remote path must be an existing directory when copying "
             "multiple paths: %s" % rpath)
     node.ssh.put(lpaths, rpath)
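
The command above treats the first positional argument as the cluster tag, the last as the remote destination, and everything in between as local source paths. A quick sketch of that slicing with made-up arguments:

 args = ['mycluster', 'data.csv', 'scripts/', '/home/sgeadmin/']
 ctag, lpaths, rpath = args[0], args[1:-1], args[-1]
 print(ctag)    # 'mycluster'
 print(lpaths)  # ['data.csv', 'scripts/']
 print(rpath)   # '/home/sgeadmin/'
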
Example #7
 def write_stats_to_csv(self, filename):
     """
     Write important SGE stats to CSV file
     Appends one line to the CSV
     """
     bits = self.get_all_stats()
     try:
         f = open(filename, 'a')
         flat = ','.join(str(n) for n in bits) + '\n'
         f.write(flat)
         f.close()
     except IOError, e:
         raise exception.BaseException(str(e))
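
Each call to write_stats_to_csv() appends exactly one comma-joined row built from whatever get_all_stats() returns, so the CSV grows by one line per polling iteration. A sketch of the flattening with made-up values:

 bits = [4, 12, 0.75, '2014-01-01 12:00:00']
 flat = ','.join(str(n) for n in bits) + '\n'
 print(repr(flat))  # '4,12,0.75,2014-01-01 12:00:00\n'
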
Example #8
 @property
 def visualizer(self):
     if not self._visualizer:
         try:
             from tethyscluster.balancers.sge import visualizer
         except ImportError as e:
             log.error("Error importing visualizer:")
             log.error(str(e))
             log.error("check that matplotlib and numpy are installed and:")
             log.error("   $ python -c 'import matplotlib'")
             log.error("   $ python -c 'import numpy'")
             log.error("completes without error")
             raise exception.BaseException(
                 "Failed to load stats visualizer")
         self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                     self.plot_output_dir)
     return self._visualizer
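
The property above defers importing the matplotlib/numpy-backed visualizer until plotting is actually requested and turns an ImportError into a friendlier message. A generic sketch of that lazy-import pattern; 'heavyplotlib' is a hypothetical optional dependency, not a real package:

 _plotter = None

 def get_plotter():
     global _plotter
     if _plotter is None:
         try:
             import heavyplotlib  # hypothetical optional dependency
         except ImportError as e:
             raise RuntimeError("plotting support not installed: %s" % e)
         _plotter = heavyplotlib
     return _plotter
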
Example #9
 def execute(self, args):
     if len(args) < 3:
         self.parser.error("please specify a cluster, remote file or " +
                           "directory, and a local destination path")
     ctag = args[0]
     lpath = args[-1]
     rpaths = args[1:-1]
     cl = self.cm.get_cluster(ctag, load_receipt=False)
     node = cl.get_node(self.opts.node)
     if self.opts.user:
         node.ssh.switch_user(self.opts.user)
     for rpath in rpaths:
         if not glob.has_magic(rpath) and not node.ssh.path_exists(rpath):
             raise exception.BaseException(
                 "Remote file or directory does not exist: %s" % rpath)
     node.ssh.get(rpaths, lpath)
Example #10
 def slots_per_host(self):
     """
     Returns the number of slots per host. Raises an exception if the
     cluster is inconsistent, for example if it mixes m1.large and
     m1.small instances in the same cluster.
     """
     total = self.count_total_slots()
     if total == 0:
         return total
     single = 0
     for q in self.queues:
         if q.startswith('all.q@'):
             single = self.queues.get(q).get('slots')
             break
     if total != single * len(self.hosts):
         raise exception.BaseException(
             "ERROR: Number of slots not consistent across cluster")
     return single
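
The check in slots_per_host() asserts that the cluster-wide slot total equals the per-host slot count of the first all.q@ queue times the number of hosts. A toy illustration with made-up numbers:

 hosts = ['node001', 'node002', 'node003', 'node004']
 slots_per_host = 8
 total_slots = 32
 assert total_slots == slots_per_host * len(hosts)  # consistent cluster
 # A mixed cluster (say one 4-slot host and three 8-slot hosts) would give
 # a total of 28 != 8 * 4 and trigger the BaseException above.
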
Example #11
 def get_stats(self):
     """
     This method will ssh to the SGE master and get load & queue stats. It
     feeds these stats to SGEStats, which parses the XML. It returns two
     lists: one of hosts, where each host is a hash of host information,
     and one of jobs, where each job is a hash of statistics such as the
     job's name, priority, etc.
     """
     log.debug("starting get_stats")
     retries = 5
     for i in range(retries):
         try:
             return self._get_stats()
         except Exception:
             log.warn("Failed to retrieve stats (%d/%d):" %
                      (i + 1, retries), exc_info=True)
             log.warn("Retrying in %ds" % self.polling_interval)
             time.sleep(self.polling_interval)
     raise exception.BaseException(
         "Failed to retrieve SGE stats after trying %d times, exiting..." %
         retries)
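
get_stats() wraps the real work in a bounded retry loop with a sleep between attempts and only gives up after the last retry fails. A generic sketch of that pattern; fetch and the delay are placeholders, not TethysCluster names:

 import time

 def retry(fetch, retries=5, delay=60):
     for i in range(retries):
         try:
             return fetch()
         except Exception:
             print("attempt %d/%d failed, retrying in %ds" %
                   (i + 1, retries, delay))
             time.sleep(delay)
     raise RuntimeError("giving up after %d attempts" % retries)
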
Example #12
 def _setup_cluster_user(self, user=None):
     """
     Create cluster user on all TethysCluster nodes

     This command takes care to examine existing folders in /home
     and set the new cluster_user's uid/gid accordingly. This is necessary
     for the case of EBS volumes containing /home with large amounts of data
     in them. It's much less expensive in this case to set the uid/gid of
     the new user to be the existing uid/gid of the dir in EBS rather than
     chowning potentially terabytes of data.
     """
     user = user or self._user
     uid, gid = self._get_new_user_id(user)
     if uid == 0 or gid == 0:
         raise exception.BaseException(
             "Cannot create user: {0:s} (uid: {1:1d}, gid: {2:1d}). This "
             "is caused by /home/{0:s} directory being owned by root. To "
             "fix this you'll need to create a new AMI. Note that the "
             "instance is still up.".format(user, uid, gid))
     log.info("Creating cluster user: %s (uid: %d, gid: %d)" %
              (user, uid, gid))
     self._add_user_to_nodes(uid, gid, self._nodes)
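
As the docstring explains, the new user's uid/gid are taken from any pre-existing /home/<user> directory so that terabytes of EBS-backed data never need to be chowned. One way such a lookup could be done with os.stat(); this is an illustration, not TethysCluster's actual _get_new_user_id() implementation:

 import os

 def existing_home_ids(user, home_root='/home'):
     home = os.path.join(home_root, user)
     if os.path.isdir(home):
         st = os.stat(home)
         return st.st_uid, st.st_gid
     return None  # no existing home dir; a fresh uid/gid can be chosen
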
Example #13
 def run(self, cluster):
     """
     This function will loop indefinitely, using SGELoadBalancer.get_stats()
     to get the cluster's status. It looks at the job queue and tries to
     decide whether to add or remove a node. It does not yet take job
     durations into account.
     """
     self._cluster = cluster
     if self.max_nodes is None:
         self.max_nodes = cluster.cluster_size
     if self.min_nodes is None:
         self.min_nodes = 1
     if self.kill_cluster:
         self.min_nodes = 0
     if self.min_nodes > self.max_nodes:
         raise exception.BaseException(
             "min_nodes cannot be greater than max_nodes")
     use_default_stats_file = self.dump_stats and not self.stats_file
     use_default_plots_dir = self.plot_stats and not self.plot_output_dir
     if use_default_stats_file or use_default_plots_dir:
         self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
     if not self.stats_file:
         self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
     if not self.plot_output_dir:
         self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
     if not cluster.is_cluster_up():
         raise exception.ClusterNotRunning(cluster.cluster_tag)
     if self.dump_stats:
         if os.path.isdir(self.stats_file):
             raise exception.BaseException("stats file destination '%s' is"
                                           " a directory" % self.stats_file)
         sfdir = os.path.dirname(os.path.abspath(self.stats_file))
         self._validate_dir(sfdir, msg_prefix="stats file destination")
     if self.plot_stats:
         if os.path.isfile(self.plot_output_dir):
             raise exception.BaseException("plot output destination '%s' "
                                           "is a file" %
                                           self.plot_output_dir)
         self._validate_dir(self.plot_output_dir,
                            msg_prefix="plot output destination")
     raw = dict(__raw__=True)
     log.info("Starting load balancer (Use ctrl-c to exit)")
     log.info("Maximum cluster size: %d" % self.max_nodes,
              extra=raw)
     log.info("Minimum cluster size: %d" % self.min_nodes,
              extra=raw)
     log.info("Cluster growth rate: %d nodes/iteration\n" %
              self.add_nodes_per_iteration, extra=raw)
     if self.dump_stats:
         log.info("Writing stats to file: %s" % self.stats_file)
     if self.plot_stats:
         log.info("Plotting stats to directory: %s" % self.plot_output_dir)
     while self._keep_polling:
         if not cluster.is_cluster_up():
             log.info("Waiting for all nodes to come up...")
             time.sleep(self.polling_interval)
             continue
         self.get_stats()
         log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
         log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                  extra=raw)
         oldest_queued_job_age = self.stat.oldest_queued_job_age()
         if oldest_queued_job_age:
             log.info("Oldest queued job: %s" % oldest_queued_job_age,
                      extra=raw)
         log.info("Avg job duration: %d secs" %
                  self.stat.avg_job_duration(), extra=raw)
         log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                  extra=raw)
         log.info("Last cluster modification time: %s" %
                  self.__last_cluster_mod_time.strftime("%Y-%m-%d %X%z"),
                  extra=dict(__raw__=True))
         # evaluate if nodes need to be added
         self._eval_add_node()
         # evaluate if nodes need to be removed
         self._eval_remove_node()
         if self.dump_stats or self.plot_stats:
             self.stat.write_stats_to_csv(self.stats_file)
         # call the visualizer
         if self.plot_stats:
             try:
                 self.visualizer.graph_all()
             except IOError as e:
                 raise exception.BaseException(str(e))
         # evaluate if cluster should be terminated
         if self.kill_cluster:
             if self._eval_terminate_cluster():
                 log.info("Terminating cluster and exiting...")
                 return self._cluster.terminate_cluster()
         log.info("Sleeping...(looping again in %d secs)\n" %
                  self.polling_interval)
         time.sleep(self.polling_interval)
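
Stripped of logging, validation, and plotting, run() boils down to a polling loop: wait until the cluster is up, collect SGE stats, decide whether to add or remove nodes, then sleep. A minimal skeleton of that loop; balancer and cluster (and the un-prefixed method names) stand in for the real SGELoadBalancer internals:

 import time

 def balance(balancer, cluster, interval=60):
     while balancer.keep_polling:
         if not cluster.is_cluster_up():
             time.sleep(interval)
             continue
         balancer.get_stats()
         balancer.eval_add_node()      # grow if jobs have queued too long
         balancer.eval_remove_node()   # shrink if hosts sit idle
         time.sleep(interval)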