Example #1
0
 def _eval_remove_node(self):
     """
     This function uses the sge stats to decide whether or not to
     remove a node from the cluster.
     """
     qlen = len(self.stat.get_queued_jobs())
     if qlen != 0:
         return
     if not self.has_cluster_stabilized():
         return
     num_nodes = len(self._cluster.nodes)
     if num_nodes <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)"
                  % self.min_nodes)
         return
     max_remove = num_nodes - self.min_nodes
     log.info("Looking for nodes to remove...")
     remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
     if not remove_nodes:
         log.info("No nodes can be removed at this time")
     for node in remove_nodes:
         if node.update() != "running":
             log.error("Node %s is already dead - not removing" %
                       node.alias)
             continue
         log.warn("Removing %s: %s (%s)" %
                  (node.alias, node.id, node.dns_name))
         try:
             self._cluster.remove_node(node)
             self.__last_cluster_mod_time = utils.get_utc_now()
         except Exception:
             log.error("Failed to remove node %s" % node.alias,
                       exc_info=True)
Example #2
0
 def _eval_add_node(self):
     """
     This function inspects the current state of the SGE queue and decides
     whether or not to add nodes to the cluster. Returns the number of nodes
     to add.
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum([int(j['slots']) for j in running_jobs])
     qw_slots = sum([int(j['slots']) for j in queued_jobs])
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum (%d)" % self.min_nodes)
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (age_delta.seconds, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 need_to_add = qw_slots / slots_per_host
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(utils.get_utc_now())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = utils.get_utc_now()
             log.info("Done adding nodes at %s" %
                      str(self.__last_cluster_mod_time))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
Example #3
0
 def setup_tmuxcc(self,
                  client=None,
                  nodes=None,
                  user='******',
                  layout='tiled'):
     log.info("Creating TMUX Control Center for user '%s'" % user)
     client = client or self._master
     nodes = nodes or self._nodes
     envname = self._envname
     orig_user = client.ssh._username
     if orig_user != user:
         client.ssh.connect(username=user)
     chunks = [chunk for chunk in utils.chunk_list(nodes, items=8)]
     num_windows = len(chunks) + len(nodes)
     if len(nodes) == 0:
         log.error("Cluster has no nodes, exiting...")
         return
     self.create_session(client, envname, num_windows=num_windows)
     if len(nodes) == 1 and client == nodes[0]:
         return
     if not self._supports_layout(client, envname, layout, window=0):
         log.warn("failed to select layout '%s', defaulting to "
                  "'main-vertical'" % layout)
         layout = "main-vertical"
         status = self._select_layout(client, envname, layout, window=0)
         if status != 0:
             raise exception.PluginError("failed to set a layout")
     for i, chunk in enumerate(chunks):
         self._rename_window(client, envname, i, 'all%s' % i)
         for j, node in enumerate(chunk):
             if j != 0:
                 self._split_window(client, envname, i)
             self._select_layout(client, envname, window=i, layout=layout)
             if node.alias != client.alias:
                 self._send_keys(client,
                                 envname,
                                 cmd='ssh %s' % node.alias,
                                 window="%d.%d" % (i, j))
     for i, node in enumerate(nodes):
         window = i + len(chunks)
         self._rename_window(client, envname, window, node.alias)
         if node.alias != client.alias:
             self._send_keys(client,
                             envname,
                             cmd='ssh %s' % node.alias,
                             window=window)
     self._select_window(client, envname, window=0)
     self._select_pane(client, envname, window=0, pane=0)
     if orig_user != user:
         client.ssh.connect(username=orig_user)
Example #4
0
 def get_settings_from_env(self, settings):
     """
     Returns AWS credentials defined in the user's shell
     environment.
     """
     found = {}
     for key in settings:
         if key.upper() in os.environ:
             log.warn("Setting '%s' from environment..." % key.upper())
             found[key] = os.environ.get(key.upper())
         elif key in os.environ:
             log.warn("Setting '%s' from environment..." % key)
             found[key] = os.environ.get(key)
     return found
Example #5
0
 def execute(self, args):
     if not args:
         cls = [
             c.cluster_tag for c in self.cm.get_clusters(load_plugins=False,
                                                         load_receipt=False)
         ]
         msg = "please specify a cluster"
         if cls:
             opts = ', '.join(cls)
             msg = " ".join([msg, '(options:', opts, ')'])
         self.parser.error(msg)
     for cluster_name in args:
         try:
             cl = self.cm.get_cluster(cluster_name)
         except exception.ClusterDoesNotExist:
             raise
         except Exception, e:
             log.debug("Failed to load cluster settings!", exc_info=True)
             log.error("Failed to load cluster settings!")
             if self.opts.force:
                 log.warn("Ignoring cluster settings due to --force option")
                 cl = self.cm.get_cluster(cluster_name,
                                          load_receipt=False,
                                          require_keys=False)
             else:
                 if not isinstance(e, exception.IncompatibleCluster):
                     log.error("Use -f to forcefully stop the cluster")
                 raise
         is_stoppable = cl.is_stoppable()
         if not is_stoppable:
             has_stoppable_nodes = cl.has_stoppable_nodes()
             if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                     "nodes. Your options are:\n\n"
                     "1. Use the --terminate-unstoppable option to "
                     "stop all 'stoppable' nodes and terminate all "
                     "'unstoppable' nodes\n\n"
                     "2. Use the 'terminate' command to destroy the "
                     "cluster.\n\nPass --help for more info." %
                     cluster_name)
             if not has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' does not contain any 'stoppable' nodes "
                     "and can only be terminated. Please use the "
                     "'terminate' command instead to destroy the cluster."
                     "\n\nPass --help for more info" % cluster_name)
         if not self.opts.confirm:
             resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
             if resp not in ['y', 'Y', 'yes']:
                 log.info("Aborting...")
                 continue
         cl.stop_cluster(self.opts.terminate_unstoppable,
                         force=self.opts.force)
         log.warn("All non-spot, EBS-backed nodes are now in a "
                  "'stopped' state")
         log.warn("You can restart this cluster by passing -x "
                  "to the 'start' command")
         log.warn("Use the 'terminate' command to *completely* "
                  "terminate this cluster")
Example #6
0
 def run(self, nodes, master, user, shell, volumes):
     mssh = master.ssh
     mssh.switch_user(user)
     botocfg = '/home/%s/.boto' % user
     if not mssh.path_exists(botocfg):
         log.info("Installing AWS credentials for user: %s" % user)
         if self.boto_cfg:
             log.info("Copying %s to %s" % (self.boto_cfg, botocfg))
             mssh.put(self.boto_cfg, botocfg)
         else:
             log.info("Installing current credentials to: %s" % botocfg)
             f = mssh.remote_file(botocfg, 'w')
             f.write(BOTO_CFG_TEMPLATE % master.ec2.__dict__)
             f.close()
         mssh.chmod(0400, botocfg)
     else:
         log.warn("AWS credentials already present - skipping install")
Example #7
0
 def terminate(self, cluster_name, force=False):
     if force:
         log.warn("Ignoring cluster settings due to --force option")
     try:
         cl = self.cm.get_cluster(cluster_name,
                                  load_receipt=not force,
                                  require_keys=not force)
         if force:
             self._terminate_manually(cl)
         else:
             self._terminate_cluster(cl)
     except exception.ClusterDoesNotExist:
         raise
     except Exception:
         log.error("Failed to terminate cluster!", exc_info=True)
         if not force:
             log.error("Use -f to forcefully terminate the cluster")
         raise
Example #8
0
 def _start_notebook(self, master, user, profile_dir):
     log.info("Setting up IPython web notebook for user: %s" % user)
     user_cert = posixpath.join(profile_dir, '%s.pem' % user)
     ssl_cert = posixpath.join(profile_dir, '%s.pem' % user)
     if not master.ssh.isfile(user_cert):
         log.info("Creating SSL certificate for user %s" % user)
         ssl_subj = "/C=US/ST=SC/L=STAR/O=Dis/CN=%s" % master.dns_name
         master.ssh.execute("openssl req -new -newkey rsa:4096 -days 365 "
                            '-nodes -x509 -subj %s -keyout %s -out %s' %
                            (ssl_subj, ssl_cert, ssl_cert))
     else:
         log.info("Using existing SSL certificate...")
     f = master.ssh.remote_file('%s/ipython_notebook_config.py' %
                                profile_dir)
     notebook_port = 8888
     sha1py = 'from IPython.lib import passwd; print passwd("%s")'
     sha1cmd = "python -c '%s'" % sha1py
     sha1pass = master.ssh.execute(sha1cmd % self.notebook_passwd)[0]
     f.write('\n'.join([
         "c = get_config()",
         "c.IPKernelApp.pylab = 'inline'",
         "c.NotebookApp.certfile = u'%s'" % ssl_cert,
         "c.NotebookApp.ip = '*'",
         "c.NotebookApp.open_browser = False",
         "c.NotebookApp.password = u'%s'" % sha1pass,
         "c.NotebookApp.port = %d" % notebook_port,
     ]))
     f.close()
     if self.notebook_directory is not None:
         if not master.ssh.path_exists(self.notebook_directory):
             master.ssh.makedirs(self.notebook_directory)
         master.ssh.execute_async(
             "ipython notebook --no-browser --notebook-dir='%s'" %
             self.notebook_directory)
     else:
         master.ssh.execute_async("ipython notebook --no-browser")
     self._authorize_port(master, notebook_port, 'notebook')
     log.info("IPython notebook URL: https://%s:%s" %
              (master.dns_name, notebook_port))
     log.info("The notebook password is: %s" % self.notebook_passwd)
     log.warn(
         "Please check your local firewall settings if you're having "
         "issues connecting to the IPython notebook",
         extra=dict(__textwrap__=True))
Example #9
0
 def _add_to_known_hosts(self, node):
     log.info("Configuring local known_hosts file")
     user_home = os.path.expanduser('~')
     khosts = os.path.join(user_home, '.ssh', 'known_hosts')
     if not os.path.isfile(khosts):
         log.warn("Unable to configure known_hosts: file does not exist")
         return
     contents = open(khosts).read()
     if node.dns_name not in contents:
         server_pkey = node.ssh.get_server_public_key()
         khostsf = open(khosts, 'a')
         if contents[-1] != '\n':
             khostsf.write('\n')
         name_entry = '%s,%s' % (node.dns_name, node.ip_address)
         khostsf.write(' '.join([
             name_entry,
             server_pkey.get_name(),
             base64.b64encode(str(server_pkey)), '\n'
         ]))
         khostsf.close()
Example #10
0
 def get_stats(self):
     """
     This method will ssh to the SGE master and get load & queue stats. It
     will feed these stats to SGEStats, which parses the XML. It will return
     two arrays: one of hosts, each host has a hash with its host
     information inside. The job array contains a hash for every job,
     containing statistics about the job name, priority, etc.
     """
     log.debug("starting get_stats")
     retries = 5
     for i in range(retries):
         try:
             return self._get_stats()
         except Exception:
             log.warn("Failed to retrieve stats (%d/%d):" %
                      (i + 1, retries), exc_info=True)
             log.warn("Retrying in %ds" % self.polling_interval)
             time.sleep(self.polling_interval)
     raise exception.BaseException(
         "Failed to retrieve SGE stats after trying %d times, exiting..." %
         retries)
Example #11
0
 def _create_image_from_ebs(self, size=15):
     log.info("Creating new EBS AMI...")
     imgid = self.ec2.create_image(self.host.id, self.name,
                                   self.description)
     img = self.ec2.get_image(imgid)
     log.info("New EBS AMI created: %s" % imgid)
     root_dev = self.host.root_device_name
     if root_dev in self.host.block_device_mapping:
         log.info("Fetching block device mapping for %s" % imgid,
                  extra=dict(__nonewline__=True))
         s = Spinner()
         try:
             s.start()
             while root_dev not in img.block_device_mapping:
                 img = self.ec2.get_image(imgid)
                 time.sleep(5)
         finally:
             s.stop()
         snapshot_id = img.block_device_mapping[root_dev].snapshot_id
         snap = self.ec2.get_snapshot(snapshot_id)
         self.ec2.wait_for_snapshot(snap)
     else:
         log.warn("Unable to find root device - cant wait for snapshot")
     log.info("Waiting for %s to become available..." % imgid,
              extra=dict(__nonewline__=True))
     s = Spinner()
     try:
         s.start()
         while img.state == "pending":
             time.sleep(15)
             if img.update() == "failed":
                 raise exception.AWSError(
                     "EBS image creation failed for %s" % imgid)
     finally:
         s.stop()
     return imgid
Example #12
0
 def warn_experimental(self, msg, num_secs=10):
     """
     Warn user that an experimental feature is being used
     Counts down from num_secs before continuing
     """
     sep = '*' * 60
     log.warn('\n'.join([sep, msg, sep]), extra=dict(__textwrap__=True))
     r = range(1, num_secs + 1)
     r.reverse()
     print
     log.warn("Waiting %d seconds before continuing..." % num_secs)
     log.warn("Press CTRL-C to cancel...")
     for i in r:
         sys.stdout.write('%d...' % i)
         sys.stdout.flush()
         time.sleep(1)
     print
Example #13
0
def warn_debug_file_moved():
    old_file = os.path.join(static.TMP_DIR, 'tethyscluster-debug-%s.log' %
                            static.CURRENT_USER)
    if os.path.exists(old_file):
        stars = '*' * 50
        log.warn(stars)
        log.warn("The default log file location is now:")
        log.warn("")
        log.warn(static.DEBUG_FILE)
        log.warn("")
        log.warn("Please delete or move the old log file located at:")
        log.warn("")
        log.warn(old_file)
        log.warn(stars)
Example #14
0
 def run(self, nodes, master, user, user_shell, volumes):
     log.info("Installing mysql-cluster-server on all nodes...")
     for node in nodes:
         self.pool.simple_job(self._install_mysql_cluster, (node),
                              jobid=node.alias)
     self.pool.wait(len(nodes))
     mconn = master.ssh
     mconn.execute('rm -f /usr/mysql-cluster/*')
     # Get IPs for all nodes
     self.mgm_ip = master.private_ip_address
     if not self._dedicated_query:
         self.storage_ips = [x.private_ip_address for x in nodes[1:]]
         self.query_ips = self.storage_ips
         self.data_nodes = nodes[1:]
         self.query_nodes = nodes
     else:
         self.data_nodes = nodes[1:self._num_data_nodes + 1]
         self.query_nodes = nodes[self._num_data_nodes + 1:]
         self.query_nodes.append(master)
         self.storage_ips = [x.private_ip_address for x in self.data_nodes]
         self.query_ips = [x.private_ip_address for x in self.query_nodes]
     # Create backup dir and change ownership of mysql-cluster dir
     log.info('Backing up and stopping all mysql processes on all nodes')
     for node in nodes:
         self.pool.simple_job(self._backup_and_reset, (node),
                              jobid=node.alias)
     self.pool.wait(len(nodes))
     # Generate and place ndb_mgmd configuration file
     log.info('Generating ndb_mgmd.cnf...')
     ndb_mgmd = mconn.remote_file('/etc/mysql/ndb_mgmd.cnf')
     ndb_mgmd.write(self.generate_ndb_mgmd())
     ndb_mgmd.close()
     # Generate and place my.cnf configuration file on each data node
     log.info('Generating my.cnf on all nodes')
     for node in nodes:
         self.pool.simple_job(self._write_my_cnf, (node), jobid=node.alias)
     self.pool.wait(len(nodes))
     # Restart mysql-ndb-mgm on master
     log.info('Restarting mysql-ndb-mgm on master node...')
     mconn.execute('/etc/init.d/mysql-ndb-mgm restart')
     # Start mysqld-ndb on data nodes
     log.info('Restarting mysql-ndb on all data nodes...')
     for node in self.data_nodes:
         self.pool.simple_job(node.ssh.execute,
                              ('/etc/init.d/mysql-ndb restart'),
                              jobid=node.alias)
     self.pool.wait(len(self.data_nodes))
     # Start mysql on query nodes
     log.info('Starting mysql on all query nodes')
     for node in self.query_nodes:
         self.pool.simple_job(node.ssh.execute,
                              ('/etc/init.d/mysql restart'),
                              dict(ignore_exit_status=True),
                              jobid=node.alias)
     self.pool.wait(len(self.query_nodes))
     # Import sql dump
     dump_file = self._dump_file
     dump_dir = '/mnt/mysql-cluster-backup'
     if posixpath.isabs(self._dump_file):
         dump_dir, dump_file = posixpath.split(self._dump_file)
     else:
         log.warn("%s is not an absolute path, defaulting to %s" %
                  (self._dump_file, posixpath.join(dump_dir, dump_file)))
     name, ext = posixpath.splitext(dump_file)
     sc_path = posixpath.join(dump_dir, name + '.sc' + ext)
     orig_path = posixpath.join(dump_dir, dump_file)
     if not mconn.isdir(dump_dir):
         log.info("Directory %s does not exist, creating..." % dump_dir)
         mconn.makedirs(dump_dir)
     if mconn.isfile(sc_path):
         mconn.execute('mysql < %s' % sc_path)
     elif mconn.isfile(orig_path):
         mconn.execute('mysql < %s' % orig_path)
     else:
         log.info('No dump file found, not importing.')
     log.info('Adding MySQL dump cronjob to master node')
     cronjob = self.generate_mysqldump_crontab(sc_path)
     mconn.remove_lines_from_file('/etc/crontab', '#tethyscluster-mysql')
     crontab_file = mconn.remote_file('/etc/crontab', 'a')
     crontab_file.write(cronjob)
     crontab_file.close()
     log.info('Management Node: %s' % master.alias)
     log.info('Data Nodes: \n%s' %
              '\n'.join([x.alias for x in self.data_nodes]))
     log.info('Query Nodes: \n%s' %
              '\n'.join([x.alias for x in self.query_nodes]))
 def _setup_ebs_volumes(self):
     """
     Mount EBS volumes, if specified in ~/.tethyscluster/config to /home
     """
     # setup /etc/fstab on master to use block device if specified
     master = self._master
     devices = master.get_device_map()
     for vol in self._volumes:
         vol = self._volumes[vol]
         vol_id = vol.get("volume_id")
         mount_path = vol.get('mount_path')
         device = vol.get("device")
         volume_partition = vol.get('partition')
         if not (vol_id and device and mount_path):
             log.error("missing required settings for vol %s" % vol)
             continue
         if device not in devices and device.startswith('/dev/sd'):
             # check for "correct" device in unpatched kernels
             device = device.replace('/dev/sd', '/dev/xvd')
             if device not in devices:
                 log.warn("Cannot find device %s for volume %s" %
                          (device, vol_id))
                 log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                 log.warn("This usually means there was a problem "
                          "attaching the EBS volume to the master node")
                 continue
         partitions = master.get_partition_map(device=device)
         if not volume_partition:
             if len(partitions) == 0:
                 volume_partition = device
             elif len(partitions) == 1:
                 volume_partition = partitions.popitem()[0]
             else:
                 log.error(
                     "volume has more than one partition, please specify "
                     "which partition to use (e.g. partition=0, "
                     "partition=1, etc.) in the volume's config")
                 continue
         elif volume_partition not in partitions:
             log.warn("Cannot find partition %s on volume %s" %
                      (volume_partition, vol_id))
             log.warn("Not mounting %s on %s" % (vol_id, mount_path))
             log.warn("This either means that the volume has not "
                      "been partitioned or that the partition "
                      "specified does not exist on the volume")
             continue
         log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
         mount_map = master.get_mount_map()
         if volume_partition in mount_map:
             path, fstype, options = mount_map.get(volume_partition)
             if path != mount_path:
                 log.error("Volume %s is mounted on %s, not on %s" %
                           (vol_id, path, mount_path))
             else:
                 log.info("Volume %s already mounted on %s...skipping" %
                          (vol_id, mount_path))
             continue
         master.mount_device(volume_partition, mount_path)