Example #1
 def _eval_remove_node(self):
     """
     This function uses the SGE stats to decide whether or not to
     remove a node from the cluster.
     """
     qlen = len(self.stat.get_queued_jobs())
     if qlen != 0:
         return
     if not self.has_cluster_stabilized():
         return
     num_nodes = len(self._cluster.nodes)
     if num_nodes <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)" %
                  self.min_nodes)
         return
     max_remove = num_nodes - self.min_nodes
     log.info("Looking for nodes to remove...")
     remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
     if not remove_nodes:
         log.info("No nodes can be removed at this time")
     for node in remove_nodes:
         if node.update() != "running":
             log.error("Node %s is already dead - not removing" %
                       node.alias)
             continue
         log.warn("Removing %s: %s (%s)" %
                  (node.alias, node.id, node.dns_name))
         try:
             self._cluster.remove_node(node)
             self.__last_cluster_mod_time = datetime.datetime.utcnow()
         except Exception:
             log.error("Failed to remove node %s" % node.alias,
                       exc_info=True)
Example #2
 def _create_image_from_ebs(self, size=15):
     log.info("Creating new EBS AMI...")
     imgid = self.ec2.create_image(self.host.id, self.name, self.description)
     img = self.ec2.get_image(imgid)
     log.info("New EBS AMI created: %s" % imgid)
     root_dev = self.host.root_device_name
     if root_dev in self.host.block_device_mapping:
         log.info("Fetching block device mapping for %s" % imgid, extra=dict(__nonewline__=True))
         s = Spinner()
         try:
             s.start()
             while root_dev not in img.block_device_mapping:
                 img = self.ec2.get_image(imgid)
                 time.sleep(5)
         finally:
             s.stop()
         snapshot_id = img.block_device_mapping[root_dev].snapshot_id
         snap = self.ec2.get_snapshot(snapshot_id)
         self.ec2.wait_for_snapshot(snap)
     else:
         log.warn("Unable to find root device - cant wait for snapshot")
     log.info("Waiting for %s to become available..." % imgid, extra=dict(__nonewline__=True))
     s = Spinner()
     try:
         s.start()
         while img.state == "pending":
             time.sleep(15)
             if img.update() == "failed":
                 raise exception.AWSError(
                     "EBS image creation failed for %s" % imgid)
     finally:
         s.stop()
     return imgid
Example #3
    def copy_remote_file_to_nodes(self, remote_file, nodes, dest=None):
        """
        Copies a remote file from this Node instance to another Node instance
        without passwordless ssh between the two.

        dest - path to store the data in on the node (defaults to remote_file)
        """
        if not dest:
            dest = remote_file
        rf = self.ssh.remote_file(remote_file, 'r')
        contents = rf.read()
        sts = rf.stat()
        mode = stat.S_IMODE(sts.st_mode)
        uid = sts.st_uid
        gid = sts.st_gid
        rf.close()
        for node in nodes:
            if self.id == node.id and remote_file == dest:
                log.warn("src and destination are the same: %s, skipping" %
                         remote_file)
                continue
            nrf = node.ssh.remote_file(dest, 'w')
            nrf.write(contents)
            nrf.chown(uid, gid)
            nrf.chmod(mode)
            nrf.close()
Example #4
 def execute(self, args):
     if not args:
         self.parser.error("please specify a cluster")
     for cluster_name in args:
         cl = self.cm.get_cluster(cluster_name)
         is_ebs = cl.is_ebs_cluster()
         if not self.opts.confirm:
             action = "Terminate"
             if is_ebs:
                 action = "Stop EBS"
                 if cl.spot_bid:
                     action = "Terminate Spot EBS"
             resp = raw_input("%s cluster %s (y/n)? " %
                              (action, cluster_name))
             if resp not in ['y', 'Y', 'yes']:
                 log.info("Aborting...")
                 continue
         cl.stop_cluster()
         if is_ebs and cl._nodes:
             log.warn(("All EBS-backed nodes in '%s' are now in a " + \
                       "'stopped' state") % cluster_name)
             log.warn("You can restart this cluster by passing -x " + \
                      "to the 'start' command")
             log.warn("Use the 'terminate' command to *completely* " + \
                      "terminate this cluster")
             log.warn("NOTE: Unless EBS-backed nodes are in a " + \
                      "'running' or 'terminated'")
             log.warn("state, you are charged for the EBS volumes " + \
                      "backing the nodes.")
Example #5
 def _eval_add_node(self):
     """
     This function inspects the current state of the SGE queue and decides
     whether or not to add nodes to the cluster. Returns the number of nodes
     to add.
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum([int(j['slots']) for j in running_jobs])
     qw_slots = sum([int(j['slots']) for j in queued_jobs])
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum (%d)" % self.min_nodes)
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (age_delta.seconds, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 need_to_add = qw_slots / slots_per_host
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(utils.get_utc_now())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = utils.get_utc_now()
             log.info("Done adding nodes at %s" %
                      str(self.__last_cluster_mod_time))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
Example #6
    def run(self, nodes, master, user, user_shell, volumes):
        self._master = master
        self._new_security_group = master.cluster_groups[0].id

        log.info("Configuring RAID")

        # TODO: do a suitable check for lvm2 instead of assuming it is needed
        needs_lvm2 = True
        if needs_lvm2:
            try:
                # disable periodic apt runs so they can't hold the dpkg lock
                master.ssh.execute(
                    "echo 'APT::Periodic::Enable \"0\";'"
                    " >> /etc/apt/apt.conf.d/10periodic")
            except Exception, e:
                log.warn(e)

            # Ubuntu 16 enables periodic apt-get runs by default, which can
            # hold the dpkg lock on boot:
            # https://github.com/geerlingguy/packer-ubuntu-1604/issues/3#issue-154560190
            try:
                log.info("killing any running apt-get")
                master.ssh.execute("killall apt-get")
                master.ssh.execute("dpkg --configure -a")
                master.ssh.execute("apt-get update")
                master.ssh.execute("apt-get -y upgrade")
                log.info("clean kill")
            except Exception, e:
                log.info("not a clean kill")
                log.warn(e)
Example #7
 def load(self):
     """
     Populate this config object from the StarCluster config
     """
     log.debug('Loading config')
     try:
         self.globals = self._load_section('global', self.global_settings)
     except exception.ConfigSectionMissing:
         pass
     try:
         self.aws = self._load_section('aws info', self.aws_settings)
     except exception.ConfigSectionMissing:
         log.warn("No [aws info] section found in the config!")
     self.aws.update(self.get_settings_from_env(self.aws_settings))
     self.keys = self._load_sections('key', self.key_settings)
     self.vols = self._load_sections('volume', self.volume_settings)
     self.vols.update(self._load_sections('vol', self.volume_settings))
     self.plugins = self._load_sections('plugin',
                                        self.plugin_settings,
                                        filter_settings=False)
     self.permissions = self._load_sections('permission',
                                            self.permission_settings)
     sections = self._get_sections('cluster')
     self.clusters = self._load_cluster_sections(sections)
     return self
Example #8
    def copy_remote_file_to_nodes(self, remote_file, nodes, dest=None):
        """
        Copies a remote file from this Node instance to another Node instance
        without passwordless ssh between the two.

        dest - path to store the data in on the node (defaults to remote_file)
        """
        if not dest:
            dest = remote_file
        rf = self.ssh.remote_file(remote_file, 'r')
        sts = rf.stat()
        mode = stat.S_IMODE(sts.st_mode)
        uid = sts.st_uid
        gid = sts.st_gid
        rf.close()
        with tempfile.NamedTemporaryFile(
                prefix=os.path.basename(remote_file) + "_") as f:
            self.ssh.get(remote_file, f.name)
            for node in nodes:
                if self.id == node.id and remote_file == dest:
                    log.warn("src and destination are the same: %s, skipping" %
                             remote_file)
                    continue
                node.ssh.put(f.name, dest)
                nrf = node.ssh.remote_file(dest, 'a')
                nrf.chown(uid, gid)
                nrf.chmod(mode)
                nrf.close()
Example #9
 def _eval_add_node(self):
     """
     This function inspects the current state of the SGE queue and decides
     whether or not to add nodes to the cluster. Returns the number of nodes
     to add.
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum([int(j['slots']) for j in running_jobs])
     qw_slots = sum([int(j['slots']) for j in queued_jobs])
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum ({:})".format(self.min_nodes))
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (age_delta.seconds, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 need_to_add = qw_slots / slots_per_host
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(datetime.datetime.utcnow())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = datetime.datetime.utcnow()
             log.info("Done adding nodes at %s" %
                      str(datetime.datetime.utcnow()))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
Example #11
 def _eval_remove_node(self):
     """
     This function uses the SGE stats to decide whether or not to
     remove a node from the cluster.
     """
     qlen = len(self.stat.get_queued_jobs())
     if qlen != 0:
         return
     if not self.has_cluster_stabilized():
         return
     num_nodes = len(self._cluster.nodes)
     if num_nodes <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)"
                  % self.min_nodes)
         return
     max_remove = num_nodes - self.min_nodes
     log.info("Looking for nodes to remove...")
     remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
     if not remove_nodes:
         log.info("No nodes can be removed at this time")
     for node in remove_nodes:
         if node.update() != "running":
             log.error("Node %s is already dead - not removing" %
                       node.alias)
             continue
         log.warn("Removing %s: %s (%s)" %
                  (node.alias, node.id, node.dns_name))
         try:
             self._cluster.remove_node(node)
             self.__last_cluster_mod_time = utils.get_utc_now()
         except Exception:
             log.error("Failed to remove node %s" % node.alias,
                       exc_info=True)
Example #12
 def _validate_zone(self, zone):
     z = self.ec2.get_zone_or_none(zone)
     if not z:
         raise exception.ValidationError(
             'zone %s does not exist' % zone)
     if z.state != 'available':
         log.warn('zone %s is not available at this time' % zone)
     return True
Example #13
 def _get_ipcluster_plugin(self, node):
     ipyversion = self._get_ipy_version(node)
     if ipyversion < '0.11':
         if not ipyversion.startswith('0.10'):
             log.warn("Trying unsupported IPython version %s" % ipyversion)
         return IPCluster10()
     else:
         return IPCluster11(self.enable_notebook, self.notebook_passwd)
Example #15
 def _validate_zone(self):
     availability_zone = self.availability_zone
     if availability_zone:
         zone = self.ec2.get_zone(availability_zone)
         if not zone:
             raise exception.ClusterValidationError(
                 "availability_zone = %s does not exist" %
                 availability_zone)
         if zone.state != "available":
             log.warn("The availability_zone = %s is not available "
                      "at this time" % availability_zone)
     return True
Example #16
 def _has_all_required_settings(self):
     has_all_required = True
     for opt in self.__cluster_settings:
         requirements = self.__cluster_settings[opt]
         name = opt
         required = requirements[1]
         if required and self.get(name.lower()) is None:
             log.warn("Missing required setting %s" % name)
             has_all_required = False
     return has_all_required
Example #17
    def __init__(self, host, username=None, password=None, private_key=None,
                 private_key_pass=None, port=22, timeout=30):
        self._timeout = timeout
        self._sftp_live = False
        self._sftp = None
        if not username:
            username = os.environ['LOGNAME']

        # Log to a temporary file.
        templog = tempfile.mkstemp('.txt', 'ssh-')[1]
        paramiko.util.log_to_file(templog)

        # Begin the SSH transport.
        self._transport_live = False
        try:
            sock = self._get_socket(host, port)
            self._transport = paramiko.Transport(sock)
            self._transport.banner_timeout = self._timeout
        except socket.error:
            raise exception.SSHConnectionError(host, port)
        self._transport_live = True
        # Authenticate the transport.
        if password:
            # Using password.
            try:
                self._transport.connect(username=username, password=password)
            except paramiko.AuthenticationException:
                raise exception.SSHAuthException(username, host)
        elif private_key:
            # Using private key.
            pkey = None
            log.debug('private key specified')
            if private_key.endswith('rsa') or private_key.count('rsa'):
                pkey = self._load_rsa_key(private_key, private_key_pass)
            elif private_key.endswith('dsa') or private_key.count('dsa'):
                pkey = self._load_dsa_key(private_key, private_key_pass)
            else:
                log.warn("specified key does not end in either rsa or dsa, "
                         "trying both")
                pkey = self._load_rsa_key(private_key, private_key_pass)
                if pkey is None:
                    pkey = self._load_dsa_key(private_key, private_key_pass)
            try:
                self._transport.connect(username=username, pkey=pkey)
            except paramiko.AuthenticationException:
                raise exception.SSHAuthException(username, host)
            except paramiko.SSHException, e:
                msg = e.args[0]
                raise exception.SSHError(msg)
Example #18
    def _eval_add_node(self):
        """
        This function uses the metrics available to it to decide whether to
        add a new node to the cluster or not. It isn't able to add a node yet.
        TODO: See if the recent jobs have taken more than 5 minutes (how
        long it takes to start an instance)
        """
        if len(self.stat.hosts) >= self.max_nodes:
            log.info("Not adding nodes: already at or above maximum (%d)" %
                     self.max_nodes)
            return
        need_to_add = 0
        total_slots_required = 0
        qjobs = self.stat.get_queued_jobs()
        for q in qjobs:
            total_slots_required = total_slots_required + int(q['slots'])
        qlen = len(self.stat.get_queued_jobs())
        sph = self.stat.slots_per_host()
        ts = self.stat.count_total_slots()
        num_exec_hosts = len(self.stat.hosts)
        # calculate estimated time to completion
        ettc = 0
        if num_exec_hosts > 0:
            # calculate job duration
            avg_duration = self.stat.avg_job_duration()
            ettc = avg_duration * total_slots_required / num_exec_hosts
        if total_slots_required > ts:
            if not self.has_cluster_stabilized():
                return
            # there are more jobs queued than will be consumed with one
            # cycle of job processing from all nodes
            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            if age_delta.seconds > self.longest_allowed_queue_time:
                log.info("A job has been waiting for %d sec, longer than "
                         "max %d" % (age_delta.seconds,
                                     self.longest_allowed_queue_time))
                need_to_add = total_slots_required / sph if sph != 0 else 1
                if 0 < ettc < 600 and not self.stat.on_first_job():
                    log.warn("There is a possibility that the job queue is"
                             " shorter than 10 minutes in duration")
        max_add = self.max_nodes - len(self._cluster.running_nodes)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add > 0:
            log.warn("Adding %d nodes at %s" %
                     (need_to_add, str(datetime.datetime.utcnow())))
            try:
                self._cluster.add_nodes(need_to_add)
                self.__last_cluster_mod_time = datetime.datetime.utcnow()
                log.info("Done adding nodes at %s" %
                         str(datetime.datetime.utcnow()))
            except Exception:
                log.error("Failed to add new host")
                log.debug(traceback.format_exc())
Example #19
 def _eval_add_node(self):
     """
     This function uses the metrics available to it to decide whether to
     add a new node to the cluster or not. It isn't able to add a node yet.
     TODO: See if the recent jobs have taken more than 5 minutes (how
     long it takes to start an instance)
     """
     if len(self.stat.hosts) >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     need_to_add = 0
     total_slots_required = 0
     qjobs = self.stat.get_queued_jobs()
     for q in qjobs:
         total_slots_required = total_slots_required + int(q['slots'])
     qlen = len(self.stat.get_queued_jobs())
     sph = self.stat.slots_per_host()
     ts = self.stat.count_total_slots()
     num_exec_hosts = len(self.stat.hosts)
     # calculate estimated time to completion
     ettc = 0
     if num_exec_hosts > 0:
         # calculate job duration
         avg_duration = self.stat.avg_job_duration()
         ettc = avg_duration * total_slots_required / num_exec_hosts
     if total_slots_required > ts:
         if not self.has_cluster_stabilized():
             return
         # there are more jobs queued than will be consumed with one
         # cycle of job processing from all nodes
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d sec, longer than "
                      "max %d" %
                      (age_delta.seconds, self.longest_allowed_queue_time))
             need_to_add = total_slots_required / sph if sph != 0 else 1
             if 0 < ettc < 600 and not self.stat.on_first_job():
                 log.warn("There is a possibility that the job queue is"
                          " shorter than 10 minutes in duration")
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(datetime.datetime.utcnow())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = datetime.datetime.utcnow()
             log.info("Done adding nodes at %s" %
                      str(datetime.datetime.utcnow()))
         except Exception:
             log.error("Failed to add new host")
             log.debug(traceback.format_exc())
Example #20
 def _eval_add_node(self):
     """
     This function uses the metrics available to it to decide whether to
     add a new node to the cluster or not. It isn't able to add a node yet.
     TODO: See if the recent jobs have taken more than 5 minutes (how
     long it takes to start an instance)
     """
     need_to_add = 0
     if len(self.stat.hosts) >= self.max_nodes:
         log.info("Won't add another host, currently at max (%d)." % \
                self.max_nodes)
         return 0
     qlen = len(self.stat.get_queued_jobs())
     sph = self.stat.slots_per_host()
     ts = self.stat.count_total_slots()
     # calculate job duration
     avg_duration = self.stat.avg_job_duration()
     # calculate estimated time to completion (guard against an empty host
     # list, as the later revisions of this method do)
     num_exec_hosts = len(self.stat.hosts)
     ettc = avg_duration * qlen / num_exec_hosts if num_exec_hosts else 0
     if qlen > ts:
         now = datetime.datetime.utcnow()
         if ((now - self.__last_cluster_mod_time).seconds <
                 self.stabilization_time):
             log.info(
                 "Cluster change made less than %d seconds ago (%s)." %
                 (self.stabilization_time, self.__last_cluster_mod_time))
             log.info("Not changing cluster size until cluster stabilizes.")
             return 0
         # there are more jobs queued than will be consumed with one
         # cycle of job processing from all nodes
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d sec, longer than " \
                      "max %d." % (age_delta.seconds,
                         self.longest_allowed_queue_time))
             need_to_add = qlen / sph
             if ettc < 600 and not self.stat.on_first_job():
                 log.warn("There is a possibility that the job queue is" + \
                          " shorter than 10 minutes in duration.")
                 #need_to_add = 0
     if need_to_add > 0:
         need_to_add = min(self.add_nodes_per_iteration, need_to_add)
         log.info("*** ADDING %d NODES." % need_to_add)
         try:
             self._cluster.add_nodes(need_to_add)
         except Exception:
             log.error("Failed to add new host.")
             log.debug(traceback.format_exc())
             return -1
         self.__last_cluster_mod_time = datetime.datetime.utcnow()
         log.info("Done adding nodes.")
     return need_to_add
Example #21
 def shell(self,
           user=None,
           forward_x11=False,
           forward_agent=False,
           command=None):
     """
     Attempts to launch an interactive shell by first trying the system's
     ssh client. If the system does not have the ssh command it falls back
     to a pure-python ssh shell.
     """
     if self.update() != 'running':
         try:
             alias = self.alias
         except exception.BaseException:
             alias = None
         label = 'instance'
         if alias == "master":
             label = "master"
             alias = "node"
         elif alias:
             label = "node"
         instance_id = alias or self.id
         raise exception.InstanceNotRunning(instance_id,
                                            self.state,
                                            label=label)
     user = user or self.user
     if utils.has_required(['ssh']):
         log.debug("Using native OpenSSH client")
         sshopts = '-i %s' % self.key_location
         if forward_x11:
             sshopts += ' -Y'
         if forward_agent:
             sshopts += ' -A'
         ssh_cmd = static.SSH_TEMPLATE % dict(
             opts=sshopts, user=user, host=self.dns_name)
         if command:
             command = "'source /etc/profile && %s'" % command
             ssh_cmd = ' '.join([ssh_cmd, command])
         log.debug("ssh_cmd: %s" % ssh_cmd)
         return subprocess.call(ssh_cmd, shell=True)
     else:
         log.debug("Using Pure-Python SSH client")
         if forward_x11:
             log.warn("X11 Forwarding not available in Python SSH client")
         if forward_agent:
             log.warn("Authentication agent forwarding not available in " +
                      "Python SSH client")
         if command:
             orig_user = self.ssh.get_current_user()
             self.ssh.switch_user(user)
             self.ssh.execute(command, silent=False)
             self.ssh.switch_user(orig_user)
             return self.ssh.get_last_status()
         self.ssh.interactive_shell(user=user)
Example #22
 def _eval_add_node(self):
     """
     This function uses the metrics available to it to decide whether to
     add a new node to the cluster or not. It isn't able to add a node yet.
     TODO: See if the recent jobs have taken more than 5 minutes (how
     long it takes to start an instance)
     """
     need_to_add = 0
     if len(self.stat.hosts) >= self.max_nodes:
         log.info("Won't add another host, currently at max (%d)." % \
                self.max_nodes)
         return 0
     qlen = len(self.stat.get_queued_jobs())
     sph = self.stat.slots_per_host()
     ts = self.stat.count_total_slots()
     # calculate job duration
     avg_duration = self.stat.avg_job_duration()
     # calculate estimated time to completion (guard against an empty host
     # list, as the later revisions of this method do)
     num_exec_hosts = len(self.stat.hosts)
     ettc = avg_duration * qlen / num_exec_hosts if num_exec_hosts else 0
     if qlen > ts:
         now = datetime.datetime.utcnow()
         if ((now - self.__last_cluster_mod_time).seconds <
                 self.stabilization_time):
             log.info("Cluster change made less than %d seconds ago (%s)." %
                      (self.stabilization_time,
                       self.__last_cluster_mod_time))
             log.info("Not changing cluster size until cluster stabilizes.")
             return 0
         # there are more jobs queued than will be consumed with one
         # cycle of job processing from all nodes
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d sec, longer than " \
                      "max %d." % (age_delta.seconds,
                         self.longest_allowed_queue_time))
             need_to_add = qlen / sph
             if ettc < 600 and not self.stat.on_first_job():
                 log.warn("There is a possibility that the job queue is" + \
                          " shorter than 10 minutes in duration.")
                 #need_to_add = 0
     if need_to_add > 0:
         need_to_add = min(self.add_nodes_per_iteration, need_to_add)
         log.info("*** ADDING %d NODES." % need_to_add)
         try:
             self._cluster.add_nodes(need_to_add)
         except Exception:
             log.error("Failed to add new host.")
             log.debug(traceback.format_exc())
             return -1
         self.__last_cluster_mod_time = datetime.datetime.utcnow()
         log.info("Done adding nodes.")
     return need_to_add
Example #23
 def get_qatime(self, now):
     """
     This function takes the lookback window and creates a string
     representation of the past few hours, to feed to qacct to
     limit the data set qacct returns.
     """
     if self.lookback_window > 24 or self.lookback_window < 1:
         log.warn("Lookback window %d out of range (1-24). Not recommended."
                  % self.lookback_window)
     now = now - datetime.timedelta(hours=self.lookback_window)
     return now.strftime("%Y%m%d%H%M")
Example #24
 def get_qatime(self, now):
     """
     This function takes the lookback window and creates a string
     representation of the past few hours, to feed to qacct to
     limit the data set qacct returns.
     """
     if self.lookback_window > 24 or self.lookback_window < 1:
         log.warn(
             "Lookback window %d out of range (1-24). Not recommended." %
             self.lookback_window)
     now = now - datetime.timedelta(hours=self.lookback_window)
     return now.strftime("%Y%m%d%H%M")
Example #25
 def get_settings_from_env(self, settings):
     """
     Returns AWS credentials defined in the user's shell
     environment.
     """
     found = {}
     for key in settings:
         if key.upper() in os.environ:
             log.warn("Setting '%s' from environment..." % key.upper())
             found[key] = os.environ.get(key.upper())
         elif key in os.environ:
             log.warn("Setting '%s' from environment..." % key)
             found[key] = os.environ.get(key)
     return found
Example #26
 def root_device_name(self):
     root_dev = self.instance.root_device_name
     bmap = self.block_device_mapping
     if bmap and root_dev not in bmap and self.is_ebs_backed():
         # Hack for misconfigured AMIs (e.g. CentOS 6.3 Marketplace) These
         # AMIs have root device name set to /dev/sda1 but no /dev/sda1 in
         # block device map - only /dev/sda. These AMIs somehow magically
         # work so check if /dev/sda exists and return that instead to
         # prevent detach_external_volumes() from trying to detach the root
         # volume on these AMIs.
         log.warn("Root device %s is not in the block device map" %
                  root_dev)
         log.warn("This means the AMI was registered with either "
                  "an incorrect root device name or an incorrect block "
                  "device mapping")
         sda, sda1 = '/dev/sda', '/dev/sda1'
         if root_dev == sda1:
             log.info("Searching for possible root device: %s" % sda)
             if sda in self.block_device_mapping:
                 log.warn("Found '%s' - assuming its the real root device" %
                          sda)
                 root_dev = sda
             else:
                 log.warn("Device %s isn't in the block device map either" %
                          sda)
     return root_dev
Example #27
 def shell(self, user=None, forward_x11=False, forward_agent=False,
           command=None):
     """
     Attempts to launch an interactive shell by first trying the system's
     ssh client. If the system does not have the ssh command it falls back
     to a pure-python ssh shell.
     """
     if self.update() != 'running':
         try:
             alias = self.alias
         except exception.BaseException:
             alias = None
         label = 'instance'
         if alias == "master":
             label = "master"
             alias = "node"
         elif alias:
             label = "node"
         instance_id = alias or self.id
         raise exception.InstanceNotRunning(instance_id, self.state,
                                            label=label)
     user = user or self.user
     if utils.has_required(['ssh']):
         log.debug("Using native OpenSSH client")
         sshopts = '-i %s' % self.key_location
         if forward_x11:
             sshopts += ' -Y'
         if forward_agent:
             sshopts += ' -A'
         ssh_cmd = static.SSH_TEMPLATE % dict(opts=sshopts, user=user,
                                              host=self.dns_name)
         if command:
             command = "'source /etc/profile && %s'" % command
             ssh_cmd = ' '.join([ssh_cmd, command])
         log.debug("ssh_cmd: %s" % ssh_cmd)
         return subprocess.call(ssh_cmd, shell=True)
     else:
         log.debug("Using Pure-Python SSH client")
         if forward_x11:
             log.warn("X11 Forwarding not available in Python SSH client")
         if forward_agent:
             log.warn("Authentication agent forwarding not available in " +
                      "Python SSH client")
         if command:
             orig_user = self.ssh.get_current_user()
             self.ssh.switch_user(user)
             self.ssh.execute(command, silent=False)
             self.ssh.switch_user(orig_user)
             return self.ssh.get_last_status()
         self.ssh.interactive_shell(user=user)
Example #28
 def setup_tmuxcc(self,
                  client=None,
                  nodes=None,
                  user='******',
                  layout='tiled'):
     log.info("Creating TMUX Control Center for user '%s'" % user)
     client = client or self._master
     nodes = nodes or self._nodes
     envname = self._envname
     orig_user = client.ssh._username
     if orig_user != user:
         client.ssh.connect(username=user)
     chunks = [chunk for chunk in utils.chunk_list(nodes, items=8)]
     num_windows = len(chunks) + len(nodes)
     if len(nodes) == 0:
         log.error("Cluster has no nodes, exiting...")
         return
     self.create_session(client, envname, num_windows=num_windows)
     if len(nodes) == 1 and client == nodes[0]:
         return
     if not self._supports_layout(client, envname, layout, window=0):
         log.warn("failed to select layout '%s', defaulting to "
                  "'main-vertical'" % layout)
         layout = "main-vertical"
         status = self._select_layout(client, envname, layout, window=0)
         if status != 0:
             raise exception.PluginError("failed to set a layout")
     for i, chunk in enumerate(chunks):
         self._rename_window(client, envname, i, 'all%s' % i)
         for j, node in enumerate(chunk):
             if j != 0:
                 self._split_window(client, envname, i)
             self._select_layout(client, envname, window=i, layout=layout)
             if node.alias != client.alias:
                 self._send_keys(client,
                                 envname,
                                 cmd='ssh %s' % node.alias,
                                 window="%d.%d" % (i, j))
     for i, node in enumerate(nodes):
         window = i + len(chunks)
         self._rename_window(client, envname, window, node.alias)
         if node.alias != client.alias:
             self._send_keys(client,
                             envname,
                             cmd='ssh %s' % node.alias,
                             window=window)
     self._select_window(client, envname, window=0)
     self._select_pane(client, envname, window=0, pane=0)
     if orig_user != user:
         client.ssh.connect(username=orig_user)
Example #29
 def execute(self, args):
     if not args:
         cls = [
             c.cluster_tag for c in self.cm.get_clusters(load_plugins=False,
                                                         load_receipt=False)
         ]
         msg = "please specify a cluster"
         if cls:
             opts = ', '.join(cls)
             msg = "%s (options: %s)" % (msg, opts)
         self.parser.error(msg)
     for cluster_name in args:
         try:
             cl = self.cm.get_cluster(cluster_name)
         except exception.ClusterDoesNotExist:
             raise
         except Exception, e:
             log.debug("Failed to load cluster settings!", exc_info=True)
             log.error("Failed to load cluster settings!")
             if self.opts.force:
                 log.warn("Ignoring cluster settings due to --force option")
                 cl = self.cm.get_cluster(cluster_name,
                                          load_receipt=False,
                                          require_keys=False)
             else:
                 if not isinstance(e, exception.IncompatibleCluster):
                     log.error("Use -f to forcefully stop the cluster")
                 raise
         is_stoppable = cl.is_stoppable()
         if not is_stoppable:
             has_stoppable_nodes = cl.has_stoppable_nodes()
             if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                     "nodes. Your options are:\n\n"
                     "1. Use the --terminate-unstoppable option to "
                     "stop all 'stoppable' nodes and terminate all "
                     "'unstoppable' nodes\n\n"
                     "2. Use the 'terminate' command to destroy the "
                     "cluster.\n\nPass --help for more info." %
                     cluster_name)
             if not has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' does not contain any 'stoppable' nodes "
                     "and can only be terminated. Please use the "
                     "'terminate' command instead to destroy the cluster."
                     "\n\nPass --help for more info" % cluster_name)
         if not self.opts.confirm:
             resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
             if resp not in ['y', 'Y', 'yes']:
                 log.info("Aborting...")
                 continue
         cl.stop_cluster(self.opts.terminate_unstoppable,
                         force=self.opts.force)
         log.warn("All non-spot, EBS-backed nodes are now in a "
                  "'stopped' state")
         log.warn("You can restart this cluster by passing -x "
                  "to the 'start' command")
         log.warn("Use the 'terminate' command to *completely* "
                  "terminate this cluster")
Example #31
 def _warn_about_volume_hosts(self):
     sg = self.cluster_group
     vol_hosts = filter(lambda x: x.state in ['running', 'pending'],
                        sg.instances())
     vol_hosts = map(lambda x: x.id, vol_hosts)
     if vol_hosts:
         log.warn("There are still volume hosts running: %s" % \
                  ', '.join(vol_hosts))
         log.warn(("Run 'starcluster terminate %s' to terminate *all* " + \
                  "volume host instances once they're no longer needed") % \
                  static.VOLUME_GROUP_NAME)
     else:
         log.info("No active volume hosts found. Run 'starcluster " + \
                  "terminate %(g)s' to remove the '%(g)s' group" % \
                  {'g': static.VOLUME_GROUP_NAME})
Example #32
 def _add_to_known_hosts(self, node):
     log.info("Configuring local known_hosts file")
     user_home = os.path.expanduser("~")
     khosts = os.path.join(user_home, ".ssh", "known_hosts")
     if not os.path.isfile(khosts):
         log.warn("Unable to configure known_hosts: file does not exist")
         return
     contents = open(khosts).read()
     if node.dns_name not in contents:
         server_pkey = node.ssh.get_server_public_key()
         khostsf = open(khosts, "a")
         if contents and not contents.endswith("\n"):
             khostsf.write("\n")
         name_entry = "%s,%s" % (node.dns_name, node.ip_address)
         khostsf.write(" ".join([name_entry, server_pkey.get_name(),
                                 base64.b64encode(str(server_pkey)), "\n"]))
         khostsf.close()
Example #33
 def terminate(self, cluster_name, force=False):
     if force:
         log.warn("Ignoring cluster settings due to --force option")
     try:
         cl = self.cm.get_cluster(cluster_name, load_receipt=not force,
                                  require_keys=not force)
         if force:
             self._terminate_manually(cl)
         else:
             self._terminate_cluster(cl)
     except exception.ClusterDoesNotExist:
         raise
     except Exception:
         log.error("Failed to terminate cluster!", exc_info=True)
         if not force:
             log.error("Use -f to forcefully terminate the cluster")
         raise
Example #34
 def remove_image_files(self, image_name, pretend=True):
     if pretend:
         log.info("Pretending to remove image files...")
     else:
         log.info('Removing image files...')
     files = self.get_image_files(image_name)
     for f in files:
         if pretend:
             log.info("Would remove file: %s" % f.name)
         else:
             log.info('Removing file %s' % f.name)
             f.delete()
     if not pretend:
         files = self.get_image_files(image_name)
         if len(files) != 0:
             log.warn('Not all files deleted, recursing...')
             self.remove_image_files(image_name, pretend)
Example #35
 def __init__(self, interval=60, plot=False, max_nodes=5, wait_time=900,
              add_pi=1, kill_after=45, stab=180, lookback_win=3,
              min_nodes=1):
     self._cluster = None
     self.polling_interval = interval
     self._visualizer_on = plot
     self.max_nodes = max_nodes
     self.longest_allowed_queue_time = wait_time
     self.add_nodes_per_iteration = add_pi
     self.kill_after = kill_after
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.min_nodes = min_nodes
     self.allow_master_kill = False
     if self.longest_allowed_queue_time < 300:
         log.warn("wait_time should be >= 300 seconds " + \
                  "(it takes ~5 min to launch a new EC2 node)")
Example #36
 def run(self, nodes, master, user, shell, volumes):
     mssh = master.ssh
     mssh.switch_user(user)
     botocfg = '/home/%s/.boto' % user
     if not mssh.path_exists(botocfg):
         log.info("Installing AWS credentials for user: %s" % user)
         if self.boto_cfg:
             log.info("Copying %s to %s" % (self.boto_cfg, botocfg))
             mssh.put(self.boto_cfg, botocfg)
         else:
             log.info("Installing current credentials to: %s" % botocfg)
             f = mssh.remote_file(botocfg, 'w')
             f.write(BOTO_CFG_TEMPLATE % master.ec2.__dict__)
             f.close()
         mssh.chmod(0400, botocfg)
     else:
         log.warn("AWS credentials already present - skipping install")
Example #37
 def _warn_about_volume_hosts(self):
     sg = self.ec2.get_group_or_none(static.VOLUME_GROUP)
     if not sg:
         return
     vol_hosts = filter(lambda x: x.state in ['running', 'pending'],
                        sg.instances())
     vol_hosts = map(lambda x: x.id, vol_hosts)
     if vol_hosts:
         log.warn("There are still volume hosts running: %s" % \
                  ', '.join(vol_hosts))
         log.warn(("Run 'starcluster terminate %s' to terminate *all* " + \
                  "volume host instances once they're no longer needed") % \
                  static.VOLUME_GROUP_NAME)
     else:
         log.info("No active volume hosts found. Run 'starcluster " + \
                  "terminate %(g)s' to remove the '%(g)s' group" % \
                  {'g': static.VOLUME_GROUP_NAME})
Example #39
 def _install_efs_on_node(self, node):
     if not node.ssh.path_exists(self.mount_point):
         node.ssh.makedirs(self.mount_point, mode=0777)
     zone = node.ssh.execute('ec2metadata --availability-zone')[0]
     region = zone[:-1]
     name_parts = [zone, self.fs_id, 'efs', region, 'amazonaws', 'com']
     efs_dns = '.'.join(name_parts)
     mount_info = node.ssh.execute('grep %s /proc/mounts' %
                                   self.mount_point, raise_on_failure=False,
                                   ignore_exit_status=True)
     cmd = 'mount -t nfs4 -ominorversion=1 %s:/ %s' % (efs_dns,
                                                       self.mount_point)
     if mount_info:
         log.warn('%s is already a mount point' % self.mount_point)
         log.info(mount_info[0])
     else:
         node.ssh.execute(cmd)
Example #40
 def _add_to_known_hosts(self, node):
     log.info("Configuring local known_hosts file")
     user_home = os.path.expanduser('~')
     khosts = os.path.join(user_home, '.ssh', 'known_hosts')
     if not os.path.isfile(khosts):
         log.warn("Unable to configure known_hosts: file does not exist")
         return
     contents = open(khosts).read()
     if node.dns_name not in contents:
         server_pkey = node.ssh.get_server_public_key()
         khostsf = open(khosts, 'a')
         if contents and not contents.endswith('\n'):
             khostsf.write('\n')
         name_entry = '%s,%s' % (node.dns_name, node.ip_address)
         khostsf.write(' '.join([name_entry, server_pkey.get_name(),
                                 base64.b64encode(str(server_pkey)), '\n']))
         khostsf.close()
Example #41
 def terminate(self, cluster_name, force=False):
     if force:
         log.warn("Ignoring cluster settings due to --force option")
     try:
         cl = self.cm.get_cluster(cluster_name,
                                  load_receipt=not force,
                                  require_keys=not force)
         if force:
             self._terminate_manually(cl)
         else:
             self._terminate_cluster(cl)
     except exception.ClusterDoesNotExist:
         raise
     except Exception:
         log.error("Failed to terminate cluster!", exc_info=True)
         if not force:
             log.error("Use -f to forcefully terminate the cluster")
         raise
Example #42
 def get_stats(self):
     """
     This method will ssh to the SGE master and get load & queue stats. It
     will feed these stats to SGEStats, which parses the XML. It will return
     two arrays: one of hosts, each host has a hash with its host
     information inside. The job array contains a hash for every job,
     containing statistics about the job name, priority, etc.
     """
     log.debug("starting get_stats")
     retries = 5
     for i in range(retries):
         try:
             return self._get_stats()
         except Exception:
             log.warn("Failed to retrieve stats (%d/%d):" % (i + 1, retries), exc_info=True)
             log.warn("Retrying in %ds" % self.polling_interval)
             time.sleep(self.polling_interval)
     raise exception.BaseException("Failed to retrieve SGE stats after trying %d times, exiting..." % retries)
Example #43
 def _start_notebook(self, master, user, profile_dir):
     log.info("Setting up IPython web notebook for user: %s" % user)
     user_cert = posixpath.join(profile_dir, '%s.pem' % user)
     ssl_cert = posixpath.join(profile_dir, '%s.pem' % user)
     if not master.ssh.isfile(user_cert):
         log.info("Creating SSL certificate for user %s" % user)
         ssl_subj = "/C=US/ST=SC/L=STAR/O=Dis/CN=%s" % master.dns_name
         master.ssh.execute(
             "openssl req -new -newkey rsa:4096 -days 365 "
             '-nodes -x509 -subj %s -keyout %s -out %s' %
             (ssl_subj, ssl_cert, ssl_cert))
     else:
         log.info("Using existing SSL certificate...")
     f = master.ssh.remote_file('%s/ipython_notebook_config.py' %
                                profile_dir)
     notebook_port = 8888
     sha1py = 'from IPython.lib import passwd; print passwd("%s")'
     sha1cmd = "python -c '%s'" % sha1py
     sha1pass = master.ssh.execute(sha1cmd % self.notebook_passwd)[0]
     f.write('\n'.join([
         "c = get_config()",
         "c.IPKernelApp.pylab = 'inline'",
         "c.NotebookApp.certfile = u'%s'" % ssl_cert,
         "c.NotebookApp.ip = '*'",
         "c.NotebookApp.open_browser = False",
         "c.NotebookApp.password = u'%s'" % sha1pass,
         "c.NotebookApp.port = %d" % notebook_port,
     ]))
     f.close()
     if self.notebook_directory is not None:
         if not master.ssh.path_exists(self.notebook_directory):
             master.ssh.makedirs(self.notebook_directory)
         master.ssh.execute_async(
             "ipython notebook --no-browser --notebook-dir='%s'"
             % self.notebook_directory)
     else:
         master.ssh.execute_async("ipython notebook --no-browser")
     self._authorize_port(master, notebook_port, 'notebook')
     log.info("IPython notebook URL: https://%s:%s" %
              (master.dns_name, notebook_port))
     log.info("The notebook password is: %s" % self.notebook_passwd)
     log.warn("Please check your local firewall settings if you're having "
              "issues connecting to the IPython notebook",
              extra=dict(__textwrap__=True))
Example #45
 def setup_tmuxcc(self, client=None, nodes=None, user='******',
                  layout='tiled'):
     log.info("Creating TMUX Control Center for user '%s'" % user)
     client = client or self._master
     nodes = nodes or self._nodes
     envname = self._envname
     orig_user = client.ssh._username
     if orig_user != user:
         client.ssh.connect(username=user)
     if not nodes:
         log.error("Cluster has no nodes, exiting...")
         return
     chunks = list(utils.chunk_list(nodes, items=8))
     num_windows = len(chunks) + len(nodes)
     self.create_session(client, envname, num_windows=num_windows)
     if len(nodes) == 1 and client == nodes[0]:
         return
     if not self._supports_layout(client, envname, layout, window=0):
         log.warn("failed to select layout '%s', defaulting to "
                  "'main-vertical'" % layout)
         layout = "main-vertical"
         status = self._select_layout(client, envname, layout, window=0)
         if status != 0:
             raise exception.PluginError("failed to set a layout")
     for i, chunk in enumerate(chunks):
         self._rename_window(client, envname, i, 'all%s' % i)
         for j, node in enumerate(chunk):
             if j != 0:
                 self._split_window(client, envname, i)
             self._select_layout(client, envname, window=i, layout=layout)
             if node.alias != client.alias:
                 self._send_keys(client, envname, cmd='ssh %s' % node.alias,
                                 window="%d.%d" % (i, j))
     for i, node in enumerate(nodes):
         window = i + len(chunks)
         self._rename_window(client, envname, window, node.alias)
         if node.alias != client.alias:
             self._send_keys(client, envname, cmd='ssh %s' % node.alias,
                             window=window)
     self._select_window(client, envname, window=0)
     self._select_pane(client, envname, window=0, pane=0)
     if orig_user != user:
         client.ssh.connect(username=orig_user)
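
The _send_keys/_split_window helpers are not shown in this snippet; the following hypothetical sketch shows the kind of tmux CLI call a method like _send_keys presumably wraps (names assumed, not from the source):

    # Hypothetical helper -- "session:window.pane" targets are standard
    # tmux addressing:
    def send_keys(client, session, cmd, window):
        target = '%s:%s' % (session, window)
        client.ssh.execute("tmux send-keys -t '%s' '%s' Enter" %
                           (target, cmd))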
Example #46
 def run(self, nodes, master, user, shell, volumes):
     self.config_dict["aws_access_key_id"] = \
         master.ec2.aws_access_key_id
     self.config_dict["aws_secret_access_key"] = \
         master.ec2.aws_secret_access_key
     mssh = master.ssh
     mssh.switch_user(user)
     s3cmd_cfg = "/home/%s/.s3cfg" % user
     if not mssh.path_exists(s3cmd_cfg):
         log.info("Configuring s3cmd for user: %s" % user)
         if self.s3cmd_cfg:
             log.info("Copying %s to %s" % (self.s3cmd_cfg, s3cmd_cfg))
             mssh.put(self.s3cmd_cfg, s3cmd_cfg)
         else:
             log.info("Installing new .s3cfg to: %s" % s3cmd_cfg)
             f = mssh.remote_file(s3cmd_cfg, "w")
             f.write(s3cmd_cfg_TEMPLATE % self.config_dict)
             f.close()
         mssh.chmod(0400, s3cmd_cfg)
     else:
         log.warn("~/.s3cfg file already present - skipping install")
Example #47
 def _add_to_known_hosts(self, node):
     log.info("Configuring local known_hosts file")
     user_home = os.path.expanduser('~')
     khosts = os.path.join(user_home, '.ssh', 'known_hosts')
     if not os.path.isfile(khosts):
         log.warn("Unable to configure known_hosts: file does not exist")
         return
     contents = open(khosts).read()
     if node.dns_name not in contents:
         server_pkey = node.ssh.get_server_public_key()
         khostsf = open(khosts, 'a')
         if not contents.endswith('\n'):
             khostsf.write('\n')
         name_entry = '%s,%s' % (node.dns_name, node.ip_address)
         khostsf.write(' '.join([
             name_entry,
             server_pkey.get_name(),
             base64.b64encode(str(server_pkey)), '\n'
         ]))
         khostsf.close()
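
The entry appended above follows the standard known_hosts layout. A standalone sketch, assuming get_server_public_key() returns a paramiko-style key object:

    # "host1,host2 key-type base64-encoded-key" is the standard layout
    import base64

    def known_hosts_line(dns_name, ip_address, key):
        return '%s,%s %s %s' % (dns_name, ip_address, key.get_name(),
                                base64.b64encode(str(key)))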
Example #48
 def run(self, nodes, master, user, shell, volumes):
     self.config_dict["aws_access_key_id"] = \
         master.ec2.aws_access_key_id
     self.config_dict["aws_secret_access_key"] = \
         master.ec2.aws_secret_access_key
     mssh = master.ssh
     mssh.switch_user(user)
     s3cmd_cfg = "/home/%s/.s3cfg" % user
     if not mssh.path_exists(s3cmd_cfg):
         log.info("Configuring s3cmd for user: %s" % user)
         if self.s3cmd_cfg:
             log.info("Copying %s to %s" % (self.s3cmd_cfg, s3cmd_cfg))
             mssh.put(self.s3cmd_cfg, s3cmd_cfg)
         else:
             log.info("Installing new .s3cfg to: %s" % s3cmd_cfg)
             f = mssh.remote_file(s3cmd_cfg, 'w')
             f.write(s3cmd_cfg_TEMPLATE % self.config_dict)
             f.close()
         mssh.chmod(0400, s3cmd_cfg)
     else:
         log.warn("~/.s3cfg file already present - skipping install")
Example #49
 def _warn_about_volume_hosts(self):
     sg = self.ec2.get_group_or_none(static.VOLUME_GROUP)
     vol_hosts = []
     if sg:
         vol_hosts = [i for i in sg.instances()
                      if i.state in ['running', 'pending']]
     if self._instance:
         vol_hosts.append(self._instance)
     vol_hosts = list(set([h.id for h in vol_hosts]))
     if vol_hosts:
         log.warn("There are still volume hosts running: %s" %
                  ', '.join(vol_hosts))
         if not self._instance:
             log.warn("Run 'starcluster terminate -f %s' to terminate all "
                      "volume host instances" % static.VOLUME_GROUP_NAME,
                      extra=dict(__textwrap__=True))
     elif sg:
         log.info("No active volume hosts found. Run 'starcluster "
                  "terminate -f %(g)s' to remove the '%(g)s' group" %
                  {'g': static.VOLUME_GROUP_NAME},
                  extra=dict(__textwrap__=True))
Example #50
 def get_stats(self):
     """
     This method will ssh to the SGE master and get load & queue stats. It
     will feed these stats to SGEStats, which parses the XML. It will return
     two arrays: one of hosts, each host has a hash with its host
     information inside. The job array contains a hash for every job,
     containing statistics about the job name, priority, etc.
     """
     log.debug("starting get_stats")
     retries = 5
     for i in range(retries):
         try:
             return self._get_stats()
         except Exception:
             log.warn("Failed to retrieve stats (%d/%d):" %
                      (i + 1, retries),
                      exc_info=True)
             log.warn("Retrying in %ds" % self.polling_interval)
             time.sleep(self.polling_interval)
     raise exception.BaseException(
         "Failed to retrieve SGE stats after trying %d times, exiting..." %
         retries)
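
The retry loop above is a generic pattern; the same idea in isolation, as a sketch rather than StarCluster API:

    import time

    def call_with_retries(func, retries=5, delay=60):
        for i in range(retries):
            try:
                return func()
            except Exception:
                # as above: wait out the polling interval, then retry
                time.sleep(delay)
        raise RuntimeError("gave up after %d attempts" % retries)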
Example #51
 def on_add_node(self, node, nodes, master, user, user_shell, volumes):
     log.warn("starting shell")
     node.ssh.execute(
         "sudo -u sgeadmin ssh -L8999:localhost:8999 master -fN > /tmp/tunnel.stdout 2> /tmp/tunnel.stderr"
     )
     log.warn("setting up etc/hosts")
     update_etc_hosts(node)
     log.warn("done")
Example #52
 def _install_efs_on_node(self, node):
     if not node.ssh.path_exists(self.mount_point):
         node.ssh.makedirs(self.mount_point, mode=0777)
     zone = node.ssh.execute('ec2metadata --availability-zone')[0]
     # strip the trailing AZ letter (e.g. 'us-east-1a' -> 'us-east-1')
     region = zone[:-1]
     # assemble the AZ-scoped EFS mount target DNS name
     name_parts = [zone, self.fs_id, 'efs', region, 'amazonaws', 'com']
     efs_dns = '.'.join(name_parts)
     mount_info = node.ssh.execute('grep %s /proc/mounts' %
                                   self.mount_point,
                                   raise_on_failure=False,
                                   ignore_exit_status=True)
     if self.mount_options is None:
         mount_options = ('minorversion=1,rsize=1048576,wsize=1048576'
                          ',hard,timeo=600,retrans=2')
     else:
         mount_options = self.mount_options
     cmd = 'mount -t nfs4 -o %s %s:/ %s' % (mount_options, efs_dns,
                                            self.mount_point)
     if mount_info:
         log.warn('%s is already a mount point' % self.mount_point)
         log.info(mount_info[0])
     else:
         node.ssh.execute(cmd)
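
Worked through with hypothetical values, the DNS assembly above yields the availability-zone-scoped EFS endpoint:

    zone = 'us-east-1a'       # from ec2metadata --availability-zone
    region = zone[:-1]        # 'us-east-1' (AZ letter stripped)
    fs_id = 'fs-12345678'     # hypothetical filesystem id
    efs_dns = '.'.join([zone, fs_id, 'efs', region, 'amazonaws', 'com'])
    # -> 'us-east-1a.fs-12345678.efs.us-east-1.amazonaws.com'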
Example #53
 def load(self):
     """
     Populate this config object from the StarCluster config
     """
     log.debug('Loading config')
     try:
         self.globals = self._load_section('global', self.global_settings)
     except exception.ConfigSectionMissing:
         pass
     try:
         self.aws = self._load_section('aws info', self.aws_settings)
     except exception.ConfigSectionMissing:
         log.warn("no [aws info] section found in config")
         log.warn("attempting to load credentials from environment...")
         self.aws.update(self.get_aws_from_environ())
     self.keys = self._load_sections('key', self.key_settings)
     self.vols = self._load_sections('volume', self.volume_settings)
     self.plugins = self._load_sections('plugin', self.plugin_settings)
     self.permissions = self._load_sections('permission',
                                            self.permission_settings)
     sections = self._get_sections('cluster')
     self.clusters = self._load_cluster_sections(sections)
     return self
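
For context, a hypothetical minimal config showing the section types loaded above (placeholder values; StarCluster configs are INI-style):

    [global]
    DEFAULT_TEMPLATE = smallcluster

    [aws info]
    AWS_ACCESS_KEY_ID = <your-access-key-id>
    AWS_SECRET_ACCESS_KEY = <your-secret-key>

    [key mykey]
    KEY_LOCATION = ~/.ssh/mykey.rsa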
Example #54
 def __init__(self,
              interval=60,
              plot=False,
              max_nodes=5,
              wait_time=900,
              add_pi=1,
              kill_after=45,
              stab=180,
              lookback_win=3,
              min_nodes=1):
     self._cluster = None
     self.polling_interval = interval
     self._visualizer_on = plot
     self.max_nodes = max_nodes
     self.longest_allowed_queue_time = wait_time
     self.add_nodes_per_iteration = add_pi
     self.kill_after = kill_after
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.min_nodes = min_nodes
     self.allow_master_kill = False
     if self.longest_allowed_queue_time < 300:
         log.warn("wait_time should be >= 300 seconds " + \
                  "(it takes ~5 min to launch a new EC2 node)")
Example #55
 def _create_image_from_ebs(self, size=15):
     log.info("Creating new EBS AMI...")
     imgid = self.ec2.create_image(self.host.id, self.name,
                                   self.description)
     img = self.ec2.get_image(imgid)
     log.info("New EBS AMI created: %s" % imgid)
     root_dev = self.host.root_device_name
     if root_dev in self.host.block_device_mapping:
         log.info("Fetching block device mapping for %s" % imgid,
                  extra=dict(__nonewline__=True))
         s = Spinner()
         try:
             s.start()
             while root_dev not in img.block_device_mapping:
                 img = self.ec2.get_image(imgid)
                 time.sleep(5)
         finally:
             s.stop()
         snapshot_id = img.block_device_mapping[root_dev].snapshot_id
         snap = self.ec2.get_snapshot(snapshot_id)
         self.ec2.wait_for_snapshot(snap)
     else:
         log.warn("Unable to find root device - cant wait for snapshot")
     log.info("Waiting for %s to become available..." % imgid,
              extra=dict(__nonewline__=True))
     s = Spinner()
     try:
         s.start()
         while img.state == "pending":
             time.sleep(15)
             if img.update() == "failed":
                 raise exception.AWSError(
                     "EBS image creation failed for %s" % imgid)
     finally:
         s.stop()
     return imgid
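
The pending-state poll above spins forever if the image never transitions; a sketch of the same wait with a timeout guard (not StarCluster's actual helper):

    import time

    def wait_for_image(img, interval=15, timeout=1800):
        waited = 0
        # boto's Image.update() refreshes and returns the current state
        while img.update() == "pending":
            if waited >= timeout:
                raise RuntimeError("timed out waiting for %s" % img.id)
            time.sleep(interval)
            waited += interval
        if img.state == "failed":
            raise RuntimeError("image creation failed for %s" % img.id)
        return img.state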
Example #56
    def setup_crontab(self, master, nodes):
        cluster_name = master.parent_cluster.name
        assert cluster_name.startswith(SECURITY_GROUP_PREFIX)
        cluster_name = cluster_name[len(SECURITY_GROUP_PREFIX):]

        domain = '%s-heartbeats' % cluster_name
        # make sure that the domain exists and the user can access it
        sdbc = boto.sdb.connect_to_region(self.region,
                                          aws_access_key_id=self.key,
                                          aws_secret_access_key=self.secret)
        assert sdbc is not None
        try:
            dom = sdbc.get_domain(domain)
        except boto.exception.SDBResponseError:
            log.warn("Creating new domain %s for heartbeats", domain)
            dom = sdbc.create_domain(domain)
        log.warn(
            "Verifying that domain %s is accessible with non-admin credentials",
            domain)
        item = dom.get_item('heartbeat')

        script_template_name = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "deadmanswitch-check.template")
        script_template = open(script_template_name).read()

        # apply config settings to template
        script_body = script_template.format(**dict(key=self.key,
                                                    secret=self.secret,
                                                    topic=self.topic,
                                                    domain=domain,
                                                    region=self.region,
                                                    cluster_name=cluster_name))

        script = tempfile.NamedTemporaryFile("w")
        script.write(script_body)
        script.flush()

        for node in nodes:
            node.ssh.put(script.name,
                         "/tmp/cluster_scripts/deadmanswitch-check.py")
            node.ssh.execute(
                "chmod a+xr /tmp/cluster_scripts/deadmanswitch-check.py")
            log.warn("Adding cronjob for checking deadmans switch on %s",
                     str(node))
            command = "echo '0,10,20,30,40,50 * * * * ubuntu /usr/bin/python /tmp/cluster_scripts/deadmanswitch-check.py > /tmp/deadmanswitch-check.log 2>&1' > /etc/cron.d/cluster-deadmans-switch && service cron reload"
            node.ssh.execute(command)
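
The cron line written above uses the /etc/cron.d layout, which carries a user field that per-user crontabs lack; broken out for readability:

    schedule = '0,10,20,30,40,50 * * * *'   # every ten minutes
    cron_user = 'ubuntu'                    # field unique to /etc/cron.d
    job = ('/usr/bin/python /tmp/cluster_scripts/deadmanswitch-check.py '
           '> /tmp/deadmanswitch-check.log 2>&1')
    cron_line = ' '.join([schedule, cron_user, job])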
Example #57
 def warn_experimental(self, msg, num_secs=10):
     """
     Warn user that an experimental feature is being used
     Counts down from num_secs before continuing
     """
     sep = '*' * 60
     log.warn('\n'.join([sep, msg, sep]), extra=dict(__textwrap__=True))
     print
     log.warn("Waiting %d seconds before continuing..." % num_secs)
     log.warn("Press CTRL-C to cancel...")
     for i in range(num_secs, 0, -1):
         sys.stdout.write('%d...' % i)
         sys.stdout.flush()
         time.sleep(1)
     print