Example #1
 def is_up(self):
     if self.update() != 'running':
         log.info(self.alias + " is not running")
         return False
     try:
         if not self.is_ssh_up():
             log.info(self.alias + " ssh is not up")
             return False
     except socket.error as e:
         log.warning("Checking is node {} is up encountered exception {}"
                     .format(self.alias, e), exc_info=True)
         return False
     if self.private_ip_address is None:
         log.debug("instance %s has no private_ip_address" % self.id)
         log.debug("attempting to determine private_ip_address for "
                   "instance %s" % self.id)
         try:
             private_ip = self.ssh.execute(
                 'python -c '
                 '"import socket; print socket.gethostbyname(\'%s\')"' %
                 self.private_dns_name)[0].strip()
             log.debug("determined instance %s's private ip to be %s" %
                       (self.id, private_ip))
             self.instance.private_ip_address = private_ip
         except Exception as e:
             log.info(self.alias + " encountered an exception: %s" % e)
             return False
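
For context, callers typically poll is_up() until the node becomes reachable. A minimal sketch of such a loop, assuming only the is_up() method shown above (the wait_for_node helper and its defaults are illustrative, not part of StarCluster):

import time

def wait_for_node(node, interval=30, timeout=900):
    """Poll node.is_up() until it succeeds or the timeout expires."""
    waited = 0
    while waited < timeout:
        if node.is_up():
            return True
        time.sleep(interval)
        waited += interval
    return False
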
Example #2
    def update_dns(self, host_name, ip_address):
        ttl = 10
        host_name = ".".join([host_name, self.domain])
        conn = boto.connect_route53()

        response = conn.get_all_rrsets(self.hosted_zone_id,
                                       'A',
                                       host_name,
                                       maxitems=1)
        if len(response):
            response = response[0]
            comment = "Starcluster route53 plugin deleted record for %s" % (
                host_name)
            changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
            change1 = changes.add_change("DELETE", host_name, 'A',
                                         response.ttl)
            for old_value in response.resource_records:
                change1.add_value(old_value)
            try:
                changes.commit()
                log.info(comment)
            except Exception as e:
                log.warning(e)

        comment = "Starcluster route53 plugin updated record for %s to %s" % (
            host_name, ip_address)
        changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
        change2 = changes.add_change("CREATE", host_name, 'A', ttl)
        change2.add_value(ip_address)
        try:
            changes.commit()
            log.info(comment)
        except Exception as e:
            log.warning(e)
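
The plugin above emulates an upsert by deleting any existing A record and then creating a new one. For reference, a minimal standalone sketch of the boto 2 Route 53 calls it relies on, assuming credentials are already configured; the zone id, record name, and IP address are placeholders:

import boto
from boto.route53.record import ResourceRecordSets

conn = boto.connect_route53()
zone_id = "Z123EXAMPLE"  # placeholder hosted zone id

changes = ResourceRecordSets(conn, zone_id, "example change batch")
change = changes.add_change("CREATE", "node001.cluster.example.com.", "A", 10)
change.add_value("203.0.113.10")
changes.commit()  # sends the batched record change to Route 53
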
Example #3
 def is_up(self):
     if self.update() != 'running':
         log.info(self.alias + " is not running")
         return False
     try:
         if not self.is_ssh_up():
             log.info(self.alias + " ssh is not up")
             return False
     except socket.error as e:
         log.warning("Checking is node {} is up encountered exception {}"
                     .format(self.alias, e), exc_info=True)
         return False
     if self.private_ip_address is None:
         log.debug("instance %s has no private_ip_address" % self.id)
         log.debug("attempting to determine private_ip_address for "
                   "instance %s" % self.id)
         try:
             private_ip = self.ssh.execute(
                 'python -c '
                 '"import socket; print socket.gethostbyname(\'%s\')"' %
                 self.private_dns_name)[0].strip()
             log.debug("determined instance %s's private ip to be %s" %
                       (self.id, private_ip))
             self.instance.private_ip_address = private_ip
         except Exception as e:
             log.info(self.alias + " encountered an exception: %s" % e)
             return False
Example #4
    def update_dns(self, host_name, ip_address):
        ttl = 10
        host_name = ".".join([host_name, self.domain])
        conn = boto.connect_route53()

        response = conn.get_all_rrsets(self.hosted_zone_id, 'A', host_name, maxitems=1)
        if len(response):
            response = response[0]
            comment = "Starcluster route53 plugin deleted record for %s"%(host_name)
            changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
            change1 = changes.add_change("DELETE", host_name, 'A', response.ttl)
            for old_value in response.resource_records:
                change1.add_value(old_value)
            try:
                changes.commit()
                log.info(comment)
            except Exception as e:
                log.warning(e)

        comment = "Starcluster route53 plugin updated record for %s to %s"%(host_name, ip_address)
        changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
        change2 = changes.add_change("CREATE", host_name, 'A', ttl)
        change2.add_value(ip_address)
        try:
            changes.commit()
            log.info(comment)
        except Exception as e:
            log.warning(e)
Example #5
 def is_ssh_up(self):
     try:
         return self.ssh.transport is not None
     except exception.SSHError:
         return False
     except socket.error:
         log.warning("error encountered while checking if {} is up:"
                     .format(self.alias), exc_info=True)
         return False
Example #6
 def is_ssh_up(self):
     try:
         return self.ssh.transport is not None
     except exception.SSHError:
         return False
     except socket.error:
         log.warning("error encountered while checking if {} is up:"
                     .format(self.alias), exc_info=True)
         return False
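
The exc_info=True argument is what attaches the active traceback to the warning above. A quick standalone illustration with the stdlib logging module; StarCluster's log object is assumed to expose the same interface, and the logger name here is arbitrary:

import logging
import socket

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger("demo")

try:
    raise socket.error("connection reset by peer")
except socket.error:
    # exc_info=True appends the current traceback to the log record
    log.warning("error encountered while checking if %s is up:", "node001",
                exc_info=True)
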
Example #7
    def get_nodes_to_recover(self, nodes):
        """
        Active nodes that are not in OGS.
        """
        if len(nodes) == 1:
            return []

        master = nodes[0]
        qhosts = master.ssh.execute("qhost", source_profile=True)
        qhosts = qhosts[3:]
        missing = []
        parsed_qhosts = {}
        for line in qhosts:
            line = filter(lambda x: len(x) > 0, line.split(" "))
            parsed_qhosts[line[0]] = line[1:]
        for node in nodes:
            short_al = node.short_alias
            # nodes missing from qhost
            if short_al not in parsed_qhosts:
                if node.is_master():
                    assert(not self.master_is_exec_host)
                else:
                    missing.append(node)
            elif parsed_qhosts[short_al][-1] == "-" and not node.is_master():
                # nodes present but w/o stats
                if node.is_up():
                    try:
                        node.ssh.execute("qhost", source_profile=True)
                        log.warning("Restarting sge_execd over " + node.alias)
                        node.ssh.execute(
                            "pkill -9 sge_execd "
                            "&& /opt/sge6/bin/linux-x64/sge_execd",
                            source_profile=True)
                    except RemoteCommandFailed:
                        # normal -> means OGS doesn't run on the node
                        # slow path
                        log.debug("RemoteCommandFailed, "
                                  "OGS likely not running remotely")
                        missing.append(node)
                    except socket.error:
                        # socket closed by remote... try to add it back
                        log.warning("socket.error", exc_info=True)
                        missing.append(node)
                else:
                    missing.append(node)

        return missing
Example #8
    def get_nodes_to_recover(self, nodes):
        """
        Active nodes that are not in OGS.
        """
        if len(nodes) == 1:
            return []

        master = nodes[0]
        qhosts = master.ssh.execute("qhost", source_profile=True)
        qhosts = qhosts[3:]
        missing = []
        parsed_qhosts = {}
        for line in qhosts:
            line = filter(lambda x: len(x) > 0, line.split(" "))
            parsed_qhosts[line[0]] = line[1:]
        for node in nodes:
            short_al = node.short_alias
            # nodes missing from qhost
            if short_al not in parsed_qhosts:
                if node.is_master():
                    assert not self.master_is_exec_host
                else:
                    missing.append(node)
            elif parsed_qhosts[short_al][-1] == "-" and not node.is_master():
                # nodes present but w/o stats
                if node.is_up():
                    try:
                        node.ssh.execute("qhost", source_profile=True)
                        log.warning("Restarting sge_execd over " + node.alias)
                        node.ssh.execute(
                            "pkill -9 sge_execd " "&& /opt/sge6/bin/linux-x64/sge_execd", source_profile=True
                        )
                    except RemoteCommandFailed:
                        # normal -> means OGS doesn't run on the node
                        # slow path
                        log.debug("RemoteCommandFailed, " "OGS likely not running remotely")
                        missing.append(node)
                    except socket.error:
                        # socket closed by remote... try to add it back
                        log.warning("socket.error", exc_info=True)
                        missing.append(node)
                else:
                    missing.append(node)

        return missing
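
The qhost parsing in Examples #7 and #8 is easier to follow against canned output. A minimal sketch that replaces the live SSH call with illustrative lines (the sample values are made up):

sample = [
    "HOSTNAME  ARCH        NCPU  LOAD  MEMTOT  MEMUSE  SWAPTO  SWAPUS",
    "----------------------------------------------------------------",
    "global    -           -     -     -       -       -       -",
    "master    lx26-amd64  2     0.01  7.5G    512M    0       0",
    "node001   lx26-amd64  2     -     -       -       -       -",
]

parsed_qhosts = {}
for line in sample[3:]:  # skip the two header lines and the "global" entry
    fields = [f for f in line.split(" ") if f]
    parsed_qhosts[fields[0]] = fields[1:]

# a trailing "-" means the host is registered in OGS but reports no stats
stale = [name for name, cols in parsed_qhosts.items() if cols[-1] == "-"]
print(stale)  # ['node001']
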
Example #9
    def _remove_from_known_hosts(self, node):
        nodes = filter(lambda x: x.id != node.id, self.running_nodes)
        master = None

        for n in nodes:
            if n.is_master():
                master = n

        master.remove_from_known_hosts('root', [node])
        master.remove_from_known_hosts(self._user, [node])

        target = posixpath.join('/root', '.ssh', 'known_hosts')
        master.copy_remote_file_to_nodes(target, nodes)

        user_homedir = os.path.expanduser('~' + self._user)
        target = posixpath.join(user_homedir, '.ssh', 'known_hosts')
        try:
            master.copy_remote_file_to_nodes(target, nodes)
        except IOError:
            log.warning("Failed to copy file " + target)
Example #10
    def _remove_from_known_hosts(self, node):
        nodes = filter(lambda x: x.id != node.id, self.running_nodes)
        master = None

        for n in nodes:
            if n.is_master():
                master = n

        master.remove_from_known_hosts('root', [node])
        master.remove_from_known_hosts(self._user, [node])

        target = posixpath.join('/root', '.ssh', 'known_hosts')
        master.copy_remote_file_to_nodes(target, nodes)

        user_homedir = os.path.expanduser('~' + self._user)
        target = posixpath.join(user_homedir, '.ssh', 'known_hosts')
        try:
            master.copy_remote_file_to_nodes(target, nodes)
        except IOError:
            log.warning("Failed to copy file " + target)
Example #11
def get_config(cluster_tag=None, template_name=None):
    """
    Load the list of configurations used to start clusters with the given
    <cluster_tag>, most recent configuration first.
    """
    cfg = StarClusterConfig().load()
    if not cluster_tag and not template_name:
        log.warning("Attempt to clone without a template_name or cluster_tag")
        return []
    s3_bucket = cfg.aws['aws_meta_bucket']
    table_name = cfg.aws['aws_config_table']
    if table_name is None:
        log.warning(("AWS_CONFIG_TABLE is not defined."
                        " This cluster will not be cloneable."))
        return False
    if s3_bucket is None:
        log.warning(("AWS_META_BUCKET is not defined."
                        "This cluster will not be cloneable"))
        return False
    conn = boto.dynamodb2.connect_to_region('us-east-1')
    cluster_info_table = Table(table_name, connection=conn)
    #print cluster_info_table.describe()
    if cluster_tag:
        clusters = cluster_info_table.query(cluster_tag__eq=cluster_tag)
    else:
        clusters = cluster_info_table.scan(template_name__eq=template_name)
    if clusters:
        return [c for c in clusters]
    else:
        return []
Example #12
def store_config(cluster_template,
                 cluster_tag,
                 cluster_region,
                 template_name=None,
                 zone=None):
    """
    Store the JSON-encoded cluster_template used to start up <cluster_tag>
    in <cluster_region> in S3.
    <zone> is the zone name if the cluster is tied to a zone.

    A pointer to the template is stored in DynamoDB.
    """
    cfg = StarClusterConfig().load()
    s3_bucket = cfg.aws['aws_meta_bucket']
    table_name = cfg.aws['aws_config_table']
    if table_name is None:
        log.warning(("AWS_CONFIG_TABLE is not defined."
                     " This cluster will not be cloneable."))
        return False
    if s3_bucket is None:
        log.warning(("AWS_META_BUCKET is not defined."
                     "This cluster will not be cloneable"))
        return False
    conn = boto.dynamodb2.connect_to_region('us-east-1')
    cluster_info_table = Table(table_name, connection=conn)
    if cluster_info_table is None:
        return log.warning("%s table not found in us-east-1" % table_name)
    config_path = config_to_s3(json.dumps(cluster_template), s3_bucket)
    # data to be stored in DynamoDB
    table_data = {}
    table_data['cluster_tag'] = cluster_tag
    table_data['template_name'] = template_name if template_name else 'NA'
    table_data['timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S +0000",
                                            time.gmtime())
    table_data['config_path'] = config_path
    table_data['region'] = cluster_region
    table_data['zone'] = zone.name if zone else 'NA'
    cluster_info_table.put_item(data=table_data)
    return True
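
store_config() and get_config() meet in the DynamoDB table: one writes an item pointing at the S3 copy of the template, the other reads it back with boto 2's dynamodb2 filter syntax. A minimal sketch of that round trip, assuming the table already exists; the table name and values below are placeholders:

import time

import boto.dynamodb2
from boto.dynamodb2.table import Table

conn = boto.dynamodb2.connect_to_region('us-east-1')
table = Table('starcluster-configs', connection=conn)  # placeholder table name

# write a pointer to the stored template, as store_config() does
table.put_item(data={
    'cluster_tag': 'mycluster',
    'template_name': 'smallcluster',
    'timestamp': time.strftime("%Y-%m-%d %H:%M:%S +0000", time.gmtime()),
    'config_path': 's3://my-meta-bucket/configs/mycluster.json',  # placeholder
    'region': 'us-east-1',
    'zone': 'NA',
})

# read it back, as get_config() does
for item in table.query(cluster_tag__eq='mycluster'):
    print(item['config_path'], item['timestamp'])
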
Example #13
def get_config(cluster_tag=None, template_name=None):
    """
    Load the list of configurations used to start clusters with the given
    <cluster_tag>, most recent configuration first.
    """
    cfg = StarClusterConfig().load()
    if not cluster_tag and not template_name:
        log.warning("Attempt to clone without a template_name or cluster_tag")
        return []
    s3_bucket = cfg.aws['aws_meta_bucket']
    table_name = cfg.aws['aws_config_table']
    if table_name is None:
        log.warning(("AWS_CONFIG_TABLE is not defined."
                     " This cluster will not be cloneable."))
        return False
    if s3_bucket is None:
        log.warning(("AWS_META_BUCKET is not defined."
                     "This cluster will not be cloneable"))
        return False
    conn = boto.dynamodb2.connect_to_region('us-east-1')
    cluster_info_table = Table(table_name, connection=conn)
    #print cluster_info_table.describe()
    if cluster_tag:
        clusters = cluster_info_table.query(cluster_tag__eq=cluster_tag)
    else:
        clusters = cluster_info_table.scan(template_name__eq=template_name)
    if clusters:
        return [c for c in clusters]
    else:
        return []
Example #14
def store_config(cluster_template, cluster_tag, cluster_region,
                 template_name=None, zone=None):
    """
    Store the JSON-encoded cluster_template used to start up <cluster_tag>
    in <cluster_region> in S3.
    <zone> is the zone name if the cluster is tied to a zone.

    A pointer to the template is stored in DynamoDB.
    """
    cfg = StarClusterConfig().load()
    s3_bucket = cfg.aws['aws_meta_bucket']
    table_name = cfg.aws['aws_config_table']
    if table_name is None:
        log.warning(("AWS_CONFIG_TABLE is not defined."
                " This cluster will not be cloneable."))
        return False
    if s3_bucket is None:
        log.warning(("AWS_META_BUCKET is not defined."
                "This cluster will not be cloneable"))
        return False
    conn = boto.dynamodb2.connect_to_region('us-east-1')
    cluster_info_table = Table(table_name, connection=conn)
    if cluster_info_table is None:
        return log.warning("%s table not found in us-east-1" % table_name)
    config_path = config_to_s3(json.dumps(cluster_template), s3_bucket)
    # data to be stored in DynamoDB
    table_data = {}
    table_data['cluster_tag'] = cluster_tag
    table_data['template_name'] = template_name if template_name else 'NA'
    table_data['timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S +0000",
                                    time.gmtime())
    table_data['config_path'] = config_path
    table_data['region'] = cluster_region
    table_data['zone'] = zone.name if zone else 'NA'
    cluster_info_table.put_item(data=table_data)
    return True 
Example #15
    def clean_cluster(self, nodes, master, user, user_shell, volumes):
        """
        Run qhost to find nodes that are present in OGS but not in the cluster
        in order to remove them.
        """
        self._master = master
        self._nodes = nodes

        qhosts = master.ssh.execute("qhost", source_profile=True)
        qhosts = qhosts[3:]
        qhosts = [line.split(" ")[0] for line in qhosts]

        if len(qhosts) == 0:
            log.info("Nothing to clean")

        alive_nodes = [node.short_alias for node in nodes]

        cleaned = []
        # find dead hosts
        for node_alias in qhosts:
            if node_alias not in alive_nodes:
                cleaned.append(node_alias)

        # find jobs running in dead hosts
        qstats_xml = self._master.ssh.execute('qstat -u "*" -xml', source_profile=True)
        qstats_xml = qstats_xml[1:]  # remove first line
        qstats_et = ET.fromstringlist(qstats_xml)
        to_delete = []
        to_repair = []
        cleaned_queue = []  # not a lambda function to allow pickling
        for c in cleaned:
            cleaned_queue.append("all.q@" + c)
        for job_list in qstats_et.find("queue_info").findall("job_list"):
            if job_list.find("queue_name").text in cleaned_queue:
                job_number = job_list.find("JB_job_number").text
                to_delete.append(job_number)
        for job_list in qstats_et.find("job_info").findall("job_list"):
            if job_list.find("state").text == "Eqw":
                job_number = job_list.find("JB_job_number").text
                to_repair.append(job_number)
        # delete the jobs
        if to_delete:
            log.info("Stopping jobs: " + str(to_delete))
            self._master.ssh.execute("qdel -f " + " ".join(to_delete))
            time.sleep(3)  # otherwise might provoke LOST QRSH if on last job
        if to_repair:
            log.error("Reseting jobs: " + str(to_repair))
            self._master.ssh.execute("qmod -cj " + " ".join(to_repair), ignore_exit_status=True)

        # stuck qrsh issue
        ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
        qstat_wc = int(self._master.ssh.execute('qstat -u "*" | wc -l')[0])
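        # note: the "ps -ef | grep qrsh" pipeline normally matches itself
        # (the grep process and the shell running it), which accounts for
        # roughly two lines; hence the "> 2" threshold below for spotting
        # leftover qrsh client processes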
        if qstat_wc == 0 and ps_wc > 2:
            log.error("LOST QRSH??")
            log.error("pkill -9 qrsh")
            self._master.ssh.execute("pkill -9 qrsh", ignore_exit_status=True)
        # ----------------------------------

        # delete the host config
        for c in cleaned:
            log.info("Cleaning node " + c)
            if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
                log.warn(c + " is missing from /etc/hosts, creating a dummy " "entry 1.1.1.1")
                rfile = master.ssh.remote_file("/etc/hosts", "a")
                rfile.write("1.1.1.1 " + c + "\n")
                rfile.close()

            try:
                self._remove_from_sge(DeadNode(c), only_clean_master=True)
            except RemoteCommandFailed:
                log.warning("Failed to remove node {} from sge.".format(c), exc_info=True)

        # fix to allow pickling
        self._master = None
        self._nodes = None
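
The qstat -xml handling above is the densest part of clean_cluster(). A standalone sketch of the same ElementTree lookups against a canned document (the XML below is illustrative, not verbatim scheduler output):

import xml.etree.ElementTree as ET

qstat_xml = """<job_info>
  <queue_info>
    <job_list state="running">
      <JB_job_number>101</JB_job_number>
      <queue_name>all.q@node001</queue_name>
    </job_list>
  </queue_info>
  <job_info>
    <job_list state="pending">
      <JB_job_number>102</JB_job_number>
      <state>Eqw</state>
    </job_list>
  </job_info>
</job_info>"""

root = ET.fromstring(qstat_xml)
cleaned_queue = ["all.q@node001"]  # queues on hosts no longer in the cluster

to_delete = [j.find("JB_job_number").text
             for j in root.find("queue_info").findall("job_list")
             if j.find("queue_name").text in cleaned_queue]
to_repair = [j.find("JB_job_number").text
             for j in root.find("job_info").findall("job_list")
             if j.find("state") is not None and j.find("state").text == "Eqw"]

print(to_delete)  # ['101'] -> candidates for "qdel -f"
print(to_repair)  # ['102'] -> candidates for "qmod -cj"
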
Example #16
class CmdStart(ClusterCompleter):
    """
    start [options] <cluster_tag>

    Start a new cluster

    Example:

        $ starcluster start mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the default cluster template defined in the configuration file. The
    default cluster template is specified by the 'default_template' option in
    the [global] section of the config. To use another template besides the
    default use the -c (--cluster-template) option:

        $ starcluster start -c largecluster mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the "largecluster" cluster template instead of the default template.
    """
    names = ['start']

    def addopts(self, parser):
        templates = []
        if self.cfg:
            templates = self.cfg.clusters.keys()
        parser.add_option("-x",
                          "--no-create",
                          dest="no_create",
                          action="store_true",
                          default=False,
                          help="do not launch new EC2 instances when "
                          "starting cluster (use existing instances instead)")
        parser.add_option("-o",
                          "--create-only",
                          dest="create_only",
                          action="store_true",
                          default=False,
                          help="only launch/start EC2 instances, "
                          "do not perform any setup routines")
        parser.add_option("-v",
                          "--validate-only",
                          dest="validate_only",
                          action="store_true",
                          default=False,
                          help="only validate cluster settings, do "
                          "not start a cluster")
        parser.add_option("-V",
                          "--skip-validation",
                          dest="validate",
                          action="store_false",
                          default=True,
                          help="do not validate cluster settings")
        parser.add_option("-l",
                          "--login-master",
                          dest="login_master",
                          action="store_true",
                          default=False,
                          help="login to master node after launch")
        parser.add_option("-q",
                          "--disable-queue",
                          dest="disable_queue",
                          action="store_true",
                          default=None,
                          help="do not configure a queueing system (SGE)")
        parser.add_option("-Q",
                          "--enable-queue",
                          dest="disable_queue",
                          action="store_false",
                          default=None,
                          help="configure a queueing system (SGE) (default)")
        parser.add_option("--force-spot-master",
                          dest="force_spot_master",
                          action="store_true",
                          default=None,
                          help="when creating a spot cluster "
                          "the default is to launch the master as "
                          "a flat-rate instance for stability. this option "
                          "forces launching the master node as a spot "
                          "instance when a spot cluster is requested.")
        parser.add_option("--no-spot-master",
                          dest="force_spot_master",
                          action="store_false",
                          default=None,
                          help="Do not launch the master node as a spot "
                          "instance when a spot cluster is requested. "
                          "(default)")
        parser.add_option("--public-ips",
                          dest="public_ips",
                          default=None,
                          action='store_true',
                          help="Assign public IPs to all VPC nodes "
                          "(VPC clusters only)"),
        parser.add_option("--no-public-ips",
                          dest="public_ips",
                          default=None,
                          action='store_false',
                          help="Do NOT assign public ips to all VPC nodes "
                          "(VPC clusters only) (default)"),
        opt = parser.add_option("-c",
                                "--cluster-template",
                                action="store",
                                dest="cluster_template",
                                choices=templates,
                                default=None,
                                help="cluster template to use "
                                "from the config file")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-r",
                          "--refresh-interval",
                          dest="refresh_interval",
                          type="int",
                          action="callback",
                          default=None,
                          callback=self._positive_int,
                          help="refresh interval when waiting for cluster "
                          "nodes to come up (default: 30)")
        parser.add_option("-b",
                          "--bid",
                          dest="spot_bid",
                          action="store",
                          type="float",
                          default=None,
                          help="requests spot instances instead of flat "
                          "rate instances. Uses SPOT_BID as max bid for "
                          "the request.")
        parser.add_option("-d",
                          "--description",
                          dest="cluster_description",
                          action="store",
                          type="string",
                          default="Cluster requested at %s" %
                          time.strftime("%Y%m%d%H%M"),
                          help="brief description of cluster")
        parser.add_option("-s",
                          "--cluster-size",
                          dest="cluster_size",
                          action="callback",
                          type="int",
                          default=None,
                          callback=self._positive_int,
                          help="number of ec2 instances to launch")
        parser.add_option("-u",
                          "--cluster-user",
                          dest="cluster_user",
                          action="store",
                          type="string",
                          default=None,
                          help="name of user to create on cluster "
                          "(defaults to sgeadmin)")
        opt = parser.add_option("-S",
                                "--cluster-shell",
                                dest="cluster_shell",
                                action="store",
                                choices=static.AVAILABLE_SHELLS.keys(),
                                default=None,
                                help="shell for cluster user "
                                "(defaults to bash)")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-m",
                          "--master-image-id",
                          dest="master_image_id",
                          action="store",
                          type="string",
                          default=None,
                          help="AMI to use when launching master")
        parser.add_option("-n",
                          "--node-image-id",
                          dest="node_image_id",
                          action="store",
                          type="string",
                          default=None,
                          help="AMI to use when launching nodes")
        parser.add_option("-I",
                          "--master-instance-type",
                          dest="master_instance_type",
                          action="store",
                          choices=sorted(static.INSTANCE_TYPES.keys()),
                          default=None,
                          help="instance type for the master "
                          "instance")
        opt = parser.add_option("-i",
                                "--node-instance-type",
                                dest="node_instance_type",
                                action="store",
                                choices=sorted(static.INSTANCE_TYPES.keys()),
                                default=None,
                                help="instance type for the node instances")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-a",
                          "--availability-zone",
                          dest="availability_zone",
                          action="store",
                          type="string",
                          default=None,
                          help="availability zone to launch instances in")
        parser.add_option("-k",
                          "--keyname",
                          dest="keyname",
                          action="store",
                          type="string",
                          default=None,
                          help="name of the keypair to use when "
                          "launching the cluster")
        parser.add_option("-K",
                          "--key-location",
                          dest="key_location",
                          action="store",
                          type="string",
                          default=None,
                          metavar="FILE",
                          help="path to an ssh private key that matches the "
                          "cluster keypair")
        parser.add_option("-U",
                          "--userdata-script",
                          dest="userdata_scripts",
                          action="append",
                          default=None,
                          metavar="FILE",
                          help="Path to userdata script that will run on "
                          "each node on start-up. Can be used multiple times.")
        parser.add_option("-P",
                          "--dns-prefix",
                          dest="dns_prefix",
                          action='store_true',
                          help="Prefix dns names of all nodes in the cluster "
                          "with the cluster tag")
        parser.add_option("-p",
                          "--no-dns-prefix",
                          dest="dns_prefix",
                          action='store_false',
                          help="Do NOT prefix dns names of all nodes in the "
                          "cluster with the cluster tag (default)")
        # This option is disabled because we need to use nargs='+' which is
        # supported by argparse but not optparse. Use cluster template
        # configuration key SUBNET_IDS instead.
        # parser.add_option("-N", "--subnet-id", dest="subnet_id",
        #                   action="store", type="string",
        #                   help=("Launch cluster into a VPC subnet"))
        parser.add_option("--config-on-master",
                          default=False,
                          action='store_true',
                          help="Store the config on the "
                          "master node rather than into the security group "
                          "tags")
        parser.add_option("--dns-suffix",
                          action="store_true",
                          dest="dns_suffix",
                          help="Suffix dns names of all "
                          " nodes in the cluster with the cluster tag.")

    def execute(self, args):
        if len(args) != 1:
            self.parser.error("please specify a <cluster_tag>")
        tag = args[0]
        if tag.find("master") > -1:
            # Because of Node.is_master
            raise exception.ClusterValidationError("Cluster name cannot "
                                                   "contain master")

        create = not self.opts.no_create
        scluster = self.cm.get_cluster_group_or_none(tag)
        if scluster and create:
            scluster = self.cm.get_cluster(tag,
                                           group=scluster,
                                           load_receipt=False,
                                           require_keys=False)
            stopped_ebs = scluster.is_cluster_stopped()
            is_ebs = False
            if not stopped_ebs:
                is_ebs = scluster.is_ebs_cluster()
            raise exception.ClusterExists(tag,
                                          is_ebs=is_ebs,
                                          stopped_ebs=stopped_ebs)
        if not create and not scluster:
            raise exception.ClusterDoesNotExist(tag)
        create_only = self.opts.create_only
        validate = self.opts.validate
        validate_running = self.opts.no_create
        validate_only = self.opts.validate_only
        config_on_master = self.opts.config_on_master

        if scluster:
            if config_on_master:
                scluster = self.cm.get_cluster(tag,
                                               group=scluster,
                                               load_receipt=False)
                validate_running = False
            else:
                scluster = self.cm.get_cluster(tag, group=scluster)
                validate_running = True
        else:
            template = self.opts.cluster_template
            if not template:
                try:
                    template = self.cm.get_default_cluster_template()
                except exception.NoDefaultTemplateFound as e:
                    try:
                        ctmpl = e.options[0]
                    except IndexError:
                        ctmpl = "smallcluster"
                    e.msg += " \n\nAlternatively, you can specify a cluster "
                    e.msg += "template to use by passing the '-c' option to "
                    e.msg += "the 'start' command, e.g.:\n\n"
                    e.msg += "    $ starcluster start -c %s %s" % (ctmpl, tag)
                    raise e
                log.info("Using default cluster template: %s" % template)
            scluster = self.cm.get_cluster_template(template, tag)
        scluster.update(self.specified_options_dict, True)
        if self.opts.keyname and not self.opts.key_location:
            key = self.cfg.get_key(self.opts.keyname)
            scluster.key_location = key.key_location
        if not self.opts.refresh_interval:
            interval = self.cfg.globals.get("refresh_interval")
            if interval is not None:
                scluster.refresh_interval = interval
        if self.opts.spot_bid is not None and not self.opts.no_create:
            scluster.node_instance_array[0]['spot_bid'] = self.opts.spot_bid
            msg = user_msgs.spotmsg % {
                'size': scluster.cluster_size,
                'tag': tag
            }
            if not validate_only and not create_only:
                self.warn_experimental(msg, num_secs=5)
        if self.opts.dns_prefix:
            if tag.find(".") > -1:
                raise exception.ClusterValidationError(
                    "Cannot use --dns-prefix when the cluster tag contains "
                    "a dot.")
            scluster.dns_prefix = tag
        if self.opts.dns_suffix:
            scluster.dns_suffix = tag
        if config_on_master:
            scluster.config_on_master = True
            if self.opts.no_create:
                validate = False
                log.warning("Cannot start a cluster when its config is "
                            "stored on the master node using StarCluster. "
                            "You should start it manually and then use "
                            "the recovery options.")
                return
        try:
            scluster.start(create=create,
                           create_only=create_only,
                           validate=validate,
                           validate_only=validate_only,
                           validate_running=validate_running,
                           save_config_on_master=self.opts.config_on_master)
        except KeyboardInterrupt:
            if validate_only:
                raise
            else:
                raise exception.CancelledStartRequest(tag)
        if validate_only:
            return
        if not create_only and not self.opts.login_master:
            log.info(user_msgs.cluster_started_msg %
                     dict(tag=scluster.cluster_tag),
                     extra=dict(__textwrap__=True, __raw__=True))
        if self.opts.login_master:
            scluster.ssh_to_master()
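
Several options above are validated through a self._positive_int callback that is not shown in this listing. An optparse callback for that purpose looks roughly like the following sketch (an assumption, not StarCluster's actual helper):

from optparse import OptionParser, OptionValueError

def positive_int(option, opt_str, value, parser):
    """optparse callback: accept the value only if it is a positive integer."""
    if value <= 0:
        raise OptionValueError("%s must be a positive integer" % opt_str)
    setattr(parser.values, option.dest, value)

parser = OptionParser()
parser.add_option("-s", "--cluster-size", dest="cluster_size", type="int",
                  action="callback", callback=positive_int)
opts, args = parser.parse_args(["-s", "4"])
print(opts.cluster_size)  # 4
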
Example #17
    def clean_cluster(self, nodes, master, user, user_shell, volumes):
        """
        Run qhost to find nodes that are present in OGS but not in the cluster
        in order to remove them.
        """
        self._master = master
        self._nodes = nodes

        qhost_xml = master.ssh.execute("qhost -xml", source_profile=True)
        qhost_et = ET.fromstringlist(qhost_xml)
        qhosts = []
        for host in qhost_et:
            h_name = host.attrib['name']
            if h_name != 'global':
                qhosts.append(h_name)

        if len(qhosts) == 0:
            log.info("Nothing to clean")

        alive_nodes = [node.alias for node in nodes]

        cleaned = []
        # find dead hosts
        for node_alias in qhosts:
            if node_alias not in alive_nodes:
                cleaned.append(node_alias)

        # find jobs running in dead hosts
        qstats_xml = self._master.ssh.execute("qstat -u \"*\" -xml",
                                              source_profile=True)
        qstats_xml = qstats_xml[1:]  # remove first line
        qstats_et = ET.fromstringlist(qstats_xml)
        to_delete = []
        to_repair = []
        cleaned_queue = []  # not a lambda function to allow pickling
        for c in cleaned:
            cleaned_queue.append("all.q@" + c)
        for job_list in qstats_et.find("queue_info").findall("job_list"):
            if job_list.find("queue_name").text in cleaned_queue:
                job_number = job_list.find("JB_job_number").text
                to_delete.append(job_number)
        for job_list in qstats_et.find("job_info").findall("job_list"):
            if job_list.find("state").text == "Eqw":
                job_number = job_list.find("JB_job_number").text
                to_repair.append(job_number)
        # delete the jobs
        if to_delete:
            log.info("Stopping jobs: " + str(to_delete))
            self._master.ssh.execute("qdel -f " + " ".join(to_delete))
            time.sleep(3)  # otherwise might provoke LOST QRSH if on last job
        if to_repair:
            log.error("Reseting jobs: " + str(to_repair))
            self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                     ignore_exit_status=True)

        # stuck qrsh issue
        ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
        qstat_wc = int(self._master.ssh.execute("qstat -u \"*\" | wc -l")[0])
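        # note: the "ps -ef | grep qrsh" pipeline normally matches itself
        # (the grep process and the shell running it), which accounts for
        # roughly two lines; hence the "> 2" threshold below for spotting
        # leftover qrsh client processes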
        if qstat_wc == 0 and ps_wc > 2:
            log.error("LOST QRSH??")
            log.error("pkill -9 qrsh")
            self._master.ssh.execute("pkill -9 qrsh", ignore_exit_status=True)
        # ----------------------------------

        # delete the host config
        for c in cleaned:
            log.info("Cleaning node " + c)
            if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
                log.warn(c + " is missing from /etc/hosts, creating a dummy "
                         "entry 1.1.1.1")
                rfile = master.ssh.remote_file("/etc/hosts", 'a')
                rfile.write("1.1.1.1 " + c + "\n")
                rfile.close()

            try:
                self._remove_from_sge(DeadNode(c), only_clean_master=True)
            except RemoteCommandFailed:
                log.warning("Failed to remove node {} from sge."
                            .format(c), exc_info=True)

        # fix to allow pickling
        self._master = None
        self._nodes = None