Example #1
    def _connect(self):
        """
        Connects to the ec2 cloud provider
        """
        # check for existing connection
        if self._connection:
            return self._connection

        try:
            log.debug("Connecting to ec2 host %s", self._ec2host)
            region = ec2.regioninfo.RegionInfo(name=self._region_name,
                                               endpoint=self._ec2host)

            # connect to webservice
            self._connection = boto.connect_ec2(
                aws_access_key_id=self._access_key,
                aws_secret_access_key=self._secret_key,
                is_secure=self._secure,
                host=self._ec2host, port=self._ec2port,
                path=self._ec2path, region=region)

            # list images to see if the connection works
            log.debug("Connection has been successful.")
            # images = self._connection.get_all_images()
            # log.debug("%d images found on cloud %s",
            #           len(images), self._ec2host)

        except Exception as e:
            log.error("connection to cloud could not be "
                      "established: message=`%s`", str(e))
            raise

        return self._connection
Example #2
    def _allocate_address(self, instance):
        """Allocates a free public ip address to the given instance

        :param instance: instance to assign address to
        :type instance: :py:class:`boto.ec2.instance.Reservation`

        :return: public ip address
        """
        connection = self._connect()
        free_addresses = [
            ip for ip in connection.get_all_addresses() if not ip.instance_id
        ]
        if not free_addresses:
            try:
                # put the newly allocated address on the free list so the
                # `pop()` below picks it up
                free_addresses.append(connection.allocate_address())
            except Exception as ex:
                log.error(
                    "Unable to allocate a public IP address to instance `%s`",
                    instance.id)
                return None

        try:
            address = free_addresses.pop()
            instance.use_ip(address)
            return address.public_ip
        except Exception as ex:
            log.error("Unable to associate IP address %s to instance `%s`",
                      address, instance.id)
            return None
Example #3
    def execute(self):
        """
        Starts a new cluster.
        """

        cluster_template = self.params.cluster
        if self.params.cluster_name:
            cluster_name = self.params.cluster_name
        else:
            cluster_name = self.params.cluster

        # First, check if the cluster is already created.
        try:
            cluster = Configurator().load_cluster(cluster_name)
        except ClusterNotFound as ex:
            if self.params.cluster_name:
                self.params.extra_conf['name'] = self.params.cluster_name

            try:
                cluster = Configurator().create_cluster(
                    cluster_template, **self.params.extra_conf)
            except ConfigurationError as ex:
                log.error("Starting cluster %s: %s\n",
                          cluster_template, ex)
                return
Example #4
    def _stop_all_nodes(self, wait=False):
        """
        Terminate all cluster nodes. Return number of failures.
        """
        failed = 0
        for node in self.get_all_nodes():
            if not node.instance_id:
                log.warning(
                    "Node `%s` has no instance ID."
                    " Assuming it did not start correctly,"
                    " so removing it anyway from the cluster.", node.name)
                self.nodes[node.kind].remove(node)
                continue
            # try and stop node
            try:
                # stop the node, waiting for termination if requested
                node.stop(wait)

                self.nodes[node.kind].remove(node)
                log.debug(
                    "Removed node `%s` from cluster `%s`", node.name, self.name)
            except InstanceNotFoundError as err:
                log.info(
                    "Node `%s` (instance ID `%s`) was not found;"
                    " assuming it has already been terminated.",
                    node.name, node.instance_id)
            except Exception as err:
                failed += 1
                log.error(
                    "Could not stop node `%s` (instance ID `%s`): %s %s",
                    node.name, node.instance_id, err, err.__class__)
        return failed
Example #5
    def start_instance(self, key_name, public_key_path, private_key_path,
                       security_group, flavor, image_id, image_userdata,
                       username=None):
        """
        Starts an instance in the cloud on the specified cloud
        provider (configuration option) and returns the id of the
        started instance.
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        self._check_keypair(key_name, public_key_path, private_key_path)
        log.debug("Checking security group `%s`.", security_group)
        self._check_security_group(security_group)
        # image_id = self._find_image_id(image_id)

        try:
            reservation = connection.run_instances(
                image_id, key_name=key_name, security_groups=[security_group],
                instance_type=flavor, user_data=image_userdata)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            if "TooManyInstances" in str(ex):
                raise ClusterError(ex)
            else:
                raise InstanceError(ex)
Example #6
    def execute(self):
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
            cluster.update()
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Setting up cluster %s: %s", cluster_name, ex)
            return

        # XXX: the default value of `self.params.ssh_to` should = the
        # default value for `ssh_to` in `Cluster.get_ssh_to_node()`
        frontend = cluster.get_ssh_to_node(self.params.ssh_to)

        host = frontend.connection_ip()
        if not host:
            log.error("No IP address known for node %s", frontend.name)
            sys.exit(1)

        addr, port = parse_ip_address_and_port(host)
        username = frontend.image_user
        knownhostsfile = (cluster.known_hosts_file if cluster.known_hosts_file
                          else '/dev/null')
        sftp_cmdline = [
            "sftp",
            "-P", "{0:d}".format(port),
            "-o", "UserKnownHostsFile={0}".format(knownhostsfile),
            "-o", "StrictHostKeyChecking=yes",
            "-o", "IdentityFile={0}".format(frontend.user_key_private),
        ]
        sftp_cmdline.extend(self.params.sftp_args)
        sftp_cmdline.append('{0}@{1}'.format(username, addr))
        os.execlp("sftp", *sftp_cmdline)
Example #7
def _dereference_config_tree(tree, evict_on_error=True):
    """
    Modify `tree` in-place replacing cross-references by section name with the
    actual section content.

    For example, if a cluster section lists a key/value pair
    ``'login': '******'``, this will be replaced with ``'login': { ... }``.
    """
    to_evict = []
    for cluster_name, cluster_conf in tree['cluster'].iteritems():
        for key in ['cloud', 'login', 'setup']:
            refname = cluster_conf[key]
            if refname in tree[key]:
                # dereference
                cluster_conf[key] = tree[key][refname]
            else:
                log.error(
                    "Configuration section `cluster/%s`"
                    " references non-existing %s section `%s`."
                    " %s",
                    cluster_name, key, refname,
                    ("Dropping cluster definition." if evict_on_error else ""))
                if evict_on_error:
                    to_evict.append(cluster_name)
                    break
    for cluster_name in to_evict:
        del tree['cluster'][cluster_name]
    return tree
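The transformation described in the docstring is easiest to see on a toy configuration tree. A minimal, self-contained sketch of the same dereferencing step (section names and keys below are purely illustrative, not taken from a real configuration file):

toy_tree = {
    'cluster': {'mycluster': {'cloud': 'ec2', 'login': 'ubuntu', 'setup': 'slurm'}},
    'cloud':   {'ec2':    {'provider': 'ec2_boto'}},
    'login':   {'ubuntu': {'image_user': 'ubuntu'}},
    'setup':   {'slurm':  {'provider': 'ansible'}},
}

for cluster_conf in toy_tree['cluster'].values():
    for key in ('cloud', 'login', 'setup'):
        refname = cluster_conf[key]
        # replace the section *name* with the section *content*
        cluster_conf[key] = toy_tree[key][refname]

# toy_tree['cluster']['mycluster']['cloud'] is now {'provider': 'ec2_boto'}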
Example #8
 def sigint_handler(signal, frame):
     """
     Makes sure the cluster state is saved before a SIGINT aborts
     node startup and exits the program.
     """
     log.error("user interruption: saving cluster before exit.")
     self.keep_running = False
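A handler like this only takes effect once it is registered. A minimal, self-contained sketch using the standard-library `signal` module (the start-up loop and the `keep_running` flag below stand in for the surrounding class, which is not shown in the snippet):

import signal
import time

keep_running = True

def sigint_handler(signum, frame):
    # flip the flag; the main loop below notices it and exits cleanly
    global keep_running
    keep_running = False

signal.signal(signal.SIGINT, sigint_handler)

while keep_running:          # stand-in for the node start-up loop
    time.sleep(0.1)
print("interrupted: cluster state would be saved here")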
Example #9
    def _allocate_address(self, instance):
        """Allocates a free public ip address to the given instance

        :param instance: instance to assign address to
        :type instance: :py:class:`boto.ec2.instance.Reservation`

        :return: public ip address
        """
        connection = self._connect()
        free_addresses = [
            ip for ip in connection.get_all_addresses() if not ip.instance_id
        ]
        if not free_addresses:
            try:
                # put the newly allocated address on the free list so the
                # `pop()` below picks it up
                free_addresses.append(connection.allocate_address())
            except Exception as ex:
                log.error("Unable to allocate a public IP address to instance `%s`",
                          instance.id)
                return None

        try:
            address = free_addresses.pop()
            instance.use_ip(address)
            return address.public_ip
        except Exception as ex:
            log.error("Unable to associate IP address %s to instance `%s`",
                      address, instance.id)
            return None
Example #10
    def resume_instance(self, paused_info):
        """Restarts a paused instance, retaining disk and config.

        :param dict paused_info: dict as returned by :py:meth:`pause_instance`
        :raises: `InstanceError` if instance cannot be resumed.

        :return: None
        """

        if not paused_info.get("instance_id"):
            log.info("Instance to stop has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().start(
                project=self._project_id,
                instance=paused_info["instance_id"],
                zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return
        except HttpError as e:
            log.error("Error restarting instance: `%s", e)
            raise InstanceError("Error restarting instance `%s`", e)
Example #11
    def _allocate_address(self, instance):
        """Allocates a free public ip address to the given instance

        :param instance: instance to assign address to
        :type instance: :py:class:`boto.ec2.instance.Reservation`

        :return: public ip address
        """
        connection = self._connect()
        addresses = connection.get_all_addresses()
        for address in addresses:
            # Find an unused address
            if not address.instance_id:
                # Free address, use it.
                instance.use_ip(address)
                log.debug("Assigning ip address `%s` to instance `%s`"
                          % (address.public_ip, instance.id))
                return address.public_ip

        # No allocated addresses available.
        try:
            address = connection.allocate_address()
            instance.use_ip(address)
            return address.public_ip
        except Exception as ex:
            log.error("Unable to allocate a public IP address to instance `%s`",
                      instance.id)
Example #12
    def execute(self):
        """
        Load the cluster and build a GC3Pie configuration snippet.
        """
        log.warning(
            "Command `elasticluster gc3pie-config` is DEPRECATED"
            " and will be removed in release 1.4 of ElastiCluster")
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
            return

        from elasticluster.gc3pie_config import create_gc3pie_config_snippet

        if self.params.append:
            path = os.path.expanduser(self.params.append)
            try:
                fd = open(path, 'a')
                fd.write(create_gc3pie_config_snippet(cluster))
                fd.close()
            except IOError as ex:
                log.error("Unable to write configuration to file %s: %s",
                          path, ex)
        else:
            print(create_gc3pie_config_snippet(cluster))
Example #13
    def pause_instance(self, instance_id):
        """Pauses the instance, retaining disk and config.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be paused

        :return: dict - information needed to restart instance.
        """

        if not instance_id:
            log.info("Instance to pause has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().stop(project=self._project_id,
                                           instance=instance_id,
                                           zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return {"instance_id": instance_id}
        except HttpError as e:
            log.error("Error stopping instance: `%s", e)
            raise InstanceError("Error stopping instance `%s`", e)
Example #14
    def resume_instance(self, paused_info):
        """Restarts a paused instance, retaining disk and config.

        :param dict paused_info: dict as returned by :py:meth:`pause_instance`
        :raises: `InstanceError` if instance cannot be resumed.

        :return: None
        """

        if not paused_info.get("instance_id"):
            log.info("Instance to stop has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().start(project=self._project_id,
                                            instance=paused_info["instance_id"],
                                            zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return
        except HttpError as e:
            log.error("Error restarting instance: `%s", e)
            raise InstanceError("Error restarting instance `%s`", e)
Example #15
    def _decrypt(self, text):
        cryptor = AES.new('1234567890123456', AES.MODE_CBC, b'0000000000000000')
        try:
            plain_text = cryptor.decrypt(a2b_hex(text))
            return plain_text.rstrip('+')
        except TypeError as e:
            log.error("Your username/password does not seem to be encrypted: %s", e)
Example #16
    def execute(self):
        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage)
        storage = configurator.create_cluster_storage()
        cluster_names = storage.get_stored_clusters()

        if not cluster_names:
            print("No clusters found.")
        else:
            print("""
The following clusters have been started.
Please note that there's no guarantee that they are fully configured:
""")
            for name in sorted(cluster_names):
                try:
                    cluster = configurator.load_cluster(name)
                except ConfigurationError as ex:
                    log.error("getting information from cluster `%s`: %s",
                              name, ex)
                    continue
                print("%s " % name)
                print("-" * len(name))
                print("  name:           %s" % cluster.name)
                print("  template:       %s" % cluster.template)
                print("  cloud:          %s " % cluster._cloud)
                for cls in cluster.nodes:
                    print("  - %s nodes: %d" % (cls, len(cluster.nodes[cls])))
                print("")
Example #17
    def execute(self):
        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage)
        storage = configurator.create_cluster_storage()
        cluster_names = storage.get_stored_clusters()

        if not cluster_names:
            print("No clusters found.")
        else:
            print("""
The following clusters have been started.
Please note that there's no guarantee that they are fully configured:
""")
            for name in sorted(cluster_names):
                try:
                    cluster = configurator.load_cluster(name)
                except ConfigurationError as ex:
                    log.error("getting information from cluster `%s`: %s",
                              name, ex)
                    continue
                print("%s " % name)
                print("-" * len(name))
                print("  name:           %s" % cluster.name)
                print("  template:       %s" % cluster.template)
                print("  cloud:          %s " % cluster._cloud)
                for cls in cluster.nodes:
                    print("  - %s nodes: %d" % (cls, len(cluster.nodes[cls])))
                print("")
Example #18
    def execute(self):
        """
        Starts a new cluster.
        """

        cluster_template = self.params.cluster
        if self.params.cluster_name:
            cluster_name = self.params.cluster_name
        else:
            cluster_name = self.params.cluster

        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage,
            include_config_dirs=True)

        # overwrite configuration
        for option, value in self.params.extra_conf.iteritems():
            cconf = configurator.cluster_conf[cluster_template]['cluster']
            if option in cconf:
                cconf[option] = value

        # First, check if the cluster is already created.
        try:
            cluster = configurator.load_cluster(cluster_name)
        except ClusterNotFound as e:
            try:
                cluster = configurator.create_cluster(
                    cluster_template, cluster_name)
            except ConfigurationError as e:
                log.error("Starting cluster %s: %s", cluster_template, e)
                return
Example #19
    def _start_node(node):
        """Static method to start a specific node on a cloud

        :return: bool -- True on success, False otherwise
        """
        log.debug("_start_node: working on node %s" % node.name)
        # TODO: the following check is not optimal yet. When a
        # node is still in a starting state,
        # it will start another node here,
        # since the `is_alive` method will only check for
        # running nodes (see issue #13)
        if node.is_alive():
            log.info("Not starting node %s which is "
                     "already up&running.", node.name)
            return True
        else:
            try:
                node.start()
                log.info("_start_node: node has been started")
                return True
            except KeypairError as e:
                log.error("could not start node `%s`: keypair error `%s`",
                          node.name, e)
                return False
            except Exception as e:
                log.error("could not start node `%s` for reason "
                          "`%s`" % (node.name, e))
                return False
Example #20
def cluster_summary(cluster):
    try:
        frontend = cluster.get_frontend_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: "
                  "%s", str(ex))
Example #21
    def execute(self):

        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        config = creator.cluster_conf

        print("""%d cluster templates found in configuration file.""" %
              len(config))
        templates = config.keys()
        for pattern in self.params.clusters:
            templates = [t for t in templates if fnmatch(t, pattern)]

        if self.params.clusters:
            print("""%d cluter templates found matching pattern(s) '%s'""" %
                  (len(templates), str.join(", ", self.params.clusters)))

        for template in templates:
            try:
                cluster = creator.create_cluster(template, template)
                print("""
name:     %s""" % template)
                for nodekind in cluster.nodes:
                    print("%s nodes: %d" %
                          (nodekind, len(cluster.nodes[nodekind])))
            except ConfigurationError as ex:
                log.error("unable to load cluster `%s`: %s", template, ex)
Example #22
    def pause_instance(self, instance_id):
        """Pauses the instance, retaining disk and config.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be paused

        :return: dict - information needed to restart instance.
        """

        if not instance_id:
            log.info("Instance to pause has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().stop(project=self._project_id,
                                           instance=instance_id,
                                           zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return {"instance_id": instance_id}
        except HttpError as e:
            log.error("Error stopping instance: `%s", e)
            raise InstanceError("Error stopping instance `%s`", e)
Example #23
    def _connect(self):
        """Connects to the ec2 cloud provider

        :return: :py:class:`boto.ec2.connection.EC2Connection`
        :raises: Generic exception on error
        """
        # check for existing connection
        if self._connection:
            return self._connection

        try:
            log.debug("Connecting to ec2 host %s", self._ec2host)
            region = ec2.regioninfo.RegionInfo(name=self._region_name,
                                               endpoint=self._ec2host)

            # connect to webservice
            self._connection = boto.connect_ec2(
                aws_access_key_id=self._access_key,
                aws_secret_access_key=self._secret_key,
                is_secure=self._secure,
                host=self._ec2host, port=self._ec2port,
                path=self._ec2path, region=region)

            # list images to see if the connection works
            log.debug("Connection has been successful.")
            # images = self._connection.get_all_images()
            # log.debug("%d images found on cloud %s",
            #           len(images), self._ec2host)

        except Exception as e:
            log.error("connection to cloud could not be "
                      "established: message=`%s`", str(e))
            raise

        return self._connection
Example #24
 def __init_keystone_session_v2(self, check=False):
     """Create and return a session object using Keystone API v2."""
     from keystoneauth1 import loading as keystone_v2
     loader = keystone_v2.get_plugin_loader('password')
     auth = loader.load_from_options(
         auth_url=self._os_auth_url,
         username=self._os_username,
         password=self._os_password,
         project_name=self._os_tenant_name,
     )
     sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
     if check:
         log.debug("Checking that Keystone API v2 session works...")
         try:
             # if session is invalid, the following will raise some exception
             nova = nova_client.Client(self._compute_api_version, session=sess, cacert=self._os_cacert)
             nova.flavors.list()
         except keystoneauth1.exceptions.NotFound as err:
             log.warning("Creating Keystone v2 session failed: %s", err)
             return None
         except keystoneauth1.exceptions.ClientException as err:
             log.error("OpenStack server rejected request (likely configuration error?): %s", err)
             return None  # FIXME: should we be raising an error instead?
     # if we got to this point, v2 session is valid
     log.info("Using Keystone API v2 session to authenticate to OpenStack")
     return sess
Example #25
        def start_node(node_queue):
            try:
                while not node_queue.empty():
                    if not self.keep_running:
                        log.error("Aborting execution upon CTRL-C")
                        break
                    node = node_queue.get()
                    # TODO: the following check is not optimal yet. When a
                    # node is still in a starting state,
                    # it will start another node here,
                    # since the `is_alive` method will only check for
                    # running nodes (see issue #13)
                    if node.is_alive():
                        log.info("Not starting node %s which is "
                                 "already up&running.", node.name)
                    else:
                        log.info("starting node...")
                        try:
                            node.start()
                        except (InstanceError, SecurityGroupError,
                                KeypairError, ImageError) as e:
                            log.error("could not start node `%s` for reason "
                                      "`%s`" % (node.name, e))

            except Empty:
                # nothing to do if the queue turns out to be empty - the
                # nodes are then already started.
                pass
Example #26
    def stop(self, force=False):
        """Destroys all instances of this cluster and calls delete on the
        repository.

        :param bool force: force termination of instances in any case
        """
        for node in self.get_all_nodes():
            if node.instance_id:
                try:
                    node.stop()
                    self.nodes[node.kind].remove(node)
                    log.debug("Removed node with instance id %s from %s"
                              % (node.instance_id, node.kind))
                except:
                    # Boto does not always raise an `Exception` subclass!
                    log.error("could not stop instance `%s`, it might "
                              "already be down.", node.instance_id)
            else:
                log.debug("Not stopping node with no instance id. It seems "
                          "like node `%s` did not start correctly."
                          % node.name)
                self.nodes[node.kind].remove(node)
        if not self.get_all_nodes():
            log.debug("Removing cluster %s.", self.name)
            self._setup_provider.cleanup(self)
            self.repository.delete(self)
        elif not force:
            log.warning("Not all instances have been terminated. "
                        "Please rerun the `elasticluster stop %s`", self.name)
            self.repository.save_or_update(self)
        else:
            log.warning("Not all instances have been terminated. However, "
                        "as requested, the cluster has been force-removed.")
            self._setup_provider.cleanup(self)
            self.repository.delete(self)
Example #27
 def stop(self, force=False):
     """
     Terminates all instances corresponding to this cluster and
     deletes the cluster storage.
     """
     for node in self.get_all_nodes():
         if node.instance_id:
             try:
                 node.stop()
                 self.nodes[node.type].remove(node)
                 log.debug("Removed node with instance id %s from %s"
                           % (node.instance_id, node.type))
             except:
                 # Boto does not always raise an `Exception` subclass!
                 log.error("could not stop instance `%s`, it might "
                           "already be down.", node.instance_id)
         else:
             log.debug("Not stopping node with no instance id. It seems "
                       "like node `%s` did not start correctly."
                       % node.name)
             self.nodes[node.type].remove(node)
     if not self.get_all_nodes():
         log.debug("Removing cluster %s.", self.name)
         self._setup_provider.cleanup()
         self._storage.delete_cluster(self.name)
     elif not force:
         log.warning("Not all instances have been terminated. "
                     "Please rerun the `elasticluster stop %s`", self.name)
         self._storage.dump_cluster(self)
     else:
         log.warning("Not all instances have been terminated. However, "
                     "as requested, the cluster has been force-removed.")
         self._setup_provider.cleanup()
         self._storage.delete_cluster(self.name)
Example #28
def _validate_and_convert(cfgtree, evict_on_error=True):
    objtree = {}
    for section, model in SCHEMA.iteritems():
        if section not in cfgtree:
            continue
        stanzas = cfgtree[section]
        objtree[section] = {}
        for name, properties in stanzas.iteritems():
            log.debug("Checking section `%s/%s` ...", section, name)
            try:
                objtree[section][name] = Schema(model).validate(properties)
                # further checks for cloud providers
                if section == 'cloud':
                    objtree[section][name] = _validate_cloud_section(objtree[section][name])
                # check node name pattern in clusters conforms to RFC952
                if section == 'cluster':
                    _validate_node_group_names(objtree[section][name])
            except (SchemaError, ValueError) as err:
                log.error("In section `%s/%s`: %s", section, name, err)
                if evict_on_error:
                    log.error(
                        "Dropping configuration section `%s/%s`"
                        " because of the above errors", section, name)
                    # `objtree[section][name]` exists if the except was raised
                    # by the second validation (line 650)
                    if name in objtree[section]:
                        del objtree[section][name]
    return objtree
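The `Schema(model).validate(properties)` call follows the API of the `schema` package. A minimal standalone sketch of the validate-or-evict pattern used above (the model and stanzas below are toy values, not entries from the real `SCHEMA`):

from schema import Schema, SchemaError

model = {'image_user': str, 'flavor': str}
stanzas = {
    'good': {'image_user': 'ubuntu', 'flavor': 'm1.small'},
    'bad':  {'image_user': 'ubuntu', 'flavor': 42},
}

validated = {}
for name, properties in stanzas.items():
    try:
        validated[name] = Schema(model).validate(properties)
    except (SchemaError, ValueError) as err:
        # evict the offending stanza instead of aborting the whole load
        print("dropping section `%s`: %s" % (name, err))

# only the 'good' stanza survives
print(sorted(validated))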
Example #29
def cluster_summary(cluster):
    try:
        frontend = cluster.get_ssh_to_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: %s", ex)
    msg = """
Cluster name:     %s
Cluster template: %s
Default ssh to node: %s
""" % (cluster.name, cluster.template, frontend)

    for cls in cluster.nodes:
        msg += "- %s nodes: %d\n" % (cls, len(cluster.nodes[cls]))
    msg += """
To login on the frontend node, run the command:

    hwcc ssh %s

To upload or download files to the cluster, use the command:

    hwcc sftp %s

To enable the Slurm power-saving options, follow these steps:

    1. Check the config file and make sure the value of global_var_slurm_suspendtime is not -1
    2. Run the script: sh Initslurm.sh
""" % (cluster.name, cluster.name)
    return msg
Example #30
def inspect_node(node):
    """
    This function accepts an `elasticluster.cluster.Node` instance,
    connects to the node, and tries to discover the kind of batch system
    installed, along with some other information.
    """
    node_information = {}
    ssh = node.connect()
    if not ssh:
        log.error("Unable to connect to node %s", node.name)
        return

    (_in, _out, _err) = ssh.exec_command("(type >& /dev/null -a srun && echo slurm) \
                      || (type >& /dev/null -a qconf && echo sge) \
                      || (type >& /dev/null -a pbsnodes && echo pbs) \
                      || echo UNKNOWN")
    node_information['type'] = _out.read().strip()

    (_in, _out, _err) = ssh.exec_command("arch")
    node_information['architecture'] = _out.read().strip()

    if node_information['type'] == 'slurm':
        inspect_slurm_cluster(ssh, node_information)
    elif node_information['type'] == 'sge':
        inspect_sge_cluster(ssh, node_information)
    ssh.close()
    return node_information
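The batch-system probe is just a chain of shell `type` checks. The same detection can be tried locally with the standard library instead of an SSH channel; a sketch, with the command string copied from the function above:

import subprocess

detect_cmd = ("(type >& /dev/null -a srun && echo slurm)"
              " || (type >& /dev/null -a qconf && echo sge)"
              " || (type >& /dev/null -a pbsnodes && echo pbs)"
              " || echo UNKNOWN")
batch_system = subprocess.check_output(['bash', '-c', detect_cmd]).strip()
print(batch_system)  # e.g. 'slurm', 'sge', 'pbs' or 'UNKNOWN'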
Example #31
def inspect_node(node):
    """
    This function accepts an `elasticluster.cluster.Node` instance,
    connects to the node, and tries to discover the kind of batch system
    installed, along with some other information.
    """
    node_information = {}
    ssh = node.connect()
    if not ssh:
        log.error("Unable to connect to node %s", node.name)
        return

    (_in, _out,
     _err) = ssh.exec_command("(type >& /dev/null -a srun && echo slurm) \
                      || (type >& /dev/null -a qconf && echo sge) \
                      || (type >& /dev/null -a pbsnodes && echo pbs) \
                      || echo UNKNOWN")
    node_information['type'] = _out.read().strip()

    (_in, _out, _err) = ssh.exec_command("arch")
    node_information['architecture'] = _out.read().strip()

    if node_information['type'] == 'slurm':
        inspect_slurm_cluster(ssh, node_information)
    elif node_information['type'] == 'sge':
        inspect_sge_cluster(ssh, node_information)
    ssh.close()
    return node_information
Example #32
    def setup(self, extra_args=tuple()):
        """
        Configure the cluster nodes.

        Actual action is delegated to the
        :py:class:`elasticluster.providers.AbstractSetupProvider` that
        was provided at construction time.

        :param list extra_args:
          List of additional command-line arguments
          that are appended to each invocation of the setup program.

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self, extra_args)
        except Exception as err:
            log.error(
                "The cluster hosts are up and running,"
                " but %s failed to set the cluster up: %s",
                self._setup_provider.HUMAN_READABLE_NAME, err)
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name, self.name)

        return ret
Example #33
    def execute(self):
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
            cluster.update()
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Setting up cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        if self.params.ssh_to:
            try:
                nodes = dict((n.name,n) for n in cluster.get_all_nodes())
                frontend = nodes[self.params.ssh_to]
            except KeyError:
                raise ValueError(
                    "Hostname %s not found in cluster %s" % (self.params.ssh_to, cluster_name))
        else:
            frontend = cluster.get_frontend_node()
        host = frontend.connection_ip()
        username = frontend.image_user
        knownhostsfile = cluster.known_hosts_file if cluster.known_hosts_file \
                         else '/dev/null'
        sftp_cmdline = ["sftp",
                        "-o", "UserKnownHostsFile=%s" % knownhostsfile,
                        "-o", "StrictHostKeyChecking=yes",
                        "-o", "IdentityFile=%s" % frontend.user_key_private]
        sftp_cmdline.extend(self.params.sftp_args)
        sftp_cmdline.append('%s@%s' % (username, host))
        os.execlp("sftp", *sftp_cmdline)
Example #34
    def execute(self):
        """
        Lists all nodes within the specified cluster with certain
        information like id and ip.
        """
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
            if self.params.update:
                cluster.update()
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
            return

        if self.params.pretty_json:
            print(json.dumps(cluster, default=dict, indent=4))
        elif self.params.json:
            print(json.dumps(cluster, default=dict))
        else:
            print(cluster_summary(cluster))
            for cls in cluster.nodes:
                print("%s nodes:" % cls)
                print("")
                for node in cluster.nodes[cls]:
                    txt = ["    " + i for i in node.pprint().splitlines()]
                    print('  - ' + "\n".join(txt)[4:])
                    print("")
Example #35
    def _stop_all_nodes(self, wait=False):
        """
        Terminate all cluster nodes. Return number of failures.
        """
        failed = 0
        for node in self.get_all_nodes():
            if not node.instance_id:
                log.warning(
                    "Node `%s` has no instance ID."
                    " Assuming it did not start correctly,"
                    " so removing it anyway from the cluster.", node.name)
                self.nodes[node.kind].remove(node)
                continue
            # try and stop node
            try:
                # stop the node, waiting for termination if requested
                node.stop(wait)

                self.nodes[node.kind].remove(node)
                log.debug(
                    "Removed node `%s` from cluster `%s`", node.name, self.name)
            except InstanceNotFoundError as err:
                log.info(
                    "Node `%s` (instance ID `%s`) was not found;"
                    " assuming it has already been terminated.",
                    node.name, node.instance_id)
            except Exception as err:
                failed += 1
                log.error(
                    "Could not stop node `%s` (instance ID `%s`): %s %s",
                    node.name, node.instance_id, err, err.__class__)
        return failed
Example #36
    def execute(self):
        """
        Stops the cluster if it's running.
        """
        cluster_name = self.params.cluster
        configurator = get_configurator(self.params.config,
                                        storage_path=self.params.storage,
                                        include_config_dirs=True)
        try:
            cluster = configurator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Stopping cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        if not self.params.yes:
            # Ask for confirmation
            yesno = raw_input(
                "Do you want really want to stop "
                "cluster %s? [yN] " % cluster_name)
            if yesno.lower() not in ['yes', 'y']:
                print("Aborting as per user request.")
                sys.exit(0)
        print("Destroying cluster `%s`" % cluster_name)
        cluster.stop(force=self.params.force)
Example #37
 def __init_keystone_session_v2(self, check=False):
     """Create and return a session object using Keystone API v2."""
     from keystoneauth1 import loading as keystone_v2
     loader = keystone_v2.get_plugin_loader('password')
     auth = loader.load_from_options(
         auth_url=self._os_auth_url,
         username=self._os_username,
         password=self._os_password,
         project_name=self._os_tenant_name,
     )
     sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
     if check:
         log.debug("Checking that Keystone API v2 session works...")
         try:
             # if session is invalid, the following will raise some exception
             nova = nova_client.Client(self._compute_api_version,
                                       session=sess,
                                       cacert=self._os_cacert)
             nova.flavors.list()
         except keystoneauth1.exceptions.NotFound as err:
             log.warning("Creating Keystone v2 session failed: %s", err)
             return None
         except keystoneauth1.exceptions.ClientException as err:
             log.error(
                 "OpenStack server rejected request (likely configuration error?): %s",
                 err)
             return None  # FIXME: should we be raising an error instead?
     # if we got to this point, v2 session is valid
     log.info("Using Keystone API v2 session to authenticate to OpenStack")
     return sess
Example #38
    def execute(self):
        """
        Starts a new cluster.
        """

        cluster_template = self.params.cluster
        if self.params.cluster_name:
            cluster_name = self.params.cluster_name
        else:
            cluster_name = self.params.cluster

        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage)

        # overwrite configuration
        for option, value in self.params.extra_conf.iteritems():
            cconf = configurator.cluster_conf[cluster_template]['cluster']
            if option in cconf:
                cconf[option] = value

        # First, check if the cluster is already created.
        try:
            cluster = configurator.load_cluster(cluster_name)
        except ClusterNotFound as e:
            try:
                cluster = configurator.create_cluster(
                    cluster_template, cluster_name)
            except ConfigurationError as e:
                log.error("Starting cluster %s: %s", cluster_template, e)
                return
Example #39
def cluster_summary(cluster):
    try:
        frontend = cluster.get_frontend_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: "
                  "%s", str(ex))
Example #40
    def setup(self):
        """Configure the cluster nodes with the specified  This
        is delegated to the provided :py:class:`elasticluster.providers.AbstractSetupProvider`

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self)
        except Exception as e:
            log.error(
                "the setup provider was not able to setup the cluster, "
                "but the cluster is running by now. Setup provider error "
                "message: `%s`",
                str(e),
            )
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name,
                self.name,
            )

        return ret
Example #41
def cluster_summary(cluster):
    try:
        frontend = cluster.get_frontend_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: "
                  "%s", str(ex))
    msg = """
Cluster name:     %s
Cluster template: %s
Default ssh to node: %s
""" % (cluster.name, cluster.template, frontend)

    for cls in cluster.nodes:
        msg += "- %s nodes: %d\n" % (cls, len(cluster.nodes[cls]))
    msg += """
To login on the frontend node, run the command:

    elasticluster ssh %s

To upload or download files to the cluster, use the command:

    elasticluster sftp %s
""" % (cluster.name, cluster.name)
    return msg
Example #42
    def execute(self):
        """
        Load the cluster and build a GC3Pie configuration snippet.
        """
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        from elasticluster.gc3pie_config import create_gc3pie_config_snippet

        if self.params.append:
            path = os.path.expanduser(self.params.append)
            try:
                fd = open(path, 'a')
                fd.write(create_gc3pie_config_snippet(cluster))
                fd.close()
            except IOError as ex:
                log.error("Unable to write configuration to file %s: %s",
                          path, ex)
        else:
            print(create_gc3pie_config_snippet(cluster))
Example #43
    def execute(self):
        """
        Lists all nodes within the specified cluster with certain
        information like id and ip.
        """
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
            if self.params.update:
                cluster.update()
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        if self.params.pretty_json:
            print(json.dumps(cluster, default=dict, indent=4))
        elif self.params.json:
            print(json.dumps(cluster, default=dict))
        else:
            print(cluster_summary(cluster))
            for cls in cluster.nodes:
                print("%s nodes:" % cls)
                print("")
                for node in cluster.nodes[cls]:
                    txt = ["    " + i for i in node.pprint().splitlines()]
                    print('  - ' + str.join("\n", txt)[4:])
                    print("")
Example #44
    def execute(self):
        """
        Load the cluster and build a GC3Pie configuration snippet.
        """
        log.warning("Command `elasticluster gc3pie-config` is DEPRECATED"
                    " and will be removed in release 1.4 of ElastiCluster")
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
            return

        from elasticluster.gc3pie_config import create_gc3pie_config_snippet

        if self.params.append:
            path = os.path.expanduser(self.params.append)
            try:
                fd = open(path, 'a')
                fd.write(create_gc3pie_config_snippet(cluster))
                fd.close()
            except IOError as ex:
                log.error("Unable to write configuration to file %s: %s", path,
                          ex)
        else:
            print(create_gc3pie_config_snippet(cluster))
Example #45
def _validate_and_convert(cfgtree, evict_on_error=True):
    objtree = {}
    for section, model in SCHEMA.iteritems():
        if section not in cfgtree:
            continue
        stanzas = cfgtree[section]
        objtree[section] = {}
        for name, properties in stanzas.iteritems():
            log.debug("Checking section `%s/%s` ...", section, name)
            try:
                objtree[section][name] = Schema(model).validate(properties)
                # further checks for cloud providers
                if section == 'cloud':
                    objtree[section][name] = _validate_cloud_section(
                        objtree[section][name])
                # check node name pattern in clusters conforms to RFC952
                if section == 'cluster':
                    _validate_node_group_names(objtree[section][name])
            except (SchemaError, ValueError) as err:
                log.error("In section `%s/%s`: %s", section, name, err)
                if evict_on_error:
                    log.error(
                        "Dropping configuration section `%s/%s`"
                        " because of the above errors", section, name)
                    # `objtree[section][name]` exists if the except was raised
                    # by the second validation (line 650)
                    if name in objtree[section]:
                        del objtree[section][name]
    return objtree
Example #46
    def setup(self, extra_args=tuple()):
        """
        Configure the cluster nodes.

        Actual action is delegated to the
        :py:class:`elasticluster.providers.AbstractSetupProvider` that
        was provided at construction time.

        :param list extra_args:
          List of additional command-line arguments
          that are appended to each invocation of the setup program.

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self, extra_args)
        except Exception as err:
            log.error(
                "The cluster hosts are up and running,"
                " but %s failed to set the cluster up: %s",
                self._setup_provider.HUMAN_READABLE_NAME, err)
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name, self.name)

        return ret
Example #47
    def execute(self):

        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        config = creator.cluster_conf

        print("""%d cluster templates found in configuration file.""" % len(config))
        templates = config.keys()
        for pattern in self.params.clusters:
            templates = [t for t in templates if fnmatch(t, pattern)]

        if self.params.clusters:
            print("""%d cluter templates found matching pattern(s) '%s'""" % (len(templates), str.join(", ", self.params.clusters)))

        for template in templates:
            try:
                cluster = creator.create_cluster(template, template)
                print("""
name:     %s""" % template)
                for nodekind in cluster.nodes:
                    print("%s nodes: %d" % (
                        nodekind,
                        len(cluster.nodes[nodekind])))
            except ConfigurationError as ex:
                log.error("unable to load cluster `%s`: %s", template, ex)
Example #48
 def sigint_handler(signal, frame):
     """
     Makes sure the cluster state is saved before a SIGINT aborts
     node startup and exits the program.
     """
     log.error("user interruption: saving cluster before exit.")
     self.keep_running = False
Example #49
def cluster_summary(cluster):
    try:
        frontend = cluster.get_frontend_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: "
                  "%s", str(ex))
    msg = """
Cluster name:     %s
Cluster template: %s
Default ssh to node: %s
""" % (cluster.name, cluster.template, frontend)

    for cls in cluster.nodes:
        msg += "- %s nodes: %d\n" % (cls, len(cluster.nodes[cls]))
    msg += """
To login on the frontend node, run the command:

    elasticluster ssh %s

To upload or download files to the cluster, use the command:

    elasticluster sftp %s
""" % (cluster.name, cluster.name)
    return msg
Example #50
    def execute(self):
        """
        Load the cluster and build a GC3Pie configuration snippet.
        """
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        from elasticluster.gc3pie_config import create_gc3pie_config_snippet

        if self.params.append:
            path = os.path.expanduser(self.params.append)
            try:
                fd = open(path, 'a')
                fd.write(create_gc3pie_config_snippet(cluster))
                fd.close()
            except IOError as ex:
                log.error("Unable to write configuration to file %s: %s",
                          path, ex)
        else:
            print(create_gc3pie_config_snippet(cluster))
Example #51
    def execute(self):
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
            cluster.update()
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Setting up cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        if self.params.ssh_to:
            try:
                nodes = dict((n.name,n) for n in cluster.get_all_nodes())
                frontend = nodes[self.params.ssh_to]
            except KeyError:
                raise ValueError(
                    "Hostname %s not found in cluster %s" % (self.params.ssh_to, cluster_name))
        else:
            frontend = cluster.get_frontend_node()
        host = frontend.connection_ip()
        username = frontend.image_user
        knownhostsfile = cluster.known_hosts_file if cluster.known_hosts_file \
                         else '/dev/null'
        sftp_cmdline = ["sftp",
                        "-o", "UserKnownHostsFile=%s" % knownhostsfile,
                        "-o", "StrictHostKeyChecking=yes",
                        "-o", "IdentityFile=%s" % frontend.user_key_private]
        sftp_cmdline.extend(self.params.sftp_args)
        sftp_cmdline.append('%s@%s' % (username, host))
        os.execlp("sftp", *sftp_cmdline)
Example #52
    def execute(self):
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
            cluster.update()
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Setting up cluster %s: %s\n" %
                      (cluster_name, ex))
            return

        if self.params.ssh_to:
            try:
                nodes = dict((n.name,n) for n in cluster.get_all_nodes())
                frontend = nodes[self.params.ssh_to]
            except KeyError:
                raise ValueError(
                    "Hostname %s not found in cluster %s" % (self.params.ssh_to, cluster_name))
        else:
            frontend = cluster.get_frontend_node()
        try:
            # ensure we can connect to the host
            if not frontend.preferred_ip:
                # Ensure we can connect to the node, and save the value of `preferred_ip`

                ssh = frontend.connect(keyfile=cluster.known_hosts_file)
                if ssh:
                    ssh.close()
                cluster.repository.save_or_update(cluster)

        except NodeNotFound as ex:
            log.error("Unable to connect to the frontend node: %s" % str(ex))
            sys.exit(1)
        host = frontend.connection_ip()

        # check for nonstandard port, either IPv4 or IPv6
        addr = host
        port = str(SSH_PORT)
        if ':' in host:
            match = IPV6_RE.match(host)
            if match:
                addr = match.groups()[0]
                port = match.groups()[1]
            else:
                addr, _, port = host.partition(':')

        username = frontend.image_user
        knownhostsfile = cluster.known_hosts_file if cluster.known_hosts_file \
                         else '/dev/null'
        ssh_cmdline = ["ssh",
                       "-i", frontend.user_key_private,
                       "-o", "UserKnownHostsFile=%s" % knownhostsfile,
                       "-o", "StrictHostKeyChecking=yes",
                       "-p", port,
                       '%s@%s' % (username, addr)]
        ssh_cmdline.extend(self.params.ssh_args)
        log.debug("Running command `%s`" % str.join(' ', ssh_cmdline))
        os.execlp("ssh", *ssh_cmdline)
Example #53
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       username=None,
                       **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :return: str - instance id of the started instance
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        # the `_check_keypair` method has to be called within a lock,
        # since it will upload the key if it does not exist and if this
        # happens for every node at the same time ec2 will throw an error
        # message (see issue #79)
        with BotoCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)

        log.debug("Checking security group `%s`.", security_group)
        self._check_security_group(security_group)
        # image_id = self._find_image_id(image_id)

        try:
            reservation = connection.run_instances(
                image_id,
                key_name=key_name,
                security_groups=[security_group],
                instance_type=flavor,
                user_data=image_userdata)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            if "TooManyInstances" in str(ex):
                raise ClusterError(ex)
            else:
                raise InstanceError(ex)
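
The example is cut off after the `try`/`except` above; the docstring promises the instance id as return value, so the missing tail presumably looks roughly like this sketch (`self._instances` as a per-provider cache is an assumption, not the project's verbatim code):

        # sketch of the assumed tail of `start_instance`:
        vm = reservation.instances[-1]
        self._instances[vm.id] = vm
        return vm.id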
Example #55
0
def sigint_handler(signal, frame):
    """
    Make sure the cluster is saved before a SIGINT makes us exit
    during node startup.
    """
    global keep_running   # assumption: module-level flag polled by the startup loop
    log.error("Interrupted: will save cluster state and exit"
              " after all nodes have started.")
    keep_running = False
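
For context, a handler like this is typically installed with `signal.signal` around the node-startup phase; a minimal sketch, assuming the module-level `keep_running` flag referenced above:

import signal

keep_running = True   # cleared by sigint_handler; the startup loop checks it
signal.signal(signal.SIGINT, sigint_handler)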
Example #56
0
def _cross_validate_final_config(objtree, evict_on_error=True):
    """
    Run validation checks that require correlating values from different sections.
    """
    # take a copy of cluster config as we might be modifying it
    for name, cluster in list(objtree['cluster'].items()):
        valid = True
        # ensure all cluster node kinds are defined in the `setup/*` section
        setup_sect = cluster['setup']
        for groupname, properties in cluster['nodes'].items():
            if (groupname + '_groups') not in setup_sect:
                log.error(
                    "Cluster `%s` requires nodes of kind `%s`,"
                    " but no such group is defined"
                    " in the referenced setup section.", name, groupname)
                valid = False
                break

        # ensure `ssh_to` has a valid value
        if 'ssh_to' in cluster:
            ssh_to = cluster['ssh_to']
            try:
                # extract node kind if this is a node name (e.g., `master001` => `master`)
                parts = NodeNamingPolicy.parse(ssh_to)
                ssh_to = parts['kind']
            except ValueError:
                pass
            if ssh_to not in cluster['nodes']:
                log.error(
                    "Cluster `%s` is configured to SSH into nodes of kind `%s`,"
                    " but no such kind is defined.", name, ssh_to)
                valid = False

        # EC2-specific checks
        if cluster['cloud']['provider'] == 'ec2_boto':
            cluster_uses_vpc = ('vpc' in cluster['cloud'])
            for groupname, properties in cluster['nodes'].items():
                if cluster_uses_vpc and 'network_ids' not in properties:
                    log.error(
                        "Node group `%s/%s` is being used in a VPC,"
                        " so it must specify ``network_ids``.",
                        name, groupname)
                    if evict_on_error:
                        valid = False
                        break
                if not cluster_uses_vpc and 'network_ids' in properties:
                    log.error(
                        "Cluster `%s` must specify a VPC"
                        " to place `%s` instances in network `%s`",
                        name, groupname, properties['network_ids'])
                    if evict_on_error:
                        valid = False
                        break
        if not valid:
            log.error("Dropping cluster `%s` because of the above errors",
                      name)
            del objtree['cluster'][name]
    return objtree
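
A minimal illustration of the configuration tree this function validates (all names below are hypothetical):

objtree = {
    'cluster': {
        'mycluster': {
            'cloud': {'provider': 'ec2_boto'},
            'setup': {'master_groups': ['slurm_master'],
                      'worker_groups': ['slurm_worker']},
            'nodes': {'master': {'num': 1},
                      'worker': {'num': 4}},
            'ssh_to': 'master',
        },
    },
}
# _cross_validate_final_config(objtree) keeps `mycluster`; removing
# `master_groups` from the setup section would instead evict it with an error.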
Example #57
0
    def execute(self):
        """
        Starts a new cluster.
        """

        cluster_template = self.params.cluster
        if self.params.cluster_name:
            cluster_name = self.params.cluster_name
        else:
            cluster_name = self.params.cluster

        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)

        # overwrite configuration
        cluster_nodes_conf = creator.cluster_conf[cluster_template]['nodes']
        for kind, num in self.params.nodes_override.items():
            if kind not in cluster_nodes_conf:
                raise ConfigurationError(
                    "No node group `{kind}` defined"
                    " in cluster template `{template}`"
                    .format(kind=kind, template=cluster_template))
            cluster_nodes_conf[kind]['num'] = num

        # First, check if the cluster is already created.
        try:
            cluster = creator.load_cluster(cluster_name)
        except ClusterNotFound:
            try:
                cluster = creator.create_cluster(
                    cluster_template, cluster_name)
            except ConfigurationError as err:
                log.error("Starting cluster %s: %s", cluster_template, err)
                return

        try:
            print("Starting cluster `{0}` with:".format(cluster.name))
            for cls in cluster.nodes:
                print("* {0:d} {1} nodes.".format(len(cluster.nodes[cls]), cls))
            print("(This may take a while...)")
            min_nodes = dict((kind, cluster_nodes_conf[kind]['min_num'])
                             for kind in cluster_nodes_conf)
            cluster.start(min_nodes=min_nodes)
            if self.params.no_setup:
                print("NOT configuring the cluster as requested.")
            else:
                print("Configuring the cluster.")
                print("(this too may take a while...)")
                ret = cluster.setup()
                if ret:
                    print("Your cluster is ready!")
                else:
                    print("\nWARNING: YOUR CLUSTER IS NOT READY YET!")
            print(cluster_summary(cluster))
        except (KeyError, ImageError, SecurityGroupError, ClusterError) as err:
            log.error("Could not start cluster `%s`: %s", cluster.name, err)
            raise
Example #58
0
    def execute(self):
        """
        Starts a new cluster.
        """

        cluster_template = self.params.cluster
        if self.params.cluster_name:
            cluster_name = self.params.cluster_name
        else:
            cluster_name = self.params.cluster

        configurator = get_configurator(self.params.config,
                                        storage_path=self.params.storage,
                                        include_config_dirs=True)

        # overwrite configuration
        for option, value in self.params.extra_conf.items():
            cconf = configurator.cluster_conf[cluster_template]['cluster']
            if option in cconf:
                cconf[option] = value

        # First, check if the cluster is already created.
        try:
            cluster = configurator.load_cluster(cluster_name)
        except ClusterNotFound as e:
            try:
                cluster = configurator.create_cluster(cluster_template,
                                                      cluster_name)
            except ConfigurationError as e:
                log.error("Starting cluster %s: %s\n" % (cluster_template, e))
                return

        try:

            for cls in cluster.nodes:
                print("Starting cluster `%s` with %d %s nodes." %
                      (cluster.name, len(cluster.nodes[cls]), cls))
            print("(this may take a while...)")
            conf = configurator.cluster_conf[cluster_template]
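            # `k[:-10]` strips the 10-character suffix `_nodes_min`, so a key
            # like `compute_nodes_min = 2` becomes `min_nodes['compute'] = 2`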
            min_nodes = dict((k[:-10], int(v))
                             for k, v in conf['cluster'].items()
                             if k.endswith('_nodes_min'))
            cluster.start(min_nodes=min_nodes)
            if self.params.no_setup:
                print("NOT configuring the cluster as requested.")
            else:
                print("Configuring the cluster.")
                print("(this too may take a while...)")
                ret = cluster.setup()
                if ret:
                    print("Your cluster is ready!")
                else:
                    print("\nWARNING: YOUR CLUSTER IS NOT READY YET!")
            print(cluster_summary(cluster))
        except (KeyError, ImageError, SecurityGroupError, ClusterError) as ex:
            print("Your cluster could not start `%s`" % ex)
            raise
Example #59
0
    def connect(self, keyfile=None):
        """Connect to the node via ssh using the paramiko library.

        :return: :py:class:`paramiko.SSHClient` - ssh connection or None on
                 failure
        """
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        if keyfile and os.path.exists(keyfile):
            ssh.load_host_keys(keyfile)

        # Try connecting using the `preferred_ip`, if
        # present. Otherwise, try all of them and set `preferred_ip`
        # using the first that is working.
        ips = self.ips[:]
        # "Sort" the IPs so that `preferred_ip` is tried first.
        if self.preferred_ip:
            if self.preferred_ip in ips:
                ips.remove(self.preferred_ip)
            else:
                # has the preferred IP changed?
                log.debug("IP %s does not seem to belong to %s anymore."
                          " Ignoring!", self.preferred_ip, self.name)
                self.preferred_ip = ips[0]

        for ip in itertools.chain([self.preferred_ip], ips):
            if not ip:
                continue
            try:
                log.debug("Trying to connect to host %s (%s)",
                          self.name, ip)
                addr, port = parse_ip_address_and_port(ip, SSH_PORT)
                ssh.connect(str(addr),
                            username=self.image_user,
                            allow_agent=True,
                            key_filename=self.user_key_private,
                            timeout=Node.connection_timeout,
                            port=port)
                log.debug("Connection to %s succeeded on port %d!", ip, port)
                if ip != self.preferred_ip:
                    log.debug("Setting `preferred_ip` to %s", ip)
                    self.preferred_ip = ip
                    cluster_changed = True
                # Connection successful.
                return ssh
            except socket.error as ex:
                log.debug("Host %s (%s) not reachable: %s.",
                          self.name, ip, ex)
            except paramiko.BadHostKeyException as ex:
                log.error("Invalid host key: host %s (%s); check keyfile: %s",
                          self.name, ip, keyfile)
            except paramiko.SSHException as ex:
                log.debug("Ignoring error %s connecting to %s",
                          str(ex), self.name)

        return None
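
A hypothetical caller (the `node` and `cluster` names are assumptions) could use the returned paramiko client like this:

ssh = node.connect(keyfile=cluster.known_hosts_file)
if ssh is not None:
    stdin, stdout, stderr = ssh.exec_command('hostname')
    print(stdout.read())
    ssh.close()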