def _connect(self):
    """
    Connects to the ec2 cloud provider.
    """
    # check for existing connection
    if self._connection:
        return self._connection

    try:
        log.debug("Connecting to ec2 host %s", self._ec2host)
        region = ec2.regioninfo.RegionInfo(
            name=self._region_name, endpoint=self._ec2host)

        # connect to webservice
        self._connection = boto.connect_ec2(
            aws_access_key_id=self._access_key,
            aws_secret_access_key=self._secret_key,
            is_secure=self._secure,
            host=self._ec2host,
            port=self._ec2port,
            path=self._ec2path,
            region=region)
        log.debug("Connection has been successful.")
        # list images to see if the connection works
        # images = self._connection.get_all_images()
        # log.debug("%d images found on cloud %s",
        #           len(images), self._ec2host)
    except Exception as e:
        log.error("connection to cloud could not be"
                  " established: message=`%s`", str(e))
        raise

    return self._connection
def _allocate_address(self, instance):
    """Allocates a free public ip address to the given instance.

    :param instance: instance to assign address to
    :type instance: :py:class:`boto.ec2.instance.Reservation`
    :return: public ip address
    """
    connection = self._connect()
    free_addresses = [ip for ip in connection.get_all_addresses()
                      if not ip.instance_id]
    if not free_addresses:
        try:
            # no unused address available: allocate a new one,
            # so the `pop()` below has something to use
            free_addresses.append(connection.allocate_address())
        except Exception:
            log.error("Unable to allocate a public IP address"
                      " to instance `%s`", instance.id)
            return None

    try:
        address = free_addresses.pop()
        instance.use_ip(address)
        return address.public_ip
    except Exception:
        log.error("Unable to associate IP address %s to instance `%s`",
                  address, instance.id)
        return None
def execute(self):
    """
    Starts a new cluster.
    """
    cluster_template = self.params.cluster
    if self.params.cluster_name:
        cluster_name = self.params.cluster_name
    else:
        cluster_name = self.params.cluster

    # First, check if the cluster is already created.
    try:
        cluster = Configurator().load_cluster(cluster_name)
    except ClusterNotFound:
        if self.params.cluster_name:
            self.params.extra_conf['name'] = self.params.cluster_name
        try:
            cluster = Configurator().create_cluster(
                cluster_template, **self.params.extra_conf)
        except ConfigurationError as ex:
            log.error("Starting cluster %s: %s", cluster_template, ex)
            return
def _stop_all_nodes(self, wait=False):
    """
    Terminate all cluster nodes. Return number of failures.
    """
    failed = 0
    for node in self.get_all_nodes():
        if not node.instance_id:
            log.warning(
                "Node `%s` has no instance ID."
                " Assuming it did not start correctly,"
                " so removing it anyway from the cluster.", node.name)
            self.nodes[node.kind].remove(node)
            continue
        # try to stop the node; if `wait` is true, block until the
        # instance has actually been terminated before removing it
        try:
            node.stop(wait)
            self.nodes[node.kind].remove(node)
            log.debug("Removed node `%s` from cluster `%s`",
                      node.name, self.name)
        except InstanceNotFoundError:
            log.info(
                "Node `%s` (instance ID `%s`) was not found;"
                " assuming it has already been terminated.",
                node.name, node.instance_id)
        except Exception as err:
            failed += 1
            log.error(
                "Could not stop node `%s` (instance ID `%s`): %s %s",
                node.name, node.instance_id, err, err.__class__)
    return failed
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None):
    """
    Starts an instance in the cloud on the specified cloud provider
    (configuration option) and returns the id of the started instance.
    """
    connection = self._connect()

    log.debug("Checking keypair `%s`.", key_name)
    self._check_keypair(key_name, public_key_path, private_key_path)
    log.debug("Checking security group `%s`.", security_group)
    self._check_security_group(security_group)
    # image_id = self._find_image_id(image_id)

    try:
        reservation = connection.run_instances(
            image_id, key_name=key_name,
            security_groups=[security_group],
            instance_type=flavor,
            user_data=image_userdata)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)
def execute(self):
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
        cluster.update()
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Setting up cluster %s: %s", cluster_name, ex)
        return

    # XXX: the default value of `self.params.ssh_to` should = the
    # default value for `ssh_to` in `Cluster.get_ssh_to_node()`
    frontend = cluster.get_ssh_to_node(self.params.ssh_to)

    host = frontend.connection_ip()
    if not host:
        log.error("No IP address known for node %s", frontend.name)
        sys.exit(1)

    addr, port = parse_ip_address_and_port(host)
    username = frontend.image_user
    knownhostsfile = (cluster.known_hosts_file
                      if cluster.known_hosts_file else '/dev/null')

    sftp_cmdline = [
        "sftp",
        "-P", "{0:d}".format(port),
        "-o", "UserKnownHostsFile={0}".format(knownhostsfile),
        "-o", "StrictHostKeyChecking=yes",
        "-o", "IdentityFile={0}".format(frontend.user_key_private),
    ]
    sftp_cmdline.extend(self.params.sftp_args)
    sftp_cmdline.append('{0}@{1}'.format(username, addr))
    os.execlp("sftp", *sftp_cmdline)
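# The `parse_ip_address_and_port` helper used above (and in the `ssh`
# command and `connect` method further below) splits a host string into
# address and port, handling both IPv4 and bracketed-IPv6 notation.
# A minimal sketch, assuming a default SSH port of 22; the actual helper
# in the codebase may differ in validation and return types:
import re

_IPV6_WITH_PORT = re.compile(r'^\[(?P<addr>[0-9a-fA-F:]+)\]:(?P<port>\d+)$')

def parse_ip_address_and_port(host, default_port=22):
    match = _IPV6_WITH_PORT.match(host)
    if match:
        # bracketed IPv6 with explicit port, e.g. `[fe80::1]:2222`
        return match.group('addr'), int(match.group('port'))
    if host.count(':') == 1:
        # IPv4 address or hostname with explicit port, e.g. `192.0.2.1:2222`
        addr, _, port = host.partition(':')
        return addr, int(port)
    # bare IPv4 hostname/address, or bare IPv6 address
    return host, default_port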
def _dereference_config_tree(tree, evict_on_error=True):
    """
    Modify `tree` in-place, replacing cross-references by section name
    with the actual section content.

    For example, if a cluster section lists a key/value pair
    ``'login': '******'``, this will be replaced with
    ``'login': { ... }``.
    """
    to_evict = []
    for cluster_name, cluster_conf in tree['cluster'].iteritems():
        for key in ['cloud', 'login', 'setup']:
            refname = cluster_conf[key]
            if refname in tree[key]:
                # dereference
                cluster_conf[key] = tree[key][refname]
            else:
                log.error(
                    "Configuration section `cluster/%s`"
                    " references non-existing %s section `%s`."
                    " %s",
                    cluster_name, key, refname,
                    ("Dropping cluster definition."
                     if evict_on_error else ""))
                if evict_on_error:
                    to_evict.append(cluster_name)
                    break
    for cluster_name in to_evict:
        del tree['cluster'][cluster_name]
    return tree
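# For illustration, a minimal, hypothetical config tree exercising
# `_dereference_config_tree` above; all section names and keys here are
# invented for this example:
def _example_dereference():
    tree = {
        'cluster': {'mycluster': {'cloud': 'ec2', 'login': 'ubuntu',
                                  'setup': 'slurm'}},
        'cloud': {'ec2': {'provider': 'ec2_boto'}},
        'login': {'ubuntu': {'image_user': 'ubuntu'}},
        'setup': {'slurm': {'provider': 'ansible'}},
    }
    _dereference_config_tree(tree)
    # the string 'ec2' has been replaced by the referenced section:
    assert tree['cluster']['mycluster']['cloud'] == {'provider': 'ec2_boto'}
    # a cluster referencing a missing section would instead have been
    # evicted from `tree['cluster']`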
def sigint_handler(signal, frame):
    """
    Make sure the cluster is saved before the SIGINT results in
    exiting during node startup.
    """
    log.error("user interruption: saving cluster before exit.")
    self.keep_running = False
def resume_instance(self, paused_info):
    """Restarts a paused instance, retaining disk and config.

    :param dict paused_info: dict as returned by `pause_instance`,
        with an `instance_id` key identifying the instance to restart
    :raises: `InstanceError` if instance cannot be resumed.
    """
    if not paused_info.get("instance_id"):
        log.info("Instance to resume has no instance id.")
        return
    gce = self._connect()
    try:
        request = gce.instances().start(
            project=self._project_id,
            instance=paused_info["instance_id"],
            zone=self._zone)
        operation = self._execute_request(request)
        response = self._wait_until_done(operation)
        self._check_response(response)
        return
    except HttpError as e:
        log.error("Error restarting instance: `%s`", e)
        raise InstanceError("Error restarting instance: `%s`" % e)
def _allocate_address(self, instance):
    """Allocates a free public ip address to the given instance.

    :param instance: instance to assign address to
    :type instance: :py:class:`boto.ec2.instance.Reservation`
    :return: public ip address
    """
    connection = self._connect()
    addresses = connection.get_all_addresses()
    for address in addresses:
        # Find an unused address
        if not address.instance_id:
            # Free address, use it.
            instance.use_ip(address)
            log.debug("Assigning ip address `%s` to instance `%s`",
                      address.public_ip, instance.id)
            return address.public_ip

    # No unused addresses available: allocate a new one.
    try:
        address = connection.allocate_address()
        instance.use_ip(address)
        return address.public_ip
    except Exception:
        log.error("Unable to allocate a public IP address"
                  " to instance `%s`", instance.id)
def execute(self):
    """
    Load the cluster and build a GC3Pie configuration snippet.
    """
    log.warning(
        "Command `elasticluster gc3pie-config` is DEPRECATED"
        " and will be removed in release 1.4 of ElastiCluster")
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
        return

    from elasticluster.gc3pie_config import create_gc3pie_config_snippet

    if self.params.append:
        path = os.path.expanduser(self.params.append)
        try:
            with open(path, 'a') as fd:
                fd.write(create_gc3pie_config_snippet(cluster))
        except IOError as ex:
            log.error("Unable to write configuration to file %s: %s",
                      path, ex)
    else:
        print(create_gc3pie_config_snippet(cluster))
def pause_instance(self, instance_id):
    """Pauses the instance, retaining disk and config.

    :param str instance_id: instance identifier
    :raises: `InstanceError` if instance cannot be paused
    :return: dict - information needed to restart instance.
    """
    if not instance_id:
        log.info("Instance to pause has no instance id.")
        return
    gce = self._connect()
    try:
        request = gce.instances().stop(project=self._project_id,
                                       instance=instance_id,
                                       zone=self._zone)
        operation = self._execute_request(request)
        response = self._wait_until_done(operation)
        self._check_response(response)
        return {"instance_id": instance_id}
    except HttpError as e:
        log.error("Error stopping instance: `%s`", e)
        raise InstanceError("Error stopping instance: `%s`" % e)
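# `pause_instance` is designed to pair with `resume_instance` (above):
# the dict it returns is exactly the `paused_info` argument the latter
# expects. A hedged usage sketch; `provider` and `node_id` are
# hypothetical stand-ins for a GCE provider instance and a valid
# instance identifier:
def pause_and_resume_later(provider, node_id):
    # stop the instance but keep its disk and configuration around
    paused_info = provider.pause_instance(node_id)
    # ... later, restart the same instance from the saved info
    provider.resume_instance(paused_info)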
def _decrypt(self, text):
    # requires `from Crypto.Cipher import AES` and
    # `from binascii import a2b_hex`
    cryptor = AES.new('1234567890123456', AES.MODE_CBC, b'0000000000000000')
    try:
        plain_text = cryptor.decrypt(a2b_hex(text))
        # strip the '+' characters used to pad the plaintext
        return plain_text.rstrip('+')
    except TypeError as e:
        log.error("Your username/password seems not to be encrypted: %s",
                  e.message)
def execute(self):
    configurator = Configurator.fromConfig(
        self.params.config, storage_path=self.params.storage)
    storage = configurator.create_cluster_storage()
    cluster_names = storage.get_stored_clusters()

    if not cluster_names:
        print("No clusters found.")
    else:
        print("""
The following clusters have been started.
Please note that there's no guarantee that they are fully configured:
""")
        for name in sorted(cluster_names):
            try:
                cluster = configurator.load_cluster(name)
            except ConfigurationError as ex:
                log.error("getting information from cluster `%s`: %s",
                          name, ex)
                continue
            print("%s " % name)
            print("-" * len(name))
            print(" name: %s" % cluster.name)
            print(" template: %s" % cluster.template)
            print(" cloud: %s " % cluster._cloud)
            for cls in cluster.nodes:
                print(" - %s nodes: %d" % (cls, len(cluster.nodes[cls])))
            print("")
def execute(self):
    """
    Starts a new cluster.
    """
    cluster_template = self.params.cluster
    if self.params.cluster_name:
        cluster_name = self.params.cluster_name
    else:
        cluster_name = self.params.cluster

    configurator = Configurator.fromConfig(
        self.params.config, storage_path=self.params.storage,
        include_config_dirs=True)

    # overwrite configuration
    for option, value in self.params.extra_conf.iteritems():
        cconf = configurator.cluster_conf[cluster_template]['cluster']
        if option in cconf:
            cconf[option] = value

    # First, check if the cluster is already created.
    try:
        cluster = configurator.load_cluster(cluster_name)
    except ClusterNotFound:
        try:
            cluster = configurator.create_cluster(
                cluster_template, cluster_name)
        except ConfigurationError as e:
            log.error("Starting cluster %s: %s", cluster_template, e)
            return
def _start_node(node):
    """Static method to start a specific node on a cloud.

    :return: bool -- True on success, False otherwise
    """
    log.debug("_start_node: working on node %s", node.name)
    # TODO: the following check is not optimal yet. When a node is
    # still in a starting state, it will start another node here,
    # since the `is_alive` method will only check for running nodes
    # (see issue #13)
    if node.is_alive():
        log.info("Not starting node %s which is already up & running.",
                 node.name)
        return True
    else:
        try:
            node.start()
            log.info("_start_node: node has been started")
            return True
        except KeypairError as e:
            # hand the error object back to the caller
            return e
        except Exception as e:
            log.error("could not start node `%s` for reason `%s`",
                      node.name, e)
            return None
def cluster_summary(cluster):
    try:
        frontend = cluster.get_frontend_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: %s",
                  str(ex))
def execute(self):
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    config = creator.cluster_conf

    print("%d cluster templates found in configuration file."
          % len(config))
    templates = config.keys()
    for pattern in self.params.clusters:
        templates = [t for t in templates if fnmatch(t, pattern)]

    if self.params.clusters:
        print("%d cluster templates found matching pattern(s) '%s'"
              % (len(templates), str.join(", ", self.params.clusters)))

    for template in templates:
        try:
            cluster = creator.create_cluster(template, template)
            print("\nname: %s" % template)
            for nodekind in cluster.nodes:
                print("%s nodes: %d"
                      % (nodekind, len(cluster.nodes[nodekind])))
        except ConfigurationError as ex:
            log.error("unable to load cluster `%s`: %s", template, ex)
def _connect(self):
    """Connects to the ec2 cloud provider.

    :return: :py:class:`boto.ec2.connection.EC2Connection`
    :raises: Generic exception on error
    """
    # check for existing connection
    if self._connection:
        return self._connection

    try:
        log.debug("Connecting to ec2 host %s", self._ec2host)
        region = ec2.regioninfo.RegionInfo(
            name=self._region_name, endpoint=self._ec2host)

        # connect to webservice
        self._connection = boto.connect_ec2(
            aws_access_key_id=self._access_key,
            aws_secret_access_key=self._secret_key,
            is_secure=self._secure,
            host=self._ec2host,
            port=self._ec2port,
            path=self._ec2path,
            region=region)
        log.debug("Connection has been successful.")
        # list images to see if the connection works
        # images = self._connection.get_all_images()
        # log.debug("%d images found on cloud %s",
        #           len(images), self._ec2host)
    except Exception as e:
        log.error("connection to cloud could not be"
                  " established: message=`%s`", str(e))
        raise

    return self._connection
def __init_keystone_session_v2(self, check=False):
    """Create and return a session object using Keystone API v2."""
    from keystoneauth1 import loading as keystone_v2
    loader = keystone_v2.get_plugin_loader('password')
    auth = loader.load_from_options(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        project_name=self._os_tenant_name,
    )
    sess = keystoneauth1.session.Session(auth=auth,
                                         verify=self._os_cacert)
    if check:
        log.debug("Checking that Keystone API v2 session works...")
        try:
            # if session is invalid, the following will raise some exception
            nova = nova_client.Client(self._compute_api_version,
                                      session=sess,
                                      cacert=self._os_cacert)
            nova.flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v2 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error("OpenStack server rejected request"
                      " (likely configuration error?): %s", err)
            # FIXME: should we be raising an error instead?
            return None
    # if we got to this point, v2 session is valid
    log.info("Using Keystone API v2 session to authenticate to OpenStack")
    return sess
def start_node(node_queue):
    try:
        while not node_queue.empty():
            if not self.keep_running:
                log.error("Aborting execution upon CTRL-C")
                break
            node = node_queue.get()
            # TODO: the following check is not optimal yet. When a
            # node is still in a starting state, it will start
            # another node here, since the `is_alive` method will
            # only check for running nodes (see issue #13)
            if node.is_alive():
                log.info("Not starting node %s which is already"
                         " up & running.", node.name)
            else:
                log.info("starting node...")
                try:
                    node.start()
                except (InstanceError, SecurityGroupError,
                        KeypairError, ImageError) as e:
                    log.error("could not start node `%s` for reason"
                              " `%s`", node.name, e)
    except Empty:
        # nothing to do if the queue turns out to be empty - the
        # nodes are then already started.
        pass
def stop(self, force=False):
    """Destroys all instances of this cluster and calls delete on the
    repository.

    :param bool force: force termination of instances in any case
    """
    for node in self.get_all_nodes():
        if node.instance_id:
            try:
                node.stop()
                self.nodes[node.kind].remove(node)
                log.debug("Removed node with instance id %s from %s",
                          node.instance_id, node.kind)
            except:
                # Boto does not always raise an `Exception` class!
                log.error("could not stop instance `%s`, it might"
                          " already be down.", node.instance_id)
        else:
            log.debug("Not stopping node with no instance id. It seems"
                      " like node `%s` did not start correctly.",
                      node.name)
            self.nodes[node.kind].remove(node)

    if not self.get_all_nodes():
        log.debug("Removing cluster %s.", self.name)
        self._setup_provider.cleanup(self)
        self.repository.delete(self)
    elif not force:
        log.warning("Not all instances have been terminated."
                    " Please rerun `elasticluster stop %s`.", self.name)
        self.repository.save_or_update(self)
    else:
        log.warning("Not all instances have been terminated. However,"
                    " as requested, the cluster has been force-removed.")
        self._setup_provider.cleanup(self)
        self.repository.delete(self)
def stop(self, force=False):
    """
    Terminates all instances corresponding to this cluster and
    deletes the cluster storage.
    """
    for node in self.get_all_nodes():
        if node.instance_id:
            try:
                node.stop()
                self.nodes[node.type].remove(node)
                log.debug("Removed node with instance id %s from %s",
                          node.instance_id, node.type)
            except:
                # Boto does not always raise an `Exception` class!
                log.error("could not stop instance `%s`, it might"
                          " already be down.", node.instance_id)
        else:
            log.debug("Not stopping node with no instance id. It seems"
                      " like node `%s` did not start correctly.",
                      node.name)
            self.nodes[node.type].remove(node)

    if not self.get_all_nodes():
        log.debug("Removing cluster %s.", self.name)
        self._setup_provider.cleanup()
        self._storage.delete_cluster(self.name)
    elif not force:
        log.warning("Not all instances have been terminated."
                    " Please rerun `elasticluster stop %s`.", self.name)
        self._storage.dump_cluster(self)
    else:
        log.warning("Not all instances have been terminated. However,"
                    " as requested, the cluster has been force-removed.")
        self._setup_provider.cleanup()
        self._storage.delete_cluster(self.name)
def _validate_and_convert(cfgtree, evict_on_error=True):
    objtree = {}
    for section, model in SCHEMA.iteritems():
        if section not in cfgtree:
            continue
        stanzas = cfgtree[section]
        objtree[section] = {}
        for name, properties in stanzas.iteritems():
            log.debug("Checking section `%s/%s` ...", section, name)
            try:
                objtree[section][name] = Schema(model).validate(properties)
                # further checks for cloud providers
                if section == 'cloud':
                    objtree[section][name] = _validate_cloud_section(
                        objtree[section][name])
                # check node name pattern in clusters conforms to RFC952
                if section == 'cluster':
                    _validate_node_group_names(objtree[section][name])
            except (SchemaError, ValueError) as err:
                log.error("In section `%s/%s`: %s", section, name, err)
                if evict_on_error:
                    log.error(
                        "Dropping configuration section `%s/%s`"
                        " because of the above errors", section, name)
                    # `objtree[section][name]` only exists if the
                    # exception was raised by one of the additional
                    # validation steps after the schema check
                    if name in objtree[section]:
                        del objtree[section][name]
    return objtree
def cluster_summary(cluster):
    try:
        frontend = cluster.get_ssh_to_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: %s", ex)

    msg = """
Cluster name:        %s
Cluster template:    %s
Default ssh to node: %s
""" % (cluster.name, cluster.template, frontend)

    for cls in cluster.nodes:
        msg += "- %s nodes: %d\n" % (cls, len(cluster.nodes[cls]))
    msg += """
To login on the frontend node, run the command:

    hwcc ssh %s

To upload or download files to the cluster, use the command:

    hwcc sftp %s

To enable slurm power saving options, use these steps:

    1: check the config file; make sure the value of
       global_var_slurm_suspendtime is not -1
    2: run the script: sh Initslurm.sh
""" % (cluster.name, cluster.name)
    return msg
def inspect_node(node):
    """
    This function accepts an `elasticluster.cluster.Node` instance,
    connects to the node, and tries to discover the kind of batch
    system installed, along with some other information.
    """
    node_information = {}
    ssh = node.connect()
    if not ssh:
        log.error("Unable to connect to node %s", node.name)
        return

    (_in, _out, _err) = ssh.exec_command(
        "(type >& /dev/null -a srun && echo slurm) \
        || (type >& /dev/null -a qconf && echo sge) \
        || (type >& /dev/null -a pbsnodes && echo pbs) \
        || echo UNKNOWN")
    node_information['type'] = _out.read().strip()

    (_in, _out, _err) = ssh.exec_command("arch")
    node_information['architecture'] = _out.read().strip()

    if node_information['type'] == 'slurm':
        inspect_slurm_cluster(ssh, node_information)
    elif node_information['type'] == 'sge':
        inspect_sge_cluster(ssh, node_information)
    ssh.close()
    return node_information
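# A hedged usage sketch for `inspect_node`; `cluster` is assumed to have
# been loaded through a creator/configurator as in the `execute` methods
# above, and `get_ssh_to_node()` is the frontend accessor used elsewhere
# in this code:
def report_batch_system(cluster):
    frontend = cluster.get_ssh_to_node()
    info = inspect_node(frontend)
    if info:
        print("batch system: %s (%s)"
              % (info['type'], info['architecture']))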
def setup(self, extra_args=tuple()):
    """
    Configure the cluster nodes.

    Actual action is delegated to the
    :py:class:`elasticluster.providers.AbstractSetupProvider` that
    was provided at construction time.

    :param list extra_args: List of additional command-line arguments
        that are appended to each invocation of the setup program.

    :return: bool - True on success, False otherwise
    """
    try:
        # setup the cluster using the setup provider
        ret = self._setup_provider.setup_cluster(self, extra_args)
    except Exception as err:
        log.error(
            "The cluster hosts are up and running,"
            " but %s failed to set the cluster up: %s",
            self._setup_provider.HUMAN_READABLE_NAME, err)
        ret = False

    if not ret:
        log.warning(
            "Cluster `%s` not yet configured. Please, re-run"
            " `elasticluster setup %s` and/or check your configuration",
            self.name, self.name)

    return ret
def execute(self):
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
        cluster.update()
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Setting up cluster %s: %s", cluster_name, ex)
        return

    if self.params.ssh_to:
        try:
            nodes = dict((n.name, n) for n in cluster.get_all_nodes())
            frontend = nodes[self.params.ssh_to]
        except KeyError:
            raise ValueError(
                "Hostname %s not found in cluster %s"
                % (self.params.ssh_to, cluster_name))
    else:
        frontend = cluster.get_frontend_node()

    host = frontend.connection_ip()
    username = frontend.image_user
    knownhostsfile = (cluster.known_hosts_file
                      if cluster.known_hosts_file else '/dev/null')
    sftp_cmdline = ["sftp",
                    "-o", "UserKnownHostsFile=%s" % knownhostsfile,
                    "-o", "StrictHostKeyChecking=yes",
                    "-o", "IdentityFile=%s" % frontend.user_key_private]
    sftp_cmdline.extend(self.params.sftp_args)
    sftp_cmdline.append('%s@%s' % (username, host))
    os.execlp("sftp", *sftp_cmdline)
def execute(self):
    """
    Lists all nodes within the specified cluster with certain
    information like id and ip.
    """
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
        if self.params.update:
            cluster.update()
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
        return

    if self.params.pretty_json:
        print(json.dumps(cluster, default=dict, indent=4))
    elif self.params.json:
        print(json.dumps(cluster, default=dict))
    else:
        print(cluster_summary(cluster))
        for cls in cluster.nodes:
            print("%s nodes:" % cls)
            print("")
            for node in cluster.nodes[cls]:
                txt = ["    " + i for i in node.pprint().splitlines()]
                print('  - ' + "\n".join(txt)[4:])
                print("")
def execute(self):
    """
    Stops the cluster if it's running.
    """
    cluster_name = self.params.cluster
    configurator = get_configurator(self.params.config,
                                    storage_path=self.params.storage,
                                    include_config_dirs=True)
    try:
        cluster = configurator.load_cluster(cluster_name)
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Stopping cluster %s: %s", cluster_name, ex)
        return

    if not self.params.yes:
        # Ask for confirmation
        yesno = raw_input(
            "Do you really want to stop cluster %s? [yN] " % cluster_name)
        if yesno.lower() not in ['yes', 'y']:
            print("Aborting as per user request.")
            sys.exit(0)

    print("Destroying cluster `%s`" % cluster_name)
    cluster.stop(force=self.params.force)
def execute(self):
    """
    Starts a new cluster.
    """
    cluster_template = self.params.cluster
    if self.params.cluster_name:
        cluster_name = self.params.cluster_name
    else:
        cluster_name = self.params.cluster

    configurator = Configurator.fromConfig(
        self.params.config, storage_path=self.params.storage)

    # overwrite configuration
    for option, value in self.params.extra_conf.iteritems():
        cconf = configurator.cluster_conf[cluster_template]['cluster']
        if option in cconf:
            cconf[option] = value

    # First, check if the cluster is already created.
    try:
        cluster = configurator.load_cluster(cluster_name)
    except ClusterNotFound:
        try:
            cluster = configurator.create_cluster(
                cluster_template, cluster_name)
        except ConfigurationError as e:
            log.error("Starting cluster %s: %s", cluster_template, e)
            return
def setup(self):
    """Configure the cluster nodes with the specified setup provider.

    This is delegated to the provided
    :py:class:`elasticluster.providers.AbstractSetupProvider`.

    :return: bool - True on success, False otherwise
    """
    try:
        # setup the cluster using the setup provider
        ret = self._setup_provider.setup_cluster(self)
    except Exception as e:
        log.error(
            "the setup provider was not able to setup the cluster,"
            " but the cluster is running by now. Setup provider error"
            " message: `%s`", str(e))
        ret = False

    if not ret:
        log.warning(
            "Cluster `%s` not yet configured. Please, re-run"
            " `elasticluster setup %s` and/or check your configuration",
            self.name, self.name)

    return ret
def cluster_summary(cluster):
    try:
        frontend = cluster.get_frontend_node().name
    except NodeNotFound as ex:
        frontend = 'unknown'
        log.error("Unable to get information on the frontend node: %s",
                  str(ex))

    msg = """
Cluster name:        %s
Cluster template:    %s
Default ssh to node: %s
""" % (cluster.name, cluster.template, frontend)

    for cls in cluster.nodes:
        msg += "- %s nodes: %d\n" % (cls, len(cluster.nodes[cls]))
    msg += """
To login on the frontend node, run the command:

    elasticluster ssh %s

To upload or download files to the cluster, use the command:

    elasticluster sftp %s
""" % (cluster.name, cluster.name)
    return msg
def execute(self):
    """
    Load the cluster and build a GC3Pie configuration snippet.
    """
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
        return

    from elasticluster.gc3pie_config import create_gc3pie_config_snippet

    if self.params.append:
        path = os.path.expanduser(self.params.append)
        try:
            with open(path, 'a') as fd:
                fd.write(create_gc3pie_config_snippet(cluster))
        except IOError as ex:
            log.error("Unable to write configuration to file %s: %s",
                      path, ex)
    else:
        print(create_gc3pie_config_snippet(cluster))
def execute(self):
    """
    Lists all nodes within the specified cluster with certain
    information like id and ip.
    """
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
        if self.params.update:
            cluster.update()
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
        return

    if self.params.pretty_json:
        print(json.dumps(cluster, default=dict, indent=4))
    elif self.params.json:
        print(json.dumps(cluster, default=dict))
    else:
        print(cluster_summary(cluster))
        for cls in cluster.nodes:
            print("%s nodes:" % cls)
            print("")
            for node in cluster.nodes[cls]:
                txt = ["    " + i for i in node.pprint().splitlines()]
                print('  - ' + str.join("\n", txt)[4:])
                print("")
def execute(self):
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
        cluster.update()
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Setting up cluster %s: %s", cluster_name, ex)
        return

    if self.params.ssh_to:
        try:
            nodes = dict((n.name, n) for n in cluster.get_all_nodes())
            frontend = nodes[self.params.ssh_to]
        except KeyError:
            raise ValueError(
                "Hostname %s not found in cluster %s"
                % (self.params.ssh_to, cluster_name))
    else:
        frontend = cluster.get_frontend_node()

    try:
        if not frontend.preferred_ip:
            # Ensure we can connect to the node, and save the value
            # of `preferred_ip`
            ssh = frontend.connect(keyfile=cluster.known_hosts_file)
            if ssh:
                ssh.close()
            cluster.repository.save_or_update(cluster)
    except NodeNotFound as ex:
        log.error("Unable to connect to the frontend node: %s", ex)
        sys.exit(1)

    host = frontend.connection_ip()

    # check for nonstandard port, either IPv4 or IPv6
    addr = host
    port = str(SSH_PORT)
    if ':' in host:
        match = IPV6_RE.match(host)
        if match:
            addr = match.groups()[0]
            port = match.groups()[1]
        else:
            addr, _, port = host.partition(':')

    username = frontend.image_user
    knownhostsfile = (cluster.known_hosts_file
                      if cluster.known_hosts_file else '/dev/null')
    ssh_cmdline = ["ssh",
                   "-i", frontend.user_key_private,
                   "-o", "UserKnownHostsFile=%s" % knownhostsfile,
                   "-o", "StrictHostKeyChecking=yes",
                   "-p", port,
                   '%s@%s' % (username, addr)]
    ssh_cmdline.extend(self.params.ssh_args)
    log.debug("Running command `%s`", str.join(' ', ssh_cmdline))
    os.execlp("ssh", *ssh_cmdline)
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, **kwargs):
    """Starts a new instance on the cloud using the given properties.

    The following tasks are done to start an instance:

    * establish a connection to the cloud web service
    * check the ssh keypair and upload it if it does not yet exist.
      This is a locked process, since this function might be called
      in multiple threads and we only want the key to be stored once.
    * check if the security group exists
    * run the instance with the given properties

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on
        the instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None

    :return: str - instance id of the started instance
    """
    connection = self._connect()

    log.debug("Checking keypair `%s`.", key_name)
    # the `_check_keypair` method has to be called within a lock,
    # since it will upload the key if it does not exist and if this
    # happens for every node at the same time ec2 will throw an error
    # message (see issue #79)
    with BotoCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)

    log.debug("Checking security group `%s`.", security_group)
    self._check_security_group(security_group)
    # image_id = self._find_image_id(image_id)

    try:
        reservation = connection.run_instances(
            image_id, key_name=key_name,
            security_groups=[security_group],
            instance_type=flavor,
            user_data=image_userdata)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)
def sigint_handler(signal, frame):
    """
    Makes sure the cluster is saved, before the sigint results in
    exiting during node startup.
    """
    log.error("Interrupted: will save cluster state and exit"
              " after all nodes have started.")
    keep_running = False
def _cross_validate_final_config(objtree, evict_on_error=True):
    """
    Run validation checks that require correlating values from
    different sections.
    """
    # take a copy of the cluster config as we might be modifying it
    for name, cluster in list(objtree['cluster'].items()):
        valid = True

        # ensure all cluster node kinds are defined in the `setup/*` section
        setup_sect = cluster['setup']
        for groupname, properties in cluster['nodes'].items():
            if (groupname + '_groups') not in setup_sect:
                log.error(
                    "Cluster `%s` requires nodes of kind `%s`,"
                    " but no such group is defined"
                    " in the referenced setup section.",
                    name, groupname)
                valid = False
                break

        # ensure `ssh_to` has a valid value
        if 'ssh_to' in cluster:
            ssh_to = cluster['ssh_to']
            try:
                # extract node kind if this is a node name
                # (e.g., `master001` => `master`)
                parts = NodeNamingPolicy.parse(ssh_to)
                ssh_to = parts['kind']
            except ValueError:
                pass
            if ssh_to not in cluster['nodes']:
                log.error(
                    "Cluster `%s` is configured to SSH into nodes of"
                    " kind `%s`, but no such kind is defined.",
                    name, ssh_to)
                valid = False

        # EC2-specific checks
        if cluster['cloud']['provider'] == 'ec2_boto':
            cluster_uses_vpc = ('vpc' in cluster['cloud'])
            for groupname, properties in cluster['nodes'].items():
                if cluster_uses_vpc and 'network_ids' not in properties:
                    log.error(
                        "Node group `%s/%s` is being used in a VPC,"
                        " so it must specify ``network_ids``.",
                        name, groupname)
                    if evict_on_error:
                        valid = False
                        break
                if not cluster_uses_vpc and 'network_ids' in properties:
                    log.error(
                        "Cluster `%s` must specify a VPC to place"
                        " `%s` instances in network `%s`.",
                        name, groupname, properties['network_ids'])
                    if evict_on_error:
                        valid = False
                        break

        if not valid:
            log.error("Dropping cluster `%s` because of the above errors",
                      name)
            del objtree['cluster'][name]

    return objtree
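# A minimal sketch of the `NodeNamingPolicy.parse` behavior relied upon
# above: split a node name like `master001` into its kind and index.
# The actual class may use a configurable naming pattern; the regex and
# the function name here are assumptions for illustration only.
import re

_NODE_NAME_RE = re.compile(r'^(?P<kind>[a-z][a-z0-9-]*?)(?P<index>\d+)$')

def parse_node_name(name):
    match = _NODE_NAME_RE.match(name)
    if not match:
        raise ValueError("Cannot parse node name `%s`" % name)
    return {'kind': match.group('kind'),
            'index': int(match.group('index'))}

# parse_node_name('master001') == {'kind': 'master', 'index': 1}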
def execute(self):
    """
    Starts a new cluster.
    """
    cluster_template = self.params.cluster
    if self.params.cluster_name:
        cluster_name = self.params.cluster_name
    else:
        cluster_name = self.params.cluster

    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)

    # overwrite configuration
    cluster_nodes_conf = creator.cluster_conf[cluster_template]['nodes']
    for kind, num in self.params.nodes_override.iteritems():
        if kind not in cluster_nodes_conf:
            raise ConfigurationError(
                "No node group `{kind}` defined"
                " in cluster template `{template}`"
                .format(kind=kind, template=cluster_template))
        cluster_nodes_conf[kind]['num'] = num

    # First, check if the cluster is already created.
    try:
        cluster = creator.load_cluster(cluster_name)
    except ClusterNotFound:
        try:
            cluster = creator.create_cluster(
                cluster_template, cluster_name)
        except ConfigurationError as err:
            log.error("Starting cluster %s: %s", cluster_template, err)
            return

    try:
        print("Starting cluster `{0}` with:".format(cluster.name))
        for cls in cluster.nodes:
            print("* {0:d} {1} nodes.".format(len(cluster.nodes[cls]), cls))
        print("(This may take a while...)")

        min_nodes = dict(
            (kind, cluster_nodes_conf[kind]['min_num'])
            for kind in cluster_nodes_conf)
        cluster.start(min_nodes=min_nodes)

        if self.params.no_setup:
            print("NOT configuring the cluster as requested.")
        else:
            print("Configuring the cluster.")
            print("(this too may take a while...)")
            ret = cluster.setup()
            if ret:
                print("Your cluster is ready!")
            else:
                print("\nWARNING: YOUR CLUSTER IS NOT READY YET!")
        print(cluster_summary(cluster))
    except (KeyError, ImageError, SecurityGroupError, ClusterError) as err:
        log.error("Could not start cluster `%s`: %s", cluster.name, err)
        raise
def execute(self):
    """
    Starts a new cluster.
    """
    cluster_template = self.params.cluster
    if self.params.cluster_name:
        cluster_name = self.params.cluster_name
    else:
        cluster_name = self.params.cluster

    configurator = get_configurator(self.params.config,
                                    storage_path=self.params.storage,
                                    include_config_dirs=True)

    # overwrite configuration
    for option, value in self.params.extra_conf.items():
        cconf = configurator.cluster_conf[cluster_template]['cluster']
        if option in cconf:
            cconf[option] = value

    # First, check if the cluster is already created.
    try:
        cluster = configurator.load_cluster(cluster_name)
    except ClusterNotFound:
        try:
            cluster = configurator.create_cluster(
                cluster_template, cluster_name)
        except ConfigurationError as e:
            log.error("Starting cluster %s: %s", cluster_template, e)
            return

    try:
        for cls in cluster.nodes:
            print("Starting cluster `%s` with %d %s nodes."
                  % (cluster.name, len(cluster.nodes[cls]), cls))
        print("(this may take a while...)")

        conf = configurator.cluster_conf[cluster_template]
        min_nodes = dict(
            (k[:-10], int(v)) for k, v in conf['cluster'].items()
            if k.endswith('_nodes_min'))
        cluster.start(min_nodes=min_nodes)

        if self.params.no_setup:
            print("NOT configuring the cluster as requested.")
        else:
            print("Configuring the cluster.")
            print("(this too may take a while...)")
            ret = cluster.setup()
            if ret:
                print("Your cluster is ready!")
            else:
                print("\nWARNING: YOUR CLUSTER IS NOT READY YET!")
        print(cluster_summary(cluster))
    except (KeyError, ImageError, SecurityGroupError, ClusterError) as ex:
        print("Your cluster could not start: `%s`" % ex)
        raise
def connect(self, keyfile=None):
    """Connect to the node via ssh using the paramiko library.

    :return: :py:class:`paramiko.SSHClient` - ssh connection or None
        on failure
    """
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    if keyfile and os.path.exists(keyfile):
        ssh.load_host_keys(keyfile)

    # Try connecting using the `preferred_ip`, if present.
    # Otherwise, try all of them and set `preferred_ip` using the
    # first that is working. Copy the list in order to "sort" the
    # IPs and put the preferred_ip first.
    ips = self.ips[:]
    if self.preferred_ip:
        if self.preferred_ip in ips:
            ips.remove(self.preferred_ip)
        else:
            # Preferred is changed?
            log.debug("IP %s does not seem to belong to %s anymore."
                      " Ignoring!", self.preferred_ip, self.name)
            self.preferred_ip = ips[0]

    for ip in itertools.chain([self.preferred_ip], ips):
        if not ip:
            continue
        try:
            log.debug("Trying to connect to host %s (%s)",
                      self.name, ip)
            addr, port = parse_ip_address_and_port(ip, SSH_PORT)
            ssh.connect(str(addr),
                        username=self.image_user,
                        allow_agent=True,
                        key_filename=self.user_key_private,
                        timeout=Node.connection_timeout,
                        port=port)
            log.debug("Connection to %s succeeded on port %d!",
                      ip, port)
            if ip != self.preferred_ip:
                log.debug("Setting `preferred_ip` to %s", ip)
                self.preferred_ip = ip
                cluster_changed = True
            # Connection successful.
            return ssh
        except socket.error as ex:
            log.debug("Host %s (%s) not reachable: %s.",
                      self.name, ip, ex)
        except paramiko.BadHostKeyException as ex:
            log.error("Invalid host key: host %s (%s); check keyfile: %s",
                      self.name, ip, keyfile)
        except paramiko.SSHException as ex:
            log.debug("Ignoring error %s connecting to %s",
                      str(ex), self.name)

    return None
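# A hedged usage sketch for `connect`: callers test the return value for
# None rather than catching exceptions, as `inspect_node` above does.
# The helper name below is invented for illustration:
def run_remote_command(node, command, keyfile=None):
    ssh = node.connect(keyfile=keyfile)
    if ssh is None:
        log.error("Unable to connect to node %s", node.name)
        return None
    _in, _out, _err = ssh.exec_command(command)
    output = _out.read().strip()
    ssh.close()
    return output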