def mount_volume_to_instance(instance, foldername):
    log.info("\t ... mounting to {0} ...".format(instance.id))
    instance.ssh.execute("sudo rm -rf {0}".format(foldername))
    instance.ssh.execute("sudo mkdir {0}".format(foldername))
    instance.ssh.execute("mount /dev/xvdh {0}".format(foldername))
    instance.ssh.execute("sudo chown scidb:scidb {0}".format(foldername))
    log.info("\t ... success ...")
def cancel_command(self, signum, frame):
    """
    Exits program with return value of 1
    """
    print
    log.info("Exiting...")
    sys.exit(1)
def _install_combblas(self, node):
    log.info("\tInstalling CombBLAS")
    instructions = [
        "wget -O combblas.tgz %s" % self.combblas_source,
        "tar xvfz combblas.tgz",
        "rm combblas.tgz"
    ]
    self._follow_instructions(instructions, node)
    # Expects the combblas.patch file to be in the same directory as this
    # source file
    patchfname = os.path.dirname(
        inspect.getsourcefile(SkylarkInstaller)) + '/combblas.patch'
    log.info(patchfname)
    node.ssh.put(patchfname, 'CombBLAS/combblas.patch')
    instructions = [
        "cd CombBLAS",
        "yes | git apply --ignore-space-change --ignore-whitespace "
        "combblas.patch",
        "rm combblas.patch",
        "cmake .",
        "make -j %s" % self.nproc,
        "cp *.so /usr/local/lib",
        "mkdir /usr/local/include/CombBLAS",
        "cp *.h /usr/local/include/CombBLAS",
        "cp *.cpp /usr/local/include/CombBLAS",
        "cp -R SequenceHeaps /usr/local/include/CombBLAS",
        "cp -R psort-1.0 /usr/local/include/CombBLAS",
        "cp -R graph500-1.2 /usr/local/include/CombBLAS",
        "cd ..",
        "rm -r CombBLAS"
    ]
    self._follow_instructions(instructions, node)
def execute(self, args):
    if "createimage" in sys.argv:
        warnings.warn("createimage is deprecated and will go away in the "
                      "next release. please use the s3image/ebsimage "
                      "commands instead", DeprecationWarning)
    if len(args) != 3:
        self.parser.error(
            'you must specify an instance-id, image name, and bucket')
    bucket = None
    instanceid, image_name, bucket = args
    self.bucket = bucket
    self.image_name = image_name
    i = self.ec2.get_instance(instanceid)
    key_location = self.cfg.get_key(i.key_name).get('key_location')
    aws_user_id = self.cfg.aws.get('aws_user_id')
    ec2_cert = self.cfg.aws.get('ec2_cert')
    ec2_private_key = self.cfg.aws.get('ec2_private_key')
    try:
        ami_id = self.ec2.create_s3_image(instanceid, key_location,
                                          aws_user_id, ec2_cert,
                                          ec2_private_key, bucket,
                                          image_name=image_name,
                                          **self.specified_options_dict)
        log.info("Your new AMI id is: %s" % ami_id)
    except KeyboardInterrupt:
        raise exception.CancelledS3ImageCreation(self.bucket,
                                                 self.image_name)
def _upload_image(self):
    log.info('Uploading bundled image: (please be patient)')
    conn = self.host_ssh
    config_dict = self.config_dict
    conn.execute('ec2-upload-bundle -b %(bucket)s '
                 '-m /mnt/%(prefix)s.manifest.xml -a %(access_key)s '
                 '-s %(secret_key)s' % config_dict, silent=False)
def export_fs_to_nodes(self, nodes, export_paths):
    """
    Export each path in export_paths to each node in nodes via NFS

    nodes - list of nodes to export each path to
    export_paths - list of paths on this remote host to export to each node

    Example:

    # export /home and /opt/sge6 to each node in nodes
    $ node.start_nfs_server()
    $ node.export_fs_to_nodes(nodes=[node1, node2],
                              export_paths=['/home', '/opt/sge6'])
    """
    # setup /etc/exports
    log.info("Configuring NFS exports path(s):\n%s" %
             ' '.join(export_paths))
    nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
    etc_exports = self.ssh.remote_file('/etc/exports', 'r')
    contents = etc_exports.read()
    etc_exports.close()
    etc_exports = self.ssh.remote_file('/etc/exports', 'a')
    for node in nodes:
        for path in export_paths:
            export_line = ' '.join(
                [path, node.alias + nfs_export_settings + '\n'])
            if export_line not in contents:
                etc_exports.write(export_line)
    etc_exports.close()
    self.ssh.execute('exportfs -fra')
def start_nfs_server(self):
    log.info("Starting NFS server on %s" % self.alias)
    self.ssh.execute('/etc/init.d/portmap start')
    self.ssh.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/',
                     ignore_exit_status=True)
    self.ssh.execute('/etc/init.d/nfs start')
    self.ssh.execute('/usr/sbin/exportfs -fra')
def run(self, nodes, master, user, user_shell, volumes):
    if not self.var_str == "":
        for node in nodes:
            log.info("Adding vars to: %s " % node.alias)
            node.ssh.execute('echo \'' +
                             self.var_str.replace('\'', '\\\'') +
                             '\' >> .bashrc')
            if self.envar_location is not None:
                node.ssh.execute('echo \'' +
                                 self.var_str.replace('\'', '\\\'') +
                                 '\' >> ' + self.envar_location)
def root_device_name(self):
    root_dev = self.instance.root_device_name
    bmap = self.block_device_mapping
    if bmap and root_dev not in bmap and self.is_ebs_backed():
        # Hack for misconfigured AMIs (e.g. CentOS 6.3 Marketplace) These
        # AMIs have root device name set to /dev/sda1 but no /dev/sda1 in
        # block device map - only /dev/sda. These AMIs somehow magically
        # work so check if /dev/sda exists and return that instead to
        # prevent detach_external_volumes() from trying to detach the root
        # volume on these AMIs.
        log.warn("Root device %s is not in the block device map" % root_dev)
        log.warn("This means the AMI was registered with either "
                 "an incorrect root device name or an incorrect block "
                 "device mapping")
        sda, sda1 = '/dev/sda', '/dev/sda1'
        if root_dev == sda1:
            log.info("Searching for possible root device: %s" % sda)
            if sda in self.block_device_mapping:
                log.warn("Found '%s' - assuming its the real root device" %
                         sda)
                root_dev = sda
            else:
                log.warn("Device %s isn't in the block device map either" %
                         sda)
    return root_dev
def run_cmd(node, cmd, user, silent=True):
    log.info("%s@%s: %s" % (user, node.alias, cmd))
    if user != 'root':
        node.ssh.switch_user(user)
    node.ssh.execute(cmd, silent=silent)
    if user != 'root':
        node.ssh.switch_user('root')
def run(self):
    """
    As soon as a new node is ready, run the add plugins commands over it.
    """
    interval = self.cluster.refresh_interval
    log.info("Waiting for one of the new nodes to be up "
             "(updating every {}s)".format(interval))
    while True:
        self.ready_instances = []
        self.stream_unpropagated_spots()
        self.stream_spots()
        self.stream_unpropagated_instances()
        self.stream_update_nrm()
        self.stream_instances()
        self.stream_manage_reboots()
        self.stream_ready_instances()
        if any([self.unpropagated_spots, self.spots,
                self.unpropagated_instances, self.instances]):
            if self.ready_instances:
                # ready_instances means nodes were added, that took
                # time so we should loop again now
                continue
            log.info("Sleeping for {} seconds".format(interval))
            time.sleep(interval)
        else:
            break
def create(self, volume_size, volume_zone, name=None, tags=None):
    try:
        self.validate(volume_size, volume_zone, self._aws_block_device)
        instance = self._request_instance(volume_zone)
        self._validate_required_progs([self._mkfs_cmd.split()[0]])
        self._determine_device()
        vol = self._create_volume(volume_size, volume_zone)
        if tags:
            for tag in tags:
                tagval = tags.get(tag)
                tagmsg = "Adding volume tag: %s" % tag
                if tagval:
                    tagmsg += "=%s" % tagval
                log.info(tagmsg)
                vol.add_tag(tag, tagval)
        if name:
            vol.add_tag("Name", name)
        self._attach_volume(self._volume, instance.id,
                            self._aws_block_device)
        self._get_volume_device(self._aws_block_device)
        self._format_volume()
        self.shutdown()
        log.info("Your new %sGB volume %s has been created successfully" %
                 (volume_size, vol.id))
        return vol
    except Exception:
        log.error("Failed to create new volume", exc_info=True)
        self._delete_new_volume()
        raise
    finally:
        self._warn_about_volume_hosts()
def _setup_ebs_volume(self):
    """ Mount EBS volume, if specified, in ~/.starclustercfg to /home """
    # setup /etc/fstab on master to use block device if specified
    for vol in self._volumes:
        vol = self._volumes[vol]
        vol_id = vol.get("volume_id")
        device = vol.get("device")
        volume_partition = vol.get("partition")
        mount_path = vol.get("mount_path")
        if vol_id and volume_partition and mount_path:
            log.info("Mounting EBS volume %s on %s..." %
                     (vol_id, mount_path))
            mconn = self._master.ssh
            if not mconn.path_exists(device):
                log.warn("Cannot find device %s for volume %s" %
                         (device, vol))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This usually means there was a problem "
                         "attaching the EBS volume to the master node")
                continue
            if not mconn.path_exists(volume_partition):
                log.warn("Cannot find partition %s on volume %s" %
                         (volume_partition, vol_id))
                log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                log.warn("This either means that the volume has not been "
                         "partitioned or that the partition specified "
                         "does not exist on the volume")
                continue
            master_fstab = mconn.remote_file("/etc/fstab", mode="a")
            print >> master_fstab, "%s %s auto noauto,defaults 0 0 " % (
                volume_partition, mount_path)
            master_fstab.close()
            mconn.execute("mkdir -p %s" % mount_path)
            mconn.execute("mount %s" % mount_path)
def openNfsPorts(self, group):
    """
    Open (fixed) NFS-related ports (portmap, nfs and mountd)
    """
    portmapport = self.portmapport
    nfsport = self.nfsport
    mountdport = self.mountdport
    log.info("Opening NFS-related ports for group: %s", group)
    log.debug("automount.openNfsPorts group; %s", group)
    log.debug("automount.openNfsPorts portmapport; %s", portmapport)
    log.debug("automount.openNfsPorts nfsport; %s", nfsport)
    log.debug("automount.openNfsPorts mountdport; %s", mountdport)
    permissions = [
        dict(group=group, port=nfsport, type="tcp"),
        dict(group=group, port=nfsport, type="udp"),
        dict(group=group, port=portmapport, type="tcp"),
        dict(group=group, port=portmapport, type="udp"),
        dict(group=group, port=mountdport, type="tcp"),
        dict(group=group, port=mountdport, type="udp")
    ]
    #### OPEN PORTS FROM HEAD NODE (NO SSH FROM MASTER)
    commands = self.setPortCommands(group, permissions)
    for command in commands:
        self.runSystemCommand(command)
def list_all_instances(self, show_terminated=False):
    reservations = self.conn.get_all_instances()
    if not reservations:
        log.info("No instances found")
    for res in reservations:
        groups = ', '.join([g.id for g in res.groups]) or 'N/A'
        for instance in res.instances:
            if instance.state == 'terminated' and not show_terminated:
                continue
            id = instance.id or 'N/A'
            dns_name = instance.dns_name or 'N/A'
            private_dns_name = instance.private_dns_name or 'N/A'
            state = instance.state or 'N/A'
            private_ip = instance.private_ip_address or 'N/A'
            public_ip = instance.ip_address or 'N/A'
            zone = instance.placement or 'N/A'
            ami = instance.image_id or 'N/A'
            keypair = instance.key_name or 'N/A'
            print "id: %s" % id
            print "dns_name: %s" % dns_name
            print "private_dns_name: %s" % private_dns_name
            print "state: %s" % state
            print "public_ip: %s" % public_ip
            print "private_ip: %s" % private_ip
            print "zone: %s" % zone
            print "ami: %s" % ami
            print "groups: %s" % groups
            print "keypair: %s" % keypair
            print
def __init__(self, privatekey, publiccert, interval, sourcedirs,
             mountpoints, portmapport, nfsport, mountdport, cluster):
    log.info("Loaded plugin: automount.NfsShares")
    log.debug("automount.NfsShares.__init__ Initialising AutoMount plugin.")
    log.debug("automount.NfsShares.__init__ privatekey %s" % privatekey)
    log.debug("automount.NfsShares.__init__ publiccert %s" % publiccert)
    log.debug("automount.NfsShares.__init__ interval %s" % interval)
    log.debug("automount.NfsShares.__init__ sourcedirs %s" % sourcedirs)
    log.debug("automount.NfsShares.__init__ mountpoints %s" % mountpoints)
    log.debug("automount.NfsShares.__init__ portmapport %s" % portmapport)
    log.debug("automount.NfsShares.__init__ nfsport %s" % nfsport)
    log.debug("automount.NfsShares.__init__ mountdport %s" % mountdport)
    log.debug("automount.NfsShares.__init__ cluster %s" % cluster)
    self.privatekey = privatekey
    self.publiccert = publiccert
    self.portmapport = portmapport
    self.nfsport = nfsport
    self.mountdport = mountdport
    self.cluster = cluster
    # set default interval
    if not interval:
        interval = 10
    self.interval = interval
    self.sourcedirs = sourcedirs.split(",")
    self.mountpoints = mountpoints.split(",")
    if len(self.sourcedirs) != len(self.mountpoints):
        # str() so the integer lengths can be concatenated into the message
        log.info("automount.NfsShares.__init__ length of sourcedirs (" +
                 str(len(self.sourcedirs)) +
                 ") is not the same as length of mountpoints (" +
                 str(len(self.mountpoints)) + ")")
        sys.exit(0)
def run(self, nodes, master, user, user_shell, volumes):
    """
    Mount NFS shares on master and all nodes
    """
    log.info("Running plugin automount")
    log.debug("automount.NfsShares.run "
              "automount.NfsShares.run(nodes, master, user, user_shell, "
              "volumes)")
    #### OPEN NFS-RELATED PORTS FOR THIS CLUSTER
    self.openNfsPorts("default")
    self.openNfsPorts('@sc-' + self.cluster)
    #### SET HEAD NODE INTERNAL IP
    self.getHeadIp()
    #### FIX mountd PORT ON head AND MASTER/NODES
    mountdport = "32767"
    for node in nodes:
        self.setMountdOnNode(node, mountdport)
    self.setMountdOnHead(mountdport)
    self.restartServicesOnHead()
    #### MOUNT ON ALL NODES
    for node in nodes:
        self.mount(node)
    log.info("Completed plugin automount")
def wrap_f(func, *arg, **kargs):
    """Raw timing function """
    time1 = time.time()
    res = func(*arg, **kargs)
    time2 = time.time()
    log.info('%s took %0.3f mins' % (prefix, (time2 - time1) / 60.0))
    return res
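# A minimal sketch (an assumption, not the project's actual definition) of
# the enclosing decorator that wrap_f above appears to belong to: `prefix`
# is a free variable here, and the (func, *arg, **kargs) signature matches
# the calling convention expected by the third-party `decorator` package.
# The factory name `print_timing` is hypothetical.
import time
import decorator
from starcluster.logger import log  # `log` as used throughout these snippets


def print_timing(prefix):
    def wrap_f(func, *arg, **kargs):
        """Raw timing function """
        time1 = time.time()
        res = func(*arg, **kargs)
        time2 = time.time()
        log.info('%s took %0.3f mins' % (prefix, (time2 - time1) / 60.0))
        return res
    return decorator.decorator(wrap_f)

# usage sketch:
# @print_timing("Cluster setup")
# def setup_cluster(...):
#     ...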
def execute(self, args):
    if not args:
        self.parser.error("please specify a cluster")
    for cluster_name in args:
        cl = self.cm.get_cluster(cluster_name)
        is_ebs = cl.is_ebs_cluster()
        if not self.opts.confirm:
            action = "Terminate"
            if is_ebs:
                action = "Stop EBS"
                if cl.spot_bid:
                    action = "Terminate Spot EBS"
            resp = raw_input("%s cluster %s (y/n)? " %
                             (action, cluster_name))
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster()
        if is_ebs and cl._nodes:
            log.warn("All EBS-backed nodes in '%s' are now in a "
                     "'stopped' state" % cluster_name)
            log.warn("You can restart this cluster by passing -x "
                     "to the 'start' command")
            log.warn("Use the 'terminate' command to *completely* "
                     "terminate this cluster")
            log.warn("NOTE: Unless EBS-backed nodes are in a "
                     "'running' or 'terminated'")
            log.warn("state, you are charged for the EBS volumes "
                     "backing the nodes.")
def addParallelEnvironment(self, master):
    """
    Add 'threaded' parallel environment
    """
    log.info("Adding 'threaded' parallel environment")
    sge_pe_template = """
pe_name            threaded
slots              %s
user_lists         NONE
xuser_lists        NONE
start_proc_args    /bin/true
stop_proc_args     /bin/true
allocation_rule    $pe_slots
control_slaves     TRUE
job_is_first_task  FALSE
urgency_slots      min
accounting_summary FALSE
"""
    log.debug("addParallelEnvironment sge_pe_template: %s", sge_pe_template)
    #### PRINT TEMPLATE FILE
    pe_file = master.ssh.remote_file("/tmp/pe.txt")
    print >> pe_file, sge_pe_template % 99999
    pe_file.close()
    envars = self.exportEnvironmentVars()
    rootpath = self.getRootPath(master)
    log.debug("CreateCell.addParallelEnvironment rootpath: %s", rootpath)
    master.ssh.execute(envars + rootpath +
                       "/qconf -Ap %s &> /tmp/pe.out" % pe_file.name)
    master.ssh.execute(envars + rootpath +
                       '/qconf -mattr queue pe_list "threaded" all.q '
                       '&> /tmp/pe2q.out')
def restartSge(self, node):
    """
    Restart SGE qmaster (master) and execd (master + nodes) daemons
    """
    log.info("Restarting SGE qmaster and execd daemons")
    rootpath = self.getRootPath(node)
    log.debug("CreateCell.restartSge rootpath: %s", rootpath)
    envars = self.exportEnvironmentVars()
    stop_execd = envars + rootpath + '/qconf -ke all'
    stop_qmaster = envars + rootpath + '/qconf -km'
    start_qmaster = envars + rootpath + '/sge_qmaster'
    start_execd = envars + rootpath + '/sge_execd'
    sleep = 1
    log.debug("sge.CreateCell.restartSge Doing RESTART SGE: %s (%s)",
              node.alias, node.private_ip_address)
    #### KILL ANY LINGERING TERMINATED PROCESSES
    killall = ("/bin/ps aux | grep sgeadmin | cut -c9-14 | "
               "xargs -n1 -iPID /bin/kill -9 PID &> /dev/null")
    log.debug(killall)
    node.ssh.execute(killall, True, False, True)
    killall = ("/bin/ps aux | grep root | grep sge | cut -c9-14 | "
               "xargs -n1 -iPID /bin/kill -9 PID &> /dev/null")
    log.debug(killall)
    node.ssh.execute(killall, True, False, True)
    log.debug("sge.CreateCell.restartSge node.alias: %s", node.alias)
    if node.alias == "master":
        time.sleep(float(sleep))
        log.debug("sge.CreateCell.restartSge %s", start_qmaster)
        node.ssh.execute(start_qmaster)
    log.debug("sge.CreateCell.restartSge %s", start_execd)
    node.ssh.execute(start_execd)
def __init__(self, privatekey, publiccert, cell, execdport, qmasterport,
             root, slots):
    log.info("Loaded plugin: sge.CreateCell")
    log.debug("sge.CreateCell.__init__ Initialising CreateCell plugin.")
    log.debug("sge.CreateCell.__init__ privatekey %s" % privatekey)
    log.debug("sge.CreateCell.__init__ publiccert %s" % publiccert)
    log.debug("sge.CreateCell.__init__ cell %s" % cell)
    log.debug("sge.CreateCell.__init__ execdport %s" % execdport)
    log.debug("sge.CreateCell.__init__ qmasterport %s" % qmasterport)
    log.debug("sge.CreateCell.__init__ root %s" % root)
    log.debug("sge.CreateCell.__init__ slots %s" % slots)
    self.headgroup = "default"
    self.privatekey = privatekey
    self.publiccert = publiccert
    self.cell = cell
    self.execdport = execdport
    self.qmasterport = qmasterport
    self.root = root
    self.slots = slots
    # """ SET HEAD NODE'S ROOT PATH TO SGE BINARIES """
    # rootpath = os.environ['ROOTPATH']
    # rootpath = re.sub(r'^.', '', rootpath)
    # log.info("rootpath: %s", rootpath)
    # self.rootpath = rootpath
    os.environ['SGE_ROOT'] = root
    os.environ['SGE_CELL'] = cell
    os.environ['SGE_QMASTER_PORT'] = qmasterport
    os.environ['SGE_EXECD_PORT'] = execdport
def openSgePorts(self):
    """
    Open the particular SGE qmaster and execd daemon ports for this cluster
    """
    log.info("Opening SGE qmaster and execd ports")
    qmasterport = self.qmasterport
    execdport = self.execdport
    cluster = self.cell
    envars = self.exportEnvironmentVars()
    log.debug("sge.CreateCell.openSgePorts qmasterport; %s", qmasterport)
    log.debug("sge.CreateCell.openSgePorts execdport; %s", execdport)
    log.debug("sge.CreateCell.openSgePorts envars; %s", envars)
    #### SET EC2 KEY FILE ENVIRONMENT VARIABLES
    ec2vars = "export EC2_PRIVATE_KEY=" + self.privatekey + "; "
    ec2vars += "export EC2_CERT=" + self.publiccert + "; "
    # HEAD NODE (I.E., NOT MASTER OR NODE)
    commands = [
        ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + execdport +
        ' -P tcp',
        ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + execdport +
        ' -P udp',
        ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + qmasterport +
        ' -P tcp',
        ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + qmasterport +
        ' -P udp',
        ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + execdport +
        ' -P tcp',
        ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + execdport +
        ' -P udp',
        ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + qmasterport +
        ' -P tcp',
        ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + qmasterport +
        ' -P udp'
    ]
    for command in commands:
        self.runSystemCommand(command)
def execute(self, args):
    if len(args) != 3:
        self.parser.error(
            'you must specify an instance-id, image name, and bucket')
    instanceid, image_name, bucket = args
    self.bucket = bucket
    self.image_name = image_name
    cfg = self.cfg
    ec2 = cfg.get_easy_ec2()
    i = ec2.get_instance(instanceid)
    if not self.opts.confirm:
        for group in i.groups:
            if group.id.startswith(static.SECURITY_GROUP_PREFIX):
                log.warn("Instance %s is a StarCluster instance" % i.id)
                print
                log.warn("Creating an image from a StarCluster instance "
                         "can lead to problems when attempting to use the "
                         "resulting image with StarCluster later on")
                print
                log.warn("The recommended way to re-image a StarCluster "
                         "AMI is to launch a single instance using either "
                         "ElasticFox, the EC2 command line tools, or the "
                         "AWS management console. Then login to the "
                         "instance, modify it, and use this command to "
                         "create a new AMI from it.")
                print
                resp = raw_input("Continue anyway (y/n)? ")
                if resp not in ['y', 'Y', 'yes']:
                    log.info("Aborting...")
                    sys.exit(1)
                break
    self.catch_ctrl_c()
    ami_id = image.create_image(instanceid, image_name, bucket, cfg,
                                **self.specified_options_dict)
    log.info("Your new AMI id is: %s" % ami_id)
def update_dns(self, host_name, ip_address):
    ttl = 10
    host_name = ".".join([host_name, self.domain])
    conn = boto.connect_route53()
    response = conn.get_all_rrsets(self.hosted_zone_id, 'A', host_name,
                                   maxitems=1)
    if len(response):
        response = response[0]
        comment = ("Starcluster route53 plugin deleted record for %s" %
                   host_name)
        changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
        change1 = changes.add_change("DELETE", host_name, 'A', response.ttl)
        for old_value in response.resource_records:
            change1.add_value(old_value)
        try:
            changes.commit()
            log.info(comment)
        except Exception as e:
            log.warning(e)
    comment = ("Starcluster route53 plugin updated record for %s to %s" %
               (host_name, ip_address))
    changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
    change2 = changes.add_change("CREATE", host_name, 'A', ttl)
    change2.add_value(ip_address)
    try:
        changes.commit()
        log.info(comment)
    except Exception as e:
        log.warning(e)
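# Hypothetical usage sketch (an assumption, not part of the original source):
# a plugin hook could keep Route53 in sync as nodes join the cluster by
# calling update_dns() as defined above, e.g.
#
#     def on_add_node(self, node, nodes, master, user, user_shell, volumes):
#         self.update_dns(node.alias, node.ip_address)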
def run(self, nodes, master, user, user_shell, volumes):
    self._check_ipython_installed(master)
    user_home = master.getpwnam(user).pw_dir
    profile_dir = posixpath.join(user_home, '.ipython', 'profile_default')
    master.ssh.switch_user(user)
    self._write_config(master, user, profile_dir)
    # Start the cluster and some engines on the master (leave 1
    # processor free to handle cluster house keeping)
    cfile, n_engines_master = self._start_cluster(master, profile_dir)
    # Start engines on each of the non-master nodes
    non_master_nodes = [node for node in nodes if not node.is_master()]
    for node in non_master_nodes:
        self.pool.simple_job(
            _start_engines, (node, user, node.num_processors),
            jobid=node.alias)
    n_engines_non_master = sum(node.num_processors
                               for node in non_master_nodes)
    if len(non_master_nodes) > 0:
        log.info("Adding %d engines on %d nodes",
                 n_engines_non_master, len(non_master_nodes))
        self.pool.wait(len(non_master_nodes))
    if self.enable_notebook:
        self._start_notebook(master, user, profile_dir)
    n_engines_total = n_engines_master + n_engines_non_master
    log.info(STARTED_MSG % dict(cluster=master.parent_cluster,
                                user=user,
                                connector_file=cfile,
                                key_location=master.key_location,
                                n_engines=n_engines_total,
                                n_nodes=len(nodes)))
    master.ssh.switch_user('root')
def execute(self, args):
    if len(args) != 1:
        self.parser.error("please specify a <tag_name> for this cluster")
    cfg = self.cfg
    use_experimental = cfg.globals.get('enable_experimental')
    if self.opts.spot_bid is not None and not use_experimental:
        raise exception.ExperimentalFeature('Using spot instances')
    tag = self.tag = args[0]
    template = self.opts.cluster_template
    if not template:
        template = cfg.get_default_cluster_template(tag)
        log.info("Using default cluster template: %s" % template)
    cluster_exists = cluster.cluster_exists(tag, cfg)
    create = not self.opts.no_create
    if not cluster_exists and not create:
        raise exception.ClusterDoesNotExist(tag)
    scluster = cfg.get_cluster_template(template, tag)
    scluster.update(self.specified_options_dict)
    validate_running = self.opts.no_create
    validate_only = self.opts.validate_only
    try:
        scluster._validate(validate_running=validate_running)
        if validate_only:
            return
    except exception.ClusterValidationError, e:
        log.error('settings for cluster template "%s" are not valid:' %
                  template)
        raise
def setMasterEtcHosts(self, master):
    log.info("Adding master hostname to own /etc/hosts")
    envars = self.exportEnvironmentVars()
    command = "cat /etc/hosts"
    log.debug("sge.CreateCell.setMasterEtcHosts command: %s" % command)
    etchosts = etchosts_template
    ip_address = master.ip_address
    dns_name = master.dns_name
    insert = master.private_ip_address
    insert += "\t"
    insert += self.getHostname(master)
    insert += "\t"
    insert += "localhost"
    etchosts += insert + "\n"
    log.debug("sge.CreateCell.setMasterEtcHosts AFTER etchosts: %s",
              etchosts)
    etchosts_file = master.ssh.remote_file("/etc/hosts")
    print >> etchosts_file, etchosts
    etchosts_file.close()
    # DEPRECATED:
    # command = "/etc/init.d/networking restart"
    command = "sh -c \"ifdown eth0 && ifup eth0\""
    log.debug("sge.CreateCell.setMasterEtcHosts command: %s", command)
    result = master.ssh.execute(command)
    log.debug("sge.CreateCell.setMasterEtcHosts result: %s", result)
def run(self, nodes, master, user, user_shell, volumes):
    # set up some paths
    repo_dir = get_repo_dir(user)
    setup_script = get_setup_script(user)
    for node in nodes:
        # NOTE: nodes includes master
        log.info("Installing %s as root on %s" % (project_name, node.alias))
        cmd_strs = [
            # FIXME: do this somewhere else
            'pip install pyparsing==2.0.1',
            'pip install patsy',
            'pip install statsmodels',
            'rm -rf %s' % repo_dir,
            'git clone %s %s' % (repo_url, repo_dir),
            'python %s develop' % setup_script,
            # 'python %s build_ext --inplace' % setup_script,
            'chown -R %s %s' % (user, repo_dir),
        ]
        for cmd_str in cmd_strs:
            node.ssh.execute(cmd_str + ' >out 2>err')
    for node in nodes:
        log.info("Setting up %s as %s on %s" %
                 (project_name, user, node.alias))
        cmd_strs = [
            'mkdir -p ~/.matplotlib',
            'echo backend: Agg > ~/.matplotlib/matplotlibrc',
        ]
        for cmd_str in cmd_strs:
            node.shell(user=user, command=cmd_str)
    return
def enableSchedulingInfo(self):
    """
    Enable job scheduling info output for 'qstat -j'
    """
    log.info("Enabling job scheduling info")
    envars = self.exportEnvironmentVars()
    log.debug(envars + self.rootpath + "/qconf -ssconf")
    queue_template = subprocess.Popen(envars + self.rootpath +
                                      "/qconf -ssconf",
                                      stdout=subprocess.PIPE,
                                      shell=True).stdout.read()
    log.debug("sge.CreateCell.enableSchedulingInfo BEFORE queue_template: "
              "%s", queue_template)
    match = "schedd_job_info false"
    insert = "schedd_job_info true"
    queue_template = string.replace(queue_template, match, insert)
    log.debug("sge.CreateCell.enableSchedulingInfo AFTER queue_template: "
              "%s", queue_template)
    pid = os.getpid()
    filename = "/tmp/queue-" + str(os.getpid()) + ".txt"
    queue_file = open(filename, 'w')
    print >> queue_file, queue_template
    queue_file.close()
    cmd = envars + self.rootpath + "/qconf -Msconf " + filename
    log.debug(cmd)
    os.system(cmd)
    remove = "rm -fr " + filename
    log.debug(remove)
    os.system(remove)
def run(self, cluster):
    """
    This function will loop indefinitely, using SGELoadBalancer.get_stats()
    to get the clusters status. It looks at the job queue and tries to
    decide whether to add or remove a node. It should later look at job
    durations (currently doesn't)
    """
    self._cluster = cluster
    if self.max_nodes is None:
        self.max_nodes = cluster.cluster_size
    use_default_stats_file = self.dump_stats and not self.stats_file
    use_default_plots_dir = self.plot_stats and not self.plot_output_dir
    if use_default_stats_file or use_default_plots_dir:
        self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
    if not self.stats_file:
        self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
    if not self.plot_output_dir:
        self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)
    if self.dump_stats:
        if os.path.isdir(self.stats_file):
            raise exception.BaseException("stats file destination '%s' is"
                                          " a directory" % self.stats_file)
        sfdir = os.path.dirname(os.path.abspath(self.stats_file))
        self._validate_dir(sfdir, msg_prefix="stats file destination")
    if self.plot_stats:
        if os.path.isfile(self.plot_output_dir):
            raise exception.BaseException("plot output destination '%s' "
                                          "is a file" %
                                          self.plot_output_dir)
        self._validate_dir(self.plot_output_dir,
                           msg_prefix="plot output destination")
    raw = dict(__raw__=True)
    log.info("Starting load balancer (Use ctrl-c to exit)")
    log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
    log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
    log.info("Cluster growth rate: %d nodes/iteration\n" %
             self.add_nodes_per_iteration, extra=raw)
    if self.dump_stats:
        log.info("Writing stats to file: %s" % self.stats_file)
    if self.plot_stats:
        log.info("Plotting stats to directory: %s" % self.plot_output_dir)
    while self._keep_polling:
        if not cluster.is_cluster_up():
            log.info("Waiting for all nodes to come up...")
            time.sleep(self.polling_interval)
            continue
        self.get_stats()
        log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
        log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                 extra=raw)
        oldest_queued_job_age = self.stat.oldest_queued_job_age()
        if oldest_queued_job_age:
            log.info("Oldest queued job: %s" % oldest_queued_job_age,
                     extra=raw)
        log.info("Avg job duration: %d secs" %
                 self.stat.avg_job_duration(), extra=raw)
        log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                 extra=raw)
        log.info("Last cluster modification time: %s" %
                 self.__last_cluster_mod_time.strftime("%Y-%m-%d %X"),
                 extra=dict(__raw__=True))
        # evaluate if nodes need to be added
        self._eval_add_node()
        # evaluate if nodes need to be removed
        self._eval_remove_node()
        if self.dump_stats or self.plot_stats:
            self.stat.write_stats_to_csv(self.stats_file)
        # call the visualizer
        if self.plot_stats:
            try:
                self.visualizer.graph_all()
            except IOError, e:
                raise exception.BaseException(str(e))
        # evaluate if cluster should be terminated
        if self.kill_cluster:
            if self._eval_terminate_cluster():
                log.info("Terminating cluster and exiting...")
                return self._cluster.terminate_cluster()
        log.info("Sleeping...(looping again in %d secs)\n" %
                 self.polling_interval)
        time.sleep(self.polling_interval)
def _validate_required_progs(self, progs):
    log.info("Checking for required remote commands...")
    self._instance.ssh.check_required(progs)
def _attach_volume(self, vol, instance_id, device):
    log.info("Attaching volume %s to instance %s..." %
             (vol.id, instance_id))
    vol.attach(instance_id, device)
    self.ec2.wait_for_volume(vol, state='attached')
    return self._volume
def _create_snapshot(self, volume):
    snap = self.ec2.create_snapshot(volume, wait_for_snapshot=True)
    log.info("New snapshot id: %s" % snap.id)
    self._snapshot = snap
    return snap
def _create_volume(self, size, zone, snapshot_id=None):
    vol = self.ec2.create_volume(size, zone, snapshot_id)
    self._volume = vol
    log.info("New volume id: %s" % vol.id)
    self.ec2.wait_for_volume(vol, status='available')
    return vol
def generate_key_for_user(self, username, ignore_existing=False,
                          auth_new_key=False, auth_conn_key=False):
    """
    Generates an id_rsa/id_rsa.pub keypair combo for a user on the remote
    machine.

    ignore_existing - if False, any existing key combos will be used rather
    than generating a new RSA key
    auth_new_key - if True, add the newly generated public key to the
    remote user's authorized_keys file
    auth_conn_key - if True, add the public key used to establish this ssh
    connection to the remote user's authorized_keys
    """
    user = self.getpwnam(username)
    home_folder = user.pw_dir
    ssh_folder = posixpath.join(home_folder, '.ssh')
    if not self.ssh.isdir(ssh_folder):
        self.ssh.mkdir(ssh_folder)
    private_key = posixpath.join(ssh_folder, 'id_rsa')
    public_key = private_key + '.pub'
    authorized_keys = posixpath.join(ssh_folder, 'authorized_keys')
    key_exists = self.ssh.isfile(private_key)
    if key_exists and not ignore_existing:
        log.info("Using existing key: %s" % private_key)
        key = self.ssh.load_remote_rsa_key(private_key)
    else:
        key = self.ssh.generate_rsa_key()
    pubkey_contents = self.ssh.get_public_key(key)
    if not key_exists or ignore_existing:
        # copy public key to remote machine
        pub_key = self.ssh.remote_file(public_key, 'w')
        pub_key.write(pubkey_contents)
        pub_key.chown(user.pw_uid, user.pw_gid)
        pub_key.chmod(0400)
        pub_key.close()
        # copy private key to remote machine
        priv_key = self.ssh.remote_file(private_key, 'w')
        key.write_private_key(priv_key)
        priv_key.chown(user.pw_uid, user.pw_gid)
        priv_key.chmod(0400)
        priv_key.close()
    if not auth_new_key and not auth_conn_key:
        # nothing left to do unless we're updating authorized_keys
        return key
    auth_keys_contents = ''
    if self.ssh.isfile(authorized_keys):
        auth_keys = self.ssh.remote_file(authorized_keys, 'r')
        auth_keys_contents = auth_keys.read()
        auth_keys.close()
    auth_keys = self.ssh.remote_file(authorized_keys, 'a')
    if auth_new_key:
        # add newly generated public key to user's authorized_keys
        if pubkey_contents not in auth_keys_contents:
            log.debug("adding auth_key_contents")
            auth_keys.write('%s\n' % pubkey_contents)
    if auth_conn_key and self.ssh._pkey:
        # add public key used to create the connection to user's
        # authorized_keys
        conn_key = self.ssh._pkey
        conn_pubkey_contents = self.ssh.get_public_key(conn_key)
        if conn_pubkey_contents not in auth_keys_contents:
            log.debug("adding conn_pubkey_contents")
            auth_keys.write('%s\n' % conn_pubkey_contents)
    auth_keys.chown(user.pw_uid, user.pw_gid)
    auth_keys.chmod(0600)
    auth_keys.close()
    return key
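# Hypothetical usage sketch (an assumption, not part of the original source):
# a minimal plugin whose run() hook uses generate_key_for_user() as defined
# above to reuse or create the cluster user's keypair on the master and
# authorize it for SSH. The class name SetupUserKey is illustrative only.
from starcluster import clustersetup
from starcluster.logger import log


class SetupUserKey(clustersetup.ClusterSetup):
    def run(self, nodes, master, user, user_shell, volumes):
        log.info("Ensuring %s has an authorized SSH keypair on %s" %
                 (user, master.alias))
        master.generate_key_for_user(user, auth_new_key=True,
                                     auth_conn_key=True)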
def clean_cluster(self, nodes, master, user, user_shell, volumes):
    """
    Run qhost to find nodes that are present in OGS but not in the
    cluster in order to remove them.
    """
    self._master = master
    self._nodes = nodes
    qhost_xml = master.ssh.execute("qhost -xml", source_profile=True)
    qhost_et = ET.fromstringlist(qhost_xml)
    qhosts = []
    for host in qhost_et:
        h_name = host.attrib['name']
        if h_name != 'global':
            qhosts.append(h_name)
    if len(qhosts) == 0:
        log.info("Nothing to clean")
    alive_nodes = [node.alias for node in nodes]
    cleaned = []
    # find dead hosts
    for node_alias in qhosts:
        if node_alias not in alive_nodes:
            cleaned.append(node_alias)
    # find jobs running in dead hosts
    qstats_xml = self._master.ssh.execute("qstat -u \"*\" -xml",
                                          source_profile=True)
    qstats_xml = qstats_xml[1:]  # remove first line
    qstats_et = ET.fromstringlist(qstats_xml)
    to_delete = []
    to_repair = []
    cleaned_queue = []
    # not a lambda function to allow pickling
    for c in cleaned:
        cleaned_queue.append("all.q@" + c)
    for job_list in qstats_et.find("queue_info").findall("job_list"):
        if job_list.find("queue_name").text in cleaned_queue:
            job_number = job_list.find("JB_job_number").text
            to_delete.append(job_number)
    for job_list in qstats_et.find("job_info").findall("job_list"):
        if job_list.find("state").text == "Eqw":
            job_number = job_list.find("JB_job_number").text
            to_repair.append(job_number)
    # delete the jobs
    if to_delete:
        log.info("Stopping jobs: " + str(to_delete))
        self._master.ssh.execute("qdel -f " + " ".join(to_delete))
        time.sleep(3)  # otherwise might provoke LOST QRSH if on last job
    if to_repair:
        log.error("Resetting jobs: " + str(to_repair))
        self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                 ignore_exit_status=True)
    # stuck qrsh issue
    ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
    qstat_wc = int(self._master.ssh.execute("qstat -u \"*\" | wc -l")[0])
    if qstat_wc == 0 and ps_wc > 2:
        log.error("LOST QRSH??")
        log.error("pkill -9 qrsh")
        self._master.ssh.execute("pkill -9 qrsh", ignore_exit_status=True)
    # ----------------------------------
    # delete the host config
    for c in cleaned:
        log.info("Cleaning node " + c)
        if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
            log.warn(c + " is missing from /etc/hosts, creating a dummy "
                     "entry 1.1.1.1")
            rfile = master.ssh.remote_file("/etc/hosts", 'a')
            rfile.write("1.1.1.1 " + c + "\n")
            rfile.close()
        try:
            self._remove_from_sge(DeadNode(c), only_clean_master=True)
        except RemoteCommandFailed:
            log.warning("Failed to remove node {} from sge."
                        .format(c), exc_info=True)
    # fix to allow pickling
    self._master = None
    self._nodes = None
def run(self, nodes, master, user, shell, volumes):
    secretword = self._generate_secretword()
    aliases = map(lambda x: x.alias, nodes)
    for node in nodes:
        log.info("Installing mpich2 on node %s" % node.alias)
        node.ssh.execute("apt-get -y install mpich2")
        log.info("Configuring %s on node %s" % (self.MPD_HOSTS, node.alias))
        mpd_hosts = node.ssh.remote_file(self.MPD_HOSTS, 'w')
        mpd_hosts.write('\n'.join(aliases))
        mpd_hosts.close()
        log.info("Configuring %s on node %s for root" %
                 (self.MPD_CONF, node.alias))
        mpd_conf = node.ssh.remote_file(self.MPD_CONF, 'w')
        mpd_conf.write("secretword=%s\n" % secretword)
        mpd_conf.chmod(0600)
        mpd_conf.close()
        user_home = master.getpwnam(user).pw_dir
        user_mpd_conf = posixpath.join(user_home, '.mpd.conf')
        log.info("Configuring %s for user %s" % (user_mpd_conf, user))
        secretword = self._generate_secretword()
        umpdconf = node.ssh.remote_file(user_mpd_conf)
        umpdconf.write("secretword=%s\n" % secretword)
        umpdconf.chmod(0600)
        umpdconf.close()
    log.info("Launching mpdboot for root")
    master.ssh.execute('mpdboot -f %s -n %d' % (self.MPD_HOSTS, len(nodes)))
    log.info("Launching mpdboot for user %s" % user)
    master.ssh.execute("su -l -c 'mpdboot -f %s -n %d' %s" %
                       (self.MPD_HOSTS, len(nodes), user))
def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
    log.info("Removing %s from ipcluster" % node.alias)
    less_nodes = filter(lambda x: x.id != node.id, nodes)
    self._create_cluster_file(master, less_nodes)
    node.ssh.execute('pkill ipengine')
def run(self, nodes, master, user, shell, volumes):
    aliases = [n.alias for n in nodes]
    log.info("Installing SBT")
    for node in nodes:
        self.pool.simple_job(self._install_sbt, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing S3 and Boto")
    for node in nodes:
        self.pool.simple_job(self._install_s3_and_boto, (node),
                             jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing Cython, IPython Notebook, py4j and Matplotlib")
    for node in nodes:
        self.pool.simple_job(self._install_ipython_notebook, (node),
                             jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("installing GEOS")
    for node in nodes:
        self.pool.simple_job(self._install_geos, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing Basemap")
    for node in nodes:
        self.pool.simple_job(self._install_basemap, (node),
                             jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing PyProj")
    for node in nodes:
        self.pool.simple_job(self._install_pyproj, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing PyGrib")
    for node in nodes:
        self.pool.simple_job(self._install_pygrib, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing Pydoop")
    for node in nodes:
        self.pool.simple_job(self._install_pydoop, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing HDF5 and Libcurl")
    for node in nodes:
        self.pool.simple_job(self._install_hdf5, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing NetCDF-4-C")
    for node in nodes:
        self.pool.simple_job(self._install_netcdf, (node), jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Installing PyNetCDF4")
    for node in nodes:
        self.pool.simple_job(self._install_pynetcdf4, (node),
                             jobid=node.alias)
    self.pool.wait(numtasks=len(nodes))
    log.info("Configuring IPython PySpark profile and startup script")
    self._configure_ipython(master)
    log.info("Opening port for IPython Notebook")
    self._open_ports(master)
    log.info("Don't forget to configure s3cmd with s3cmd --configure!")
def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
    self._master = master
    log.info("No need to remove %s from EFS" % node.alias)
def _generate_secretword(self):
    log.info("Generating MPICH secretword")
    secretword = map(lambda x: x, string.ascii_lowercase + string.digits)
    random.shuffle(secretword)
    return ''.join(secretword)
class CmdStart(ClusterCompleter):
    """
    start [options] <cluster_tag>

    Start a new cluster

    Example:

        $ starcluster start mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the default cluster template defined in the configuration file. The
    default cluster template is specified by the 'default_template' option
    in the [global] section of the config. To use another template besides
    the default use the -c (--cluster-template) option:

        $ starcluster start -c largecluster mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the "largecluster" cluster template instead of the default template.
    """
    names = ['start']

    def addopts(self, parser):
        templates = []
        if self.cfg:
            templates = self.cfg.clusters.keys()
        parser.add_option("-x", "--no-create", dest="no_create",
                          action="store_true", default=False,
                          help="do not launch new EC2 instances when "
                          "starting cluster (use existing instances instead)")
        parser.add_option("-o", "--create-only", dest="create_only",
                          action="store_true", default=False,
                          help="only launch/start EC2 instances, "
                          "do not perform any setup routines")
        parser.add_option("-v", "--validate-only", dest="validate_only",
                          action="store_true", default=False,
                          help="only validate cluster settings, do "
                          "not start a cluster")
        parser.add_option("-V", "--skip-validation", dest="validate",
                          action="store_false", default=True,
                          help="do not validate cluster settings")
        parser.add_option("-l", "--login-master", dest="login_master",
                          action="store_true", default=False,
                          help="login to master node after launch")
        parser.add_option("-q", "--disable-queue", dest="disable_queue",
                          action="store_true", default=None,
                          help="do not configure a queueing system (SGE)")
        parser.add_option("-Q", "--enable-queue", dest="disable_queue",
                          action="store_false", default=None,
                          help="configure a queueing system (SGE) (default)")
        parser.add_option("--force-spot-master", dest="force_spot_master",
                          action="store_true", default=None,
                          help="when creating a spot cluster the default is "
                          "to launch the master as a flat-rate instance for "
                          "stability. this option forces launching the "
                          "master node as a spot instance when a spot "
                          "cluster is requested.")
        parser.add_option("--no-spot-master", dest="force_spot_master",
                          action="store_false", default=None,
                          help="Do not launch the master node as a spot "
                          "instance when a spot cluster is requested. "
                          "(default)")
        parser.add_option("--public-ips", dest="public_ips", default=None,
                          action='store_true',
                          help="Assign public IPs to all VPC nodes "
                          "(VPC clusters only)")
        parser.add_option("--no-public-ips", dest="public_ips", default=None,
                          action='store_false',
                          help="Do NOT assign public ips to all VPC nodes "
                          "(VPC clusters only) (default)")
        opt = parser.add_option("-c", "--cluster-template", action="store",
                                dest="cluster_template", choices=templates,
                                default=None,
                                help="cluster template to use "
                                "from the config file")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-r", "--refresh-interval",
                          dest="refresh_interval", type="int",
                          action="callback", default=None,
                          callback=self._positive_int,
                          help="refresh interval when waiting for cluster "
                          "nodes to come up (default: 30)")
        parser.add_option("-b", "--bid", dest="spot_bid", action="store",
                          type="float", default=None,
                          help="requests spot instances instead of flat "
                          "rate instances. Uses SPOT_BID as max bid for "
                          "the request.")
        parser.add_option("-d", "--description", dest="cluster_description",
                          action="store", type="string",
                          default="Cluster requested at %s" %
                          time.strftime("%Y%m%d%H%M"),
                          help="brief description of cluster")
        parser.add_option("-s", "--cluster-size", dest="cluster_size",
                          action="callback", type="int", default=None,
                          callback=self._positive_int,
                          help="number of ec2 instances to launch")
        parser.add_option("-u", "--cluster-user", dest="cluster_user",
                          action="store", type="string", default=None,
                          help="name of user to create on cluster "
                          "(defaults to sgeadmin)")
        opt = parser.add_option("-S", "--cluster-shell",
                                dest="cluster_shell", action="store",
                                choices=static.AVAILABLE_SHELLS.keys(),
                                default=None,
                                help="shell for cluster user "
                                "(defaults to bash)")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-m", "--master-image-id", dest="master_image_id",
                          action="store", type="string", default=None,
                          help="AMI to use when launching master")
        parser.add_option("-n", "--node-image-id", dest="node_image_id",
                          action="store", type="string", default=None,
                          help="AMI to use when launching nodes")
        parser.add_option("-I", "--master-instance-type",
                          dest="master_instance_type", action="store",
                          choices=sorted(static.INSTANCE_TYPES.keys()),
                          default=None,
                          help="instance type for the master instance")
        opt = parser.add_option("-i", "--node-instance-type",
                                dest="node_instance_type", action="store",
                                choices=sorted(static.INSTANCE_TYPES.keys()),
                                default=None,
                                help="instance type for the node instances")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-a", "--availability-zone",
                          dest="availability_zone", action="store",
                          type="string", default=None,
                          help="availability zone to launch instances in")
        parser.add_option("-k", "--keyname", dest="keyname", action="store",
                          type="string", default=None,
                          help="name of the keypair to use when "
                          "launching the cluster")
        parser.add_option("-K", "--key-location", dest="key_location",
                          action="store", type="string", default=None,
                          metavar="FILE",
                          help="path to an ssh private key that matches the "
                          "cluster keypair")
        parser.add_option("-U", "--userdata-script",
                          dest="userdata_scripts", action="append",
                          default=None, metavar="FILE",
                          help="Path to userdata script that will run on "
                          "each node on start-up. Can be used multiple "
                          "times.")
        parser.add_option("-P", "--dns-prefix", dest="dns_prefix",
                          action='store_true',
                          help="Prefix dns names of all nodes in the "
                          "cluster with the cluster tag")
        parser.add_option("-p", "--no-dns-prefix", dest="dns_prefix",
                          action='store_false',
                          help="Do NOT prefix dns names of all nodes in the "
                          "cluster with the cluster tag (default)")
        # This option is disabled because we need to use nargs='+' which is
        # supported by argparse but not optparse. Use cluster template
        # configuration key SUBNET_IDS instead.
        # parser.add_option("-N", "--subnet-id", dest="subnet_id",
        #                   action="store", type="string",
        #                   help=("Launch cluster into a VPC subnet"))
        parser.add_option("--config-on-master", default=False,
                          action='store_true',
                          help="Store the config on the master node rather "
                          "than into the security group tags")
        parser.add_option("--dns-sufix", action='store_true',
                          help="Suffix dns names of all nodes in the "
                          "cluster with the cluster tag.")

    def execute(self, args):
        if len(args) != 1:
            self.parser.error("please specify a <cluster_tag>")
        tag = args[0]
        if tag.find("master") > -1:
            # Because of Node.is_master
            raise exception.ClusterValidationError("Cluster name cannot "
                                                   "contain master")
        create = not self.opts.no_create
        scluster = self.cm.get_cluster_group_or_none(tag)
        if scluster and create:
            scluster = self.cm.get_cluster(tag, group=scluster,
                                           load_receipt=False,
                                           require_keys=False)
            stopped_ebs = scluster.is_cluster_stopped()
            is_ebs = False
            if not stopped_ebs:
                is_ebs = scluster.is_ebs_cluster()
            raise exception.ClusterExists(tag, is_ebs=is_ebs,
                                          stopped_ebs=stopped_ebs)
        if not create and not scluster:
            raise exception.ClusterDoesNotExist(tag)
        create_only = self.opts.create_only
        validate = self.opts.validate
        validate_running = self.opts.no_create
        validate_only = self.opts.validate_only
        config_on_master = self.opts.config_on_master
        if scluster:
            if config_on_master:
                scluster = self.cm.get_cluster(tag, group=scluster,
                                               load_receipt=False)
                validate_running = False
            else:
                scluster = self.cm.get_cluster(tag, group=scluster)
                validate_running = True
        else:
            template = self.opts.cluster_template
            if not template:
                try:
                    template = self.cm.get_default_cluster_template()
                except exception.NoDefaultTemplateFound, e:
                    try:
                        ctmpl = e.options[0]
                    except IndexError:
                        ctmpl = "smallcluster"
                    e.msg += " \n\nAlternatively, you can specify a cluster "
                    e.msg += "template to use by passing the '-c' option to "
                    e.msg += "the 'start' command, e.g.:\n\n"
                    e.msg += " $ starcluster start -c %s %s" % (ctmpl, tag)
                    raise e
                log.info("Using default cluster template: %s" % template)
            scluster = self.cm.get_cluster_template(template, tag)
        scluster.update(self.specified_options_dict)
        if self.opts.keyname and not self.opts.key_location:
            key = self.cfg.get_key(self.opts.keyname)
            scluster.key_location = key.key_location
        if not self.opts.refresh_interval:
            interval = self.cfg.globals.get("refresh_interval")
            if interval is not None:
                scluster.refresh_interval = interval
        if self.opts.spot_bid is not None and not self.opts.no_create:
            msg = user_msgs.spotmsg % {'size': scluster.cluster_size,
                                       'tag': tag}
            if not validate_only and not create_only:
                self.warn_experimental(msg, num_secs=5)
        if self.opts.dns_prefix:
            if tag.find(".") > -1:
                raise exception.ClusterValidationError(
                    "Cannot use --dns-prefix when the cluster tag contains "
                    "a dot.")
            scluster.dns_prefix = tag
        if self.opts.dns_sufix:
            scluster.dns_sufix = tag
        if config_on_master:
            scluster.config_on_master = True
            if self.opts.no_create:
                validate = False
                log.warning("Cannot start a cluster when its config is "
                            "stored on the master node using StarCluster. "
                            "You should start it manually and then use "
                            "the recovery options.")
                return
        try:
            scluster.start(create=create, create_only=create_only,
                           validate=validate, validate_only=validate_only,
                           validate_running=validate_running,
                           save_config_on_master=self.opts.config_on_master)
        except KeyboardInterrupt:
            if validate_only:
                raise
            else:
                raise exception.CancelledStartRequest(tag)
        if validate_only:
            return
        if not create_only and not self.opts.login_master:
            log.info(user_msgs.cluster_started_msg %
                     dict(tag=scluster.cluster_tag),
                     extra=dict(__textwrap__=True, __raw__=True))
        if self.opts.login_master:
            scluster.ssh_to_master()
def run(self, nodes, master, user, user_shell, volumes):
    self._create_cluster_file(master, nodes)
    log.info("Starting ipcluster...")
    master.ssh.execute(
        "su - %s -c 'screen -d -m ipcluster ssh --clusterfile %s'" %
        (user, self.cluster_file))
class CmdStart(ClusterCompleter):
    """
    start [options] <cluster_tag>

    Start a new cluster

    Example:

        $ starcluster start mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the default cluster template defined in the configuration file. The
    default cluster template is specified by the 'default_template' option
    in the [global] section of the config. To use another template besides
    the default use the -c (--cluster-template) option:

        $ starcluster start -c largecluster mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the "largecluster" cluster template instead of the default template.
    """
    names = ['start']
    tag = None

    def addopts(self, parser):
        templates = []
        if self.cfg:
            templates = self.cfg.get_cluster_names().keys()
        parser.add_option("-x", "--no-create", dest="no_create",
                          action="store_true", default=False,
                          help="do not launch new EC2 instances when "
                          "starting cluster (use existing instances instead)")
        parser.add_option("-o", "--create-only", dest="create_only",
                          action="store_true", default=False,
                          help="only launch/start EC2 instances, "
                          "do not perform any setup routines")
        parser.add_option("-v", "--validate-only", dest="validate_only",
                          action="store_true", default=False,
                          help="only validate cluster settings, do "
                          "not start a cluster")
        parser.add_option("-V", "--skip-validation", dest="validate",
                          action="store_false", default=True,
                          help="do not validate cluster settings")
        parser.add_option("-l", "--login-master", dest="login_master",
                          action="store_true", default=False,
                          help="login to master node after launch")
        parser.add_option("-q", "--disable-queue", dest="disable_queue",
                          action="store_true", default=None,
                          help="do not configure a queueing system (SGE)")
        parser.add_option("--force-spot-master", dest="force_spot_master",
                          action="store_true", default=None,
                          help="when creating a spot cluster the default is "
                          "to launch the master as a flat-rate instance for "
                          "stability. this option forces launching the "
                          "master node as a spot instance when a spot "
                          "cluster is requested.")
        opt = parser.add_option("-c", "--cluster-template", action="store",
                                dest="cluster_template", choices=templates,
                                default=None,
                                help="cluster template to use "
                                "from the config file")
        if optcomplete:
            opt.completer = optcomplete.ListCompleter(opt.choices)
        parser.add_option("-r", "--refresh-interval",
                          dest="refresh_interval", type="int",
                          action="callback", default=None,
                          callback=self._positive_int,
                          help="refresh interval when waiting for cluster "
                          "nodes to come up (default: 30)")
        parser.add_option("-b", "--bid", dest="spot_bid", action="store",
                          type="float", default=None,
                          help="requests spot instances instead of flat "
                          "rate instances. Uses SPOT_BID as max bid for "
                          "the request.")
        parser.add_option("-d", "--description", dest="cluster_description",
                          action="store", type="string",
                          default="Cluster requested at %s" %
                          time.strftime("%Y%m%d%H%M"),
                          help="brief description of cluster")
        parser.add_option("-s", "--cluster-size", dest="cluster_size",
                          action="callback", type="int", default=None,
                          callback=self._positive_int,
                          help="number of ec2 instances to launch")
        parser.add_option("-u", "--cluster-user", dest="cluster_user",
                          action="store", type="string", default=None,
                          help="name of user to create on cluster "
                          "(defaults to sgeadmin)")
        opt = parser.add_option("-S", "--cluster-shell",
                                dest="cluster_shell", action="store",
                                choices=static.AVAILABLE_SHELLS.keys(),
                                default=None,
                                help="shell for cluster user "
                                "(defaults to bash)")
        if optcomplete:
            opt.completer = optcomplete.ListCompleter(opt.choices)
        parser.add_option("-m", "--master-image-id", dest="master_image_id",
                          action="store", type="string", default=None,
                          help="AMI to use when launching master")
        parser.add_option("-n", "--node-image-id", dest="node_image_id",
                          action="store", type="string", default=None,
                          help="AMI to use when launching nodes")
        parser.add_option("-I", "--master-instance-type",
                          dest="master_instance_type", action="store",
                          choices=static.INSTANCE_TYPES.keys(),
                          default=None,
                          help="instance type for the master instance")
        opt = parser.add_option("-i", "--node-instance-type",
                                dest="node_instance_type", action="store",
                                choices=static.INSTANCE_TYPES.keys(),
                                default=None,
                                help="instance type for the node instances")
        if optcomplete:
            opt.completer = optcomplete.ListCompleter(opt.choices)
        parser.add_option("-a", "--availability-zone",
                          dest="availability_zone", action="store",
                          type="string", default=None,
                          help="availability zone to launch instances in")
        parser.add_option("-k", "--keyname", dest="keyname", action="store",
                          type="string", default=None,
                          help="name of the keypair to use when "
                          "launching the cluster")
        parser.add_option("-K", "--key-location", dest="key_location",
                          action="store", type="string", default=None,
                          metavar="FILE",
                          help="path to an ssh private key that matches the "
                          "cluster keypair")

    def cancel_command(self, signum, frame):
        raise exception.CancelledStartRequest(self.tag)

    def execute(self, args):
        if len(args) != 1:
            self.parser.error("please specify a <cluster_tag>")
        tag = self.tag = args[0]
        create = not self.opts.no_create
        create_only = self.opts.create_only
        scluster = self.cm.get_cluster_or_none(tag, require_keys=False)
        validate = self.opts.validate
        validate_running = self.opts.no_create
        validate_only = self.opts.validate_only
        if scluster and create:
            stopped_ebs = scluster.is_cluster_stopped()
            is_ebs = False
            if not stopped_ebs:
                is_ebs = scluster.is_ebs_cluster()
            raise exception.ClusterExists(tag, is_ebs=is_ebs,
                                          stopped_ebs=stopped_ebs)
        if not scluster and not create:
            raise exception.ClusterDoesNotExist(tag)
        elif scluster:
            validate_running = True
        else:
            template = self.opts.cluster_template
            if not template:
                try:
                    template = self.cm.get_default_cluster_template()
                except exception.NoDefaultTemplateFound, e:
                    try:
                        ctmpl = e.options[0]
                    except IndexError:
                        ctmpl = "smallcluster"
                    e.msg += " \n\nAlternatively, you can specify a cluster "
                    e.msg += "template to use by passing the '-c' option to "
                    e.msg += "the 'start' command, e.g.:\n\n"
                    e.msg += " $ starcluster start -c %s %s" % (ctmpl, tag)
                    raise e
                log.info("Using default cluster template: %s" % template)
            scluster = self.cm.get_cluster_template(template, tag)
        scluster.update(self.specified_options_dict)
        if self.opts.keyname and not self.opts.key_location:
            key = self.cfg.get_key(self.opts.keyname)
            scluster.key_location = key.key_location
        if not self.opts.refresh_interval:
            interval = self.cfg.globals.get("refresh_interval")
            if interval is not None:
                scluster.refresh_interval = interval
        if self.opts.spot_bid is not None and not self.opts.no_create:
            msg = user_msgs.spotmsg % {'size': scluster.cluster_size,
                                       'tag': tag}
            if not validate_only and not create_only:
                self.warn_experimental(msg, num_secs=5)
        self.catch_ctrl_c()
        scluster.start(create=create, create_only=create_only,
                       validate=validate, validate_only=validate_only,
                       validate_running=validate_running)
        if validate_only:
            return
        if not create_only and not self.opts.login_master:
            log.info(user_msgs.cluster_started_msg %
                     dict(tag=scluster.cluster_tag),
                     extra=dict(__textwrap__=True, __raw__=True))
        if self.opts.login_master:
            scluster.ssh_to_master()
def on_add_node(self, node, nodes, master, user, user_shell, volumes):
    self._master = master
    self._new_security_group = node.cluster_groups[0].id
    log.info("Adding %s to EFS" % node.alias)
    self._install_efs_on_node(node)
def on_add_node(self, node, nodes, master, user, user_shell, volumes):
    log.info("Adding %s to TMUX Control Center" % node.alias)
    self._add_to_tmuxcc(master, node, user='******')
    self._add_to_tmuxcc(master, node, user=user)
class CmdShell(CmdBase): """ shell Load an interactive IPython shell configured for starcluster development The following objects are automatically available at the prompt: cfg - starcluster.config.StarClusterConfig instance cm - starcluster.cluster.ClusterManager instance ec2 - starcluster.awsutils.EasyEC2 instance s3 - starcluster.awsutils.EasyS3 instance All StarCluster modules are automatically imported in the IPython session along with all StarCluster dependencies (e.g. boto, ssh, etc.) If the --ipcluster=CLUSTER (-p) option is passed, the IPython session will automatically be configured to connect to the remote CLUSTER using IPython's parallel interface (requires IPython 0.11+). In this mode you will have the following additional objects available at the prompt: ipcluster - starcluster.cluster.Cluster instance for the cluster ipclient - IPython.parallel.Client instance for the cluster ipview - IPython.parallel.client.view.DirectView for the cluster Here's an example of how to run a parallel map across all nodes in the cluster: [~]> ipclient.ids [0, 1, 2, 3] [~]> res = ipview.map_async(lambda x: x**30, range(8)) [~]> print res.get() [0, 1, 1073741824, 205891132094649L, 1152921504606846976L, 931322574615478515625L, 221073919720733357899776L, 22539340290692258087863249L] See IPython parallel docs for more details (http://ipython.org/ipython-doc/stable/parallel) """ names = ['shell', 'sh'] def _add_to_known_hosts(self, node): log.info("Configuring local known_hosts file") user_home = os.path.expanduser('~') khosts = os.path.join(user_home, '.ssh', 'known_hosts') if not os.path.isfile(khosts): log.warn("Unable to configure known_hosts: file does not exist") return contents = open(khosts).read() if node.dns_name not in contents: server_pkey = node.ssh.get_server_public_key() khostsf = open(khosts, 'a') if contents[-1] != '\n': khostsf.write('\n') name_entry = '%s,%s' % (node.dns_name, node.ip_address) khostsf.write(' '.join([ name_entry, server_pkey.get_name(), base64.b64encode(str(server_pkey)), '\n' ])) khostsf.close() def addopts(self, parser): parser.add_option("-p", "--ipcluster", dest="ipcluster", action="store", type="string", default=None, metavar="CLUSTER", help="configure a parallel " "IPython session on CLUSTER") def execute(self, args): local_ns = dict(cfg=self.cfg, ec2=self.ec2, s3=self.s3, cm=self.cm, starcluster=starcluster, log=log) if self.opts.ipcluster: log.info("Loading parallel IPython library") try: from IPython.parallel import Client except ImportError, e: self.parser.error( "Error loading parallel IPython:" "\n\n%s\n\n" "NOTE: IPython 0.11+ must be installed to use -p" % e) tag = self.opts.ipcluster cl = self.cm.get_cluster(tag) region = cl.master_node.region.name ipcluster_dir = os.path.join(static.STARCLUSTER_CFG_DIR, 'ipcluster') local_json = os.path.join(ipcluster_dir, "%s-%s.json" % (tag, region)) if not os.path.exists(local_json): user_home = cl.master_node.getpwnam(cl.cluster_user).pw_dir profile_dir = posixpath.join(user_home, '.ipython', 'profile_default') json = posixpath.join(profile_dir, 'security', 'ipcontroller-client.json') if cl.master_node.ssh.isfile(json): log.info("Fetching connector file from cluster...") if not os.path.exists(ipcluster_dir): os.makedirs(ipcluster_dir) cl.master_node.ssh.get(json, local_json) else: self.parser.error( "IPython json file %s does not exist locally or on " "the cluster. Make sure the ipcluster plugin has " "been executed and completed successfully." % json) key_location = cl.master_node.key_location self._add_to_known_hosts(cl.master_node) log.info("Loading parallel IPython client and view") rc = Client(local_json, sshkey=key_location) local_ns['Client'] = Client local_ns['ipcluster'] = cl local_ns['ipclient'] = rc local_ns['ipview'] = rc[:] modules = [(starcluster.__name__ + '.' + module, module) for module in starcluster.__all__] modules += [('boto', 'boto'), ('paramiko', 'paramiko'), ('workerpool', 'workerpool'), ('jinja2', 'jinja2'), ('pyasn1', 'pyasn1'), ('iptools', 'iptools')] for fullname, modname in modules: log.info('Importing module %s' % modname) try: __import__(fullname) local_ns[modname] = sys.modules[fullname] except ImportError, e: log.error("Error loading module %s: %s" % (modname, e))
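# A minimal, standalone sketch of the connection pattern the shell command above
# uses when --ipcluster is passed, assuming IPython 0.11+ and a connector file
# that has already been fetched from the cluster. The json path and key location
# below are illustrative placeholders, not values StarCluster computes for you.
import os
from IPython.parallel import Client

local_json = os.path.expanduser('~/.starcluster/ipcluster/mycluster-us-east-1.json')
key_location = os.path.expanduser('~/.ssh/mykey.rsa')

rc = Client(local_json, sshkey=key_location)  # tunnels to the controller over SSH
view = rc[:]                                  # DirectView over all engines
print view.map_async(lambda x: x ** 2, range(4)).get()  # -> [0, 1, 4, 9]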
def _configure_hadoop(self, master, nodes, user): log.info("Configuring Hadoop...") log.info("Adding user %s to hadoop group" % user) for node in nodes: self.pool.simple_job(self._setup_hadoop_user, (node, user), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) node_aliases = map(lambda n: n.alias, nodes) cfg = {'master': master.alias, 'replication': 3, 'hadoop_tmpdir': posixpath.join(self.hadoop_tmpdir, 'hadoop-${user.name}'), 'node_type': 'namenode'} log.info("Installing configuration templates...") # for node in nodes: # self.pool.simple_job(self._install_empty_conf, (node,), # jobid=node.alias) # self.pool.wait(numtasks=len(nodes)) log.info("Configuring environment...") for node in nodes: self.pool.simple_job(self._configure_env, (node,), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring Core Site...") for node in nodes: self.pool.simple_job(self._configure_core, (node, cfg), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring YARN Site...") for node in nodes: self.pool.simple_job(self._configure_yarn, (node, cfg), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring MapReduce Site...") for node in nodes: self.pool.simple_job(self._configure_mapreduce_site, (node, cfg), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring HDFS Site...") for node in nodes: if not node.is_master(): cfg.update({'node_type': 'datanode'}) self.pool.simple_job(self._configure_hdfs_site, (node, cfg), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring masters file...") for node in nodes: self.pool.simple_job(self._configure_masters, (node, master), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) log.info("Configuring slaves file...") for node in nodes: self.pool.simple_job(self._configure_slaves, (node, node_aliases), jobid=node.alias) self.pool.wait(numtasks=len(nodes)) # log.info("Configuring HDFS...") # for node in nodes: # self.pool.simple_job(self._setup_hdfs, (node, user), # jobid=node.alias) # self.pool.wait(numtasks=len(nodes)) log.info("Configuring dumbo...") for node in nodes: self.pool.simple_job(self._setup_dumbo, (node,), jobid=node.alias) self.pool.wait(numtasks=len(nodes))
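# A minimal sketch of how the 'cfg' context built above might be applied to a
# configuration template. The template text, the property values (e.g. the
# hdfs:// port) and the render_core_site() helper are hypothetical and only
# illustrate the substitution step; they are not part of this plugin.
from string import Template

CORE_SITE_TMPL = Template("""<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://$master:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>$hadoop_tmpdir</value>
  </property>
</configuration>
""")

def render_core_site(cfg):
    # $master and $hadoop_tmpdir come from cfg; the ${user.name} token inside
    # cfg['hadoop_tmpdir'] is copied through verbatim for Hadoop to expand.
    return CORE_SITE_TMPL.safe_substitute(cfg)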
def on_remove_node(self, node, nodes, master, user, user_shell, volumes): log.info("Removing %s from TMUX Control Center" % node.alias) self._remove_from_tmuxcc(master, node, user='******') self._remove_from_tmuxcc(master, node, user=user)
def _setup_hadoop_user(self, node, user): log.info("Skipping setup-hadoop-user...")
def run(self, nodes, master, user, user_shell, volumes): #self._configure_hadoop(master, nodes, user) #self._start_hadoop(master, nodes) #self._open_ports(master) log.info("Job tracker status: http://%s:54311" % master.dns_name) log.info("Namenode status: http://%s:50070" % master.dns_name)
def _format_volume(self): log.info("Formatting volume...") self._instance.ssh.execute('%s -F %s' % (self._mkfs_cmd, self._device), silent=False)
def _install_empty_conf(self, node): log.info("Skipping install-empty-conf...")
def _setup_ebs_volumes(self): """ Mount EBS volumes, if specified in ~/.starcluster/config, to /home """ # setup /etc/fstab on master to use block device if specified master = self._master devs = master.ssh.ls('/dev') for vol in self._volumes: vol = self._volumes[vol] vol_id = vol.get("volume_id") mount_path = vol.get('mount_path') device = vol.get("device") volume_partition = vol.get('partition') if not (vol_id and device and mount_path): log.error("missing required settings for vol %s" % vol) continue dev_exists = master.ssh.path_exists(device) if not dev_exists and device.startswith('/dev/sd'): # check for "correct" device in unpatched kernels device = device.replace('/dev/sd', '/dev/xvd') dev_exists = master.ssh.path_exists(device) if not dev_exists: log.warn("Cannot find device %s for volume %s" % (device, vol_id)) log.warn("Not mounting %s on %s" % (vol_id, mount_path)) log.warn("This usually means there was a problem " "attaching the EBS volume to the master node") continue if not volume_partition: partitions = filter(lambda x: x.startswith(device), devs) if len(partitions) == 1: volume_partition = device elif len(partitions) == 2: volume_partition = device + '1' else: log.error( "volume has more than one partition, please specify " "which partition to use (e.g. partition=0, " "partition=1, etc.) in the volume's config") continue elif not master.ssh.path_exists(volume_partition): log.warn("Cannot find partition %s on volume %s" % (volume_partition, vol_id)) log.warn("Not mounting %s on %s" % (vol_id, mount_path)) log.warn("This either means that the volume has not " "been partitioned or that the partition " "specified does not exist on the volume") continue log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path)) mount_map = self._master.get_mount_map() dev = mount_map.get(volume_partition) if dev: path, fstype, options = dev if path != mount_path: log.error("Volume %s is mounted on %s, not on %s" % (vol_id, path, mount_path)) else: log.info("Volume %s already mounted on %s...skipping" % (vol_id, mount_path)) continue self._master.mount_device(volume_partition, mount_path)
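# A minimal sketch of the mapping _setup_ebs_volumes() iterates over. The volume
# id, device and mount path are made-up placeholders; only the keys
# ('volume_id', 'device', 'mount_path', 'partition') are taken from the code
# above. When 'partition' is empty the method tries to detect it from /dev.
example_volumes = {
    'mydata': {
        'volume_id': 'vol-XXXXXXXX',   # EBS volume attached to the master
        'device': '/dev/sdz',          # remapped to /dev/xvdz on unpatched kernels
        'mount_path': '/mydata',       # mount point created on the master
        'partition': None,             # falsy => auto-detect the partition
    },
}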
def on_remove_node(self, remove_node, nodes, master, user, user_shell, volumes): log.info("Removing %s from MPICH2 hosts file" % remove_node.alias) master.ssh.remove_lines_from_file(self.MPICH2_HOSTS, remove_node.alias)
def on_add_node(self, node, nodes, master, user, user_shell, volumes): log.info("Adding %s to TMUX Control Center" % node.alias)
def _eval_add_node(self): """ This function inspects the current state of the SGE queue and decides whether or not to add nodes to the cluster. Returns the number of nodes to add. """ num_nodes = len(self._cluster.nodes) if num_nodes >= self.max_nodes: log.info("Not adding nodes: already at or above maximum (%d)" % self.max_nodes) return queued_jobs = self.stat.get_queued_jobs() if not queued_jobs and num_nodes >= self.min_nodes: log.info("Not adding nodes: at or above minimum nodes " "and no queued jobs...") return total_slots = self.stat.count_total_slots() if not self.has_cluster_stabilized() and total_slots > 0: return running_jobs = self.stat.get_running_jobs() used_slots = sum([int(j['slots']) for j in running_jobs]) qw_slots = sum([int(j['slots']) for j in queued_jobs]) slots_per_host = self.stat.slots_per_host() avail_slots = total_slots - used_slots need_to_add = 0 if num_nodes < self.min_nodes: log.info("Adding node: below minimum (%d)" % self.min_nodes) need_to_add = self.min_nodes - num_nodes elif total_slots == 0: # no slots, add one now need_to_add = 1 elif qw_slots > avail_slots: log.info("Queued jobs need more slots (%d) than available (%d)" % (qw_slots, avail_slots)) oldest_job_dt = self.stat.oldest_queued_job_age() now = self.get_remote_time() age_delta = now - oldest_job_dt if age_delta.seconds > self.longest_allowed_queue_time: log.info("A job has been waiting for %d seconds " "longer than max: %d" % (age_delta.seconds, self.longest_allowed_queue_time)) if slots_per_host != 0: need_to_add = qw_slots / slots_per_host else: need_to_add = 1 else: log.info("No queued jobs older than %d seconds" % self.longest_allowed_queue_time) max_add = self.max_nodes - len(self._cluster.running_nodes) need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add) if need_to_add > 0: log.warn("Adding %d nodes at %s" % (need_to_add, str(utils.get_utc_now()))) try: self._cluster.add_nodes(need_to_add) self.__last_cluster_mod_time = utils.get_utc_now() log.info("Done adding nodes at %s" % str(self.__last_cluster_mod_time)) except Exception: log.error("Failed to add new host", exc_info=True)
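# A worked example, with made-up numbers, of the slot arithmetic above: suppose
# SGE reports 8 total slots, running jobs hold 6 of them, queued jobs request
# 10, and each host provides 4 slots. Queued demand (10) exceeds the 2 free
# slots, so need_to_add is qw_slots / slots_per_host = 2 hosts, before being
# clamped by add_nodes_per_iteration and max_nodes.
total_slots = 8
used_slots = 6
qw_slots = 10
slots_per_host = 4

avail_slots = total_slots - used_slots       # 2
assert qw_slots > avail_slots                # triggers the "need more slots" branch
need_to_add = qw_slots / slots_per_host      # 2 (Python 2 integer division)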
def do_shutdown(self): log.info("Shutting down server...") self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.server.stop = True
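# A minimal sketch (not this project's actual server class) of the serve loop
# the 'stop' flag set by do_shutdown() above is assumed to control: requests are
# handled one at a time and the loop exits once a handler sets server.stop.
import BaseHTTPServer

class StoppableHTTPServer(BaseHTTPServer.HTTPServer):
    stop = False

    def serve_until_stopped(self):
        while not self.stop:
            self.handle_request()  # do_shutdown() flips self.stop to True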
def on_remove_node(self, node, nodes, master, user, user_shell, volumes): log.info("Removing %s from TMUX Control Center" % node.alias)