def mount_volume_to_instance(instance, foldername):
     log.info("\t ... mounting to {0} ...".format(instance.id))
     instance.ssh.execute("sudo rm -rf {0}".format(foldername))
     instance.ssh.execute("sudo mkdir {0}".format(foldername))
     instance.ssh.execute("mount /dev/xvdh {0}".format(foldername))
     instance.ssh.execute("sudo chown scidb:scidb {0}".format(foldername))
     log.info("\t ... success ...")
Example #2
 def cancel_command(self, signum, frame):
     """
     Exits program with return value of 1
     """
     print
     log.info("Exiting...")
     sys.exit(1)
    def _install_combblas(self, node):
        log.info("\tInstalling CombBLAS")
        instructions = [
            "wget -O combblas.tgz %s" % self.combblas_source,
            "tar xvfz combblas.tgz",
            "rm combblas.tgz"
        ]
        self._follow_instructions(instructions, node)

        # Expects the combblas.patch file to be in the same directory as this source file
        patchfname = os.path.dirname(inspect.getsourcefile(SkylarkInstaller)) + '/combblas.patch'
        log.info(patchfname)
        node.ssh.put(patchfname, 'CombBLAS/combblas.patch')

        instructions = [
            "cd CombBLAS",
            "yes | git apply --ignore-space-change --ignore-whitespace combblas.patch",
            "rm combblas.patch",
            "cmake .",
            "make -j %s" % self.nproc,
            "cp *.so /usr/local/lib",
            "mkdir /usr/local/include/CombBLAS",
            "cp *.h /usr/local/include/CombBLAS",
            "cp *.cpp /usr/local/include/CombBLAS",
            "cp -R SequenceHeaps /usr/local/include/CombBLAS",
            "cp -R psort-1.0 /usr/local/include/CombBLAS",
            "cp -R graph500-1.2 /usr/local/include/CombBLAS",
            "cd ..",
            "rm -r CombBLAS"
        ]
        self._follow_instructions(instructions, node)
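`_follow_instructions` is not shown in this snippet. A plausible sketch, assuming each instruction list is meant to run as one shell session on the node so that the `cd CombBLAS` above affects the commands that follow it (the real helper may differ):

    def _follow_instructions(self, instructions, node):
        # Assumption: join the steps with '&&' so a failed step aborts the
        # rest and directory changes carry over to later commands.
        node.ssh.execute(" && ".join(instructions))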
Example #4
 def execute(self, args):
     if "createimage" in sys.argv:
         warnings.warn("createimage is deprecated and will go away in the "
                       "next release. please use the s3image/ebsimage "
                       "commands instead", DeprecationWarning)
     if len(args) != 3:
         self.parser.error(
             'you must specify an instance-id, image name, and bucket')
     bucket = None
     instanceid, image_name, bucket = args
     self.bucket = bucket
     self.image_name = image_name
     i = self.ec2.get_instance(instanceid)
     key_location = self.cfg.get_key(i.key_name).get('key_location')
     aws_user_id = self.cfg.aws.get('aws_user_id')
     ec2_cert = self.cfg.aws.get('ec2_cert')
     ec2_private_key = self.cfg.aws.get('ec2_private_key')
     try:
         ami_id = self.ec2.create_s3_image(instanceid, key_location,
                                           aws_user_id, ec2_cert,
                                           ec2_private_key, bucket,
                                           image_name=image_name,
                                           **self.specified_options_dict)
         log.info("Your new AMI id is: %s" % ami_id)
     except KeyboardInterrupt:
         raise exception.CancelledS3ImageCreation(self.bucket,
                                                  self.image_name)
Example #5
 def _upload_image(self):
     log.info('Uploading bundled image: (please be patient)')
     conn = self.host_ssh
     config_dict = self.config_dict
     conn.execute('ec2-upload-bundle -b %(bucket)s '
                  '-m /mnt/%(prefix)s.manifest.xml -a %(access_key)s '
                  '-s %(secret_key)s' % config_dict, silent=False)
Example #6
    def export_fs_to_nodes(self, nodes, export_paths):
        """
        Export each path in export_paths to each node in nodes via NFS

        nodes - list of nodes to export each path to
        export_paths - list of paths on this remote host to export to each node

        Example:
        # export /home and /opt/sge6 to each node in nodes
        $ node.start_nfs_server()
        $ node.export_fs_to_nodes(nodes=[node1,node2],
                                  export_paths=['/home', '/opt/sge6'])
        """
        # setup /etc/exports
        log.info("Configuring NFS exports path(s):\n%s" %
                 ' '.join(export_paths))
        nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
        etc_exports = self.ssh.remote_file('/etc/exports', 'r')
        contents = etc_exports.read()
        etc_exports.close()
        etc_exports = self.ssh.remote_file('/etc/exports', 'a')
        for node in nodes:
            for path in export_paths:
                export_line = ' '.join(
                    [path, node.alias + nfs_export_settings + '\n'])
                if export_line not in contents:
                    etc_exports.write(export_line)
        etc_exports.close()
        self.ssh.execute('exportfs -fra')
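For reference, each pass through the loop above appends one line per (path, node) pair; with a node aliased node001 and an export path of /home, the entry written to /etc/exports would look roughly like this (alias and path are illustrative):

/home node001(async,no_root_squash,no_subtree_check,rw)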
Example #7
 def start_nfs_server(self):
     log.info("Starting NFS server on %s" % self.alias)
     self.ssh.execute('/etc/init.d/portmap start')
     self.ssh.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/',
                      ignore_exit_status=True)
     self.ssh.execute('/etc/init.d/nfs start')
     self.ssh.execute('/usr/sbin/exportfs -fra')
Example #8
 def run(self, nodes, master, user, user_shell, volumes):
      if self.var_str != "":
          escaped_vars = self.var_str.replace("'", "\\'")
          for node in nodes:
              log.info("Adding vars to: %s" % node.alias)
              node.ssh.execute("echo '" + escaped_vars + "' >> .bashrc")
              if self.envar_location is not None:
                  node.ssh.execute("echo '" + escaped_vars + "' >> " +
                                   self.envar_location)
Example #9
 def root_device_name(self):
     root_dev = self.instance.root_device_name
     bmap = self.block_device_mapping
     if bmap and root_dev not in bmap and self.is_ebs_backed():
         # Hack for misconfigured AMIs (e.g. CentOS 6.3 Marketplace) These
         # AMIs have root device name set to /dev/sda1 but no /dev/sda1 in
         # block device map - only /dev/sda. These AMIs somehow magically
         # work so check if /dev/sda exists and return that instead to
         # prevent detach_external_volumes() from trying to detach the root
         # volume on these AMIs.
         log.warn("Root device %s is not in the block device map" %
                  root_dev)
         log.warn("This means the AMI was registered with either "
                  "an incorrect root device name or an incorrect block "
                  "device mapping")
         sda, sda1 = '/dev/sda', '/dev/sda1'
         if root_dev == sda1:
             log.info("Searching for possible root device: %s" % sda)
             if sda in self.block_device_mapping:
                 log.warn("Found '%s' - assuming its the real root device" %
                          sda)
                 root_dev = sda
             else:
                 log.warn("Device %s isn't in the block device map either" %
                          sda)
     return root_dev
def run_cmd(node, cmd, user, silent=True):
    log.info("%s@%s: %s" % (user, node.alias, cmd))
    if user != 'root':
        node.ssh.switch_user(user)
    node.ssh.execute(cmd, silent=silent)
    if user != 'root':
        node.ssh.switch_user('root')
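A hedged usage sketch, assuming `node` is a cluster node object whose `ssh.switch_user()` changes the user for subsequent `ssh.execute()` calls as implied above (the command and user below are illustrative):

# Hypothetical call: run a command on every node as the cluster user
for node in nodes:
    run_cmd(node, "qhost", user="sgeadmin", silent=False)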
Example #11
    def run(self):
        """
        As soon as a new node is ready, run the add plugins commands over it.
        """
        interval = self.cluster.refresh_interval
        log.info("Waiting for one of the new nodes to be up "
                 "(updating every {}s)".format(interval))

        while True:
            self.ready_instances = []
            self.stream_unpropagated_spots()
            self.stream_spots()
            self.stream_unpropagated_instances()
            self.stream_update_nrm()
            self.stream_instances()
            self.stream_manage_reboots()
            self.stream_ready_instances()

            if any([self.unpropagated_spots, self.spots,
                    self.unpropagated_instances, self.instances]):
                if self.ready_instances:
                    # ready_instances means nodes were added, that took
                    # time so we should loop again now
                    continue
                log.info("Sleeping for {} seconds".format(interval))
                time.sleep(interval)
            else:
                break
Example #12
 def create(self, volume_size, volume_zone, name=None, tags=None):
     try:
         self.validate(volume_size, volume_zone, self._aws_block_device)
         instance = self._request_instance(volume_zone)
         self._validate_required_progs([self._mkfs_cmd.split()[0]])
         self._determine_device()
         vol = self._create_volume(volume_size, volume_zone)
         if tags:
             for tag in tags:
                 tagval = tags.get(tag)
                 tagmsg = "Adding volume tag: %s" % tag
                 if tagval:
                     tagmsg += "=%s" % tagval
                 log.info(tagmsg)
                 vol.add_tag(tag, tagval)
         if name:
             vol.add_tag("Name", name)
         self._attach_volume(self._volume, instance.id,
                             self._aws_block_device)
         self._get_volume_device(self._aws_block_device)
         self._format_volume()
         self.shutdown()
         log.info("Your new %sGB volume %s has been created successfully" %
                  (volume_size, vol.id))
         return vol
     except Exception:
         log.error("Failed to create new volume", exc_info=True)
         self._delete_new_volume()
         raise
     finally:
         self._warn_about_volume_hosts()
Example #13
 def _setup_ebs_volume(self):
     """ Mount EBS volume, if specified, in ~/.starclustercfg to /home"""
     # setup /etc/fstab on master to use block device if specified
     for vol in self._volumes:
         vol = self._volumes[vol]
         vol_id = vol.get("volume_id")
         device = vol.get("device")
         volume_partition = vol.get("partition")
         mount_path = vol.get("mount_path")
         if vol_id and volume_partition and mount_path:
             log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
             mconn = self._master.ssh
             if not mconn.path_exists(device):
                 log.warn("Cannot find device %s for volume %s" % (device, vol))
                 log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                 log.warn("This usually means there was a problem" + "attaching the EBS volume to the master node")
                 continue
             if not mconn.path_exists(volume_partition):
                 log.warn("Cannot find partition %s on volume %s" % (volume_partition, vol_id))
                 log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                  log.warn("This either means that the volume has not been "
                           "partitioned or that the partition specified "
                           "does not exist on the volume")
                 continue
             master_fstab = mconn.remote_file("/etc/fstab", mode="a")
             print >> master_fstab, "%s %s auto noauto,defaults 0 0 " % (volume_partition, mount_path)
             master_fstab.close()
             mconn.execute("mkdir -p %s" % mount_path)
             mconn.execute("mount %s" % mount_path)
Example #14
    def openNfsPorts(self, group):
        """
            Open (fixed) NFS-related ports (portmap, nfs and mountd)
        """
        portmapport = self.portmapport
        nfsport     = self.nfsport
        mountdport  = self.mountdport
        
        log.info("Opening NFS-related ports for group: %s", group)
        log.debug("automount.openNfsPorts    group; %s", group)
        log.debug("automount.openNfsPorts    portmapport; %s", portmapport)
        log.debug("automount.openNfsPorts    nfsport; %s", nfsport)
        log.debug("automount.openNfsPorts    mountdport; %s", mountdport)
        
        permissions = [
            dict(group=group, port=nfsport, type="tcp"),
            dict(group=group, port=nfsport, type="udp"),
            dict(group=group, port=portmapport, type="tcp"),
            dict(group=group, port=portmapport, type="udp"),
            dict(group=group, port=mountdport, type="tcp"),
            dict(group=group, port=mountdport, type="udp")
        ]

        #### OPEN PORTS FROM HEAD NODE (NO SSH FROM MASTER)
        commands = self.setPortCommands(group, permissions)
        for command in commands:
            self.runSystemCommand(command)
Example #15
 def list_all_instances(self, show_terminated=False):
     reservations = self.conn.get_all_instances()
     if not reservations:
         log.info("No instances found")
     for res in reservations:
         groups = ', '.join([ g.id for g in res.groups]) or 'N/A'
         for instance in res.instances:
             if instance.state == 'terminated' and not show_terminated:
                 continue
             id = instance.id or 'N/A'
             dns_name = instance.dns_name or 'N/A'
             private_dns_name = instance.private_dns_name or 'N/A'
             state = instance.state or 'N/A'
             private_ip = instance.private_ip_address or 'N/A'
             public_ip = instance.ip_address or 'N/A'
             zone = instance.placement or 'N/A'
             ami = instance.image_id or 'N/A'
             keypair = instance.key_name or 'N/A'
             print "id: %s" % id
             print "dns_name: %s" % dns_name
             print "private_dns_name: %s" % private_dns_name
             print "state: %s" % state
             print "public_ip: %s" % public_ip 
             print "private_ip: %s" % private_ip
             print "zone: %s" % zone
             print "ami: %s" % ami
             print "groups: %s" % groups
             print "keypair: %s" % keypair
             print
Example #16
    def __init__(self, privatekey, publiccert, interval, sourcedirs, mountpoints, portmapport, nfsport, mountdport, cluster):
        log.info("Loaded plugin: automount.NfsShares")
        log.debug("automount.NfsShares.__init__    Initialising AutoMount plugin.")
        log.debug("automount.NfsShares.__init__    privatekey %s" % privatekey)
        log.debug("automount.NfsShares.__init__    publiccert %s" % publiccert)        
        log.debug("automount.NfsShares.__init__    interval %s" % interval)
        log.debug("automount.NfsShares.__init__    sourcedirs %s" % sourcedirs)
        log.debug("automount.NfsShares.__init__    mountpoints %s" % mountpoints)
        log.debug("automount.NfsShares.__init__    portmapport %s" % portmapport)
        log.debug("automount.NfsShares.__init__    nfsport %s" % nfsport)
        log.debug("automount.NfsShares.__init__    mountdport %s" % mountdport)
        log.debug("automount.NfsShares.__init__    cluster %s" % cluster)
        
        self.privatekey     =   privatekey
        self.publiccert     =   publiccert
        self.portmapport    =   portmapport
        self.nfsport        =   nfsport
        self.mountdport     =   mountdport
        self.cluster        =   cluster

        # set default interval
        if not interval: interval = 10
        self.interval = interval
        self.sourcedirs = sourcedirs.split(",")
        self.mountpoints = mountpoints.split(",")
        
        if len(self.sourcedirs) != len(self.mountpoints):
            log.info("automount.NfsShares.__init__    length of sourcedirs (" 
                + len(self.sourcedirs) 
                + ") is not the same as length of mountpoints ("
                + len(self.mountpoints)
                + ")"
                )
            sys.exit(0)
Example #17
    def run(self, nodes, master, user, user_shell, volumes):
        """
            Mount NFS shares on master and all nodes
        """
        log.info("Running plugin automount")
        log.debug("automount.NfsShares.run    automount.NfsShares.run(nodes, master, user, user_shell, volumes)")

        #### OPEN NFS-RELATED PORTS FOR THIS CLUSTER
        self.openNfsPorts("default")
        self.openNfsPorts('@sc-' + self.cluster)

        #### SET HEAD NODE INTERNAL IP
        self.getHeadIp()

        #### FIX mountd PORT ON head AND MASTER/NODES
        mountdport = "32767"
        for node in nodes:
            self.setMountdOnNode(node, mountdport)
        
        self.setMountdOnHead(mountdport)
        self.restartServicesOnHead()

        #### MOUNT ON ALL NODES
        for node in nodes:
            self.mount(node)

        log.info("Completed plugin automount")
Example #18
 def wrap_f(func, *arg, **kargs):
     """Raw timing function """
     time1 = time.time()
     res = func(*arg, **kargs)
     time2 = time.time()
     log.info('%s took %0.3f mins' % (prefix, (time2 - time1) / 60.0))
     return res
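`wrap_f` reads like the inner function of a timing decorator: its `(func, *arg, **kargs)` signature matches the calling convention of the third-party `decorator` package. A minimal sketch of how the enclosing code might wrap it, assuming that convention (the outer names and logger setup here are illustrative, not confirmed by the snippet):

import time
import logging
import decorator

log = logging.getLogger(__name__)

def print_timing(prefix):
    # Illustrative outer layers; only wrap_f appears in the original snippet.
    def wrap_f(func, *arg, **kargs):
        """Raw timing function """
        time1 = time.time()
        res = func(*arg, **kargs)
        time2 = time.time()
        log.info('%s took %0.3f mins' % (prefix, (time2 - time1) / 60.0))
        return res
    return decorator.decorator(wrap_f)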
Example #19
 def execute(self, args):
     if not args:
         self.parser.error("please specify a cluster")
     for cluster_name in args:
         cl = self.cm.get_cluster(cluster_name)
         is_ebs = cl.is_ebs_cluster()
         if not self.opts.confirm:
             action = "Terminate"
             if is_ebs:
                 action = "Stop EBS"
                 if cl.spot_bid:
                     action = "Terminate Spot EBS"
             resp = raw_input("%s cluster %s (y/n)? " %
                              (action, cluster_name))
             if resp not in ['y', 'Y', 'yes']:
                 log.info("Aborting...")
                 continue
         cl.stop_cluster()
         if is_ebs and cl._nodes:
             log.warn(("All EBS-backed nodes in '%s' are now in a " + \
                       "'stopped' state") % cluster_name)
             log.warn("You can restart this cluster by passing -x " + \
                      "to the 'start' command")
             log.warn("Use the 'terminate' command to *completely* " + \
                      "terminate this cluster")
             log.warn("NOTE: Unless EBS-backed nodes are in a " + \
                      "'running' or 'terminated'")
             log.warn("state, you are charged for the EBS volumes " + \
                      "backing the nodes.")
Example #20
    def addParallelEnvironment(self, master):
        """
            Add 'threaded' parallel environment
        """
        log.info("Adding 'threaded' parallel environment")

        sge_pe_template = """
        pe_name           threaded
        slots             %s
        user_lists        NONE
        xuser_lists       NONE
        start_proc_args   /bin/true
        stop_proc_args    /bin/true
        allocation_rule   $pe_slots
        control_slaves    TRUE
        job_is_first_task FALSE
        urgency_slots     min
        accounting_summary FALSE
        """
        
        log.debug("addParallelEnvironment    sge_pe_template: %s", sge_pe_template)
        
        #### PRINT TEMPLATE FILE
        pe_file = master.ssh.remote_file("/tmp/pe.txt")
        print >> pe_file, sge_pe_template % 99999
        pe_file.close()
        
        envars = self.exportEnvironmentVars()
        
        rootpath = self.getRootPath(master)
        log.debug("CreateCell.addParallelEnvironment    rootpath: %s", rootpath)

        master.ssh.execute(envars + rootpath + "/qconf -Ap %s &> /tmp/pe.out" % pe_file.name)
        master.ssh.execute(envars + rootpath + '/qconf -mattr queue pe_list "threaded" all.q &> /tmp/pe2q.out')
Example #21
    def restartSge(self, node):
        """
            Restart SGE qmaster (master) and execd (master + nodes) daemons
        """
        log.info("Restarting SGE qmaster and execd daemons")

        rootpath = self.getRootPath(node)
        log.debug("CreateCell.restartSge    rootpath: %s", rootpath)

        envars = self.exportEnvironmentVars()
        stop_execd      = envars + rootpath + '/qconf -ke all'
        stop_qmaster    = envars + rootpath + '/qconf -km'
        start_qmaster   = envars + rootpath + '/sge_qmaster'
        start_execd     = envars + rootpath + '/sge_execd'
        
        sleep = 1
        log.debug("sge.CreateCell.restartSge    Doing RESTART SGE: %s (%s)", node.alias, node.private_ip_address)

        #### KILL ANY LINGERING TERMINATED PROCESSES    
        killall = "/bin/ps aux | grep sgeadmin | cut -c9-14 | xargs -n1 -iPID /bin/kill -9 PID &> /dev/null"
        log.debug(killall)
        node.ssh.execute(killall, True, False, True)
        killall = "/bin/ps aux | grep root | grep sge | cut -c9-14 | xargs -n1 -iPID /bin/kill -9 PID &> /dev/null"
        log.debug(killall)
        node.ssh.execute(killall, True, False, True)
    
        log.debug("sge.CreateCell.restartSge    node.alias: %s", node.alias)
        if node.alias == "master":            
            time.sleep(float(sleep))
            log.debug("sge.CreateCell.restartSge    %s", start_qmaster)
            node.ssh.execute(start_qmaster)

        log.debug("sge.CreateCell.restartSge    %s", start_execd)
        node.ssh.execute(start_execd)
Example #22
    def __init__(self, privatekey, publiccert, cell, execdport, qmasterport, root, slots):
        log.info("Loaded plugin: sge.CreateCell")

        log.debug("sge.CreateCell.__init__    Initialising CreateCell plugin.")
        log.debug("sge.CreateCell.__init__    privatekey %s" % privatekey)
        log.debug("sge.CreateCell.__init__    publiccert %s" % publiccert)
        log.debug("sge.CreateCell.__init__    cell %s" % cell)
        log.debug("sge.CreateCell.__init__    execdport %s" % execdport)
        log.debug("sge.CreateCell.__init__    qmasterport %s" % qmasterport)
        log.debug("sge.CreateCell.__init__    root %s" % root)
        log.debug("sge.CreateCell.__init__    slots %s" % slots)
            
        self.headgroup = "default"
        self.privatekey = privatekey
        self.publiccert = publiccert
        self.cell = cell
        self.execdport = execdport
        self.qmasterport = qmasterport
        self.root = root
        self.slots = slots

        #""" SET HEAD NODE'S ROOT PATH TO SGE BINARIES """
        #rootpath = os.environ['ROOTPATH'];
        #rootpath = re.sub(r'^.', '', rootpath)
        #log.info("rootpath: %s", rootpath)
        #self.rootpath = rootpath
        
        os.environ['SGE_ROOT'] = root
        os.environ['SGE_CELL'] = cell
        os.environ['SGE_QMASTER_PORT'] = qmasterport
        os.environ['SGE_EXECD_PORT'] = execdport
Example #23
    def openSgePorts(self):
        """
            Open the particular SGE qmaster and execd daemon ports for this cluster
        """
        log.info("Opening SGE qmaster and execd ports")
        qmasterport = self.qmasterport
        execdport = self.execdport
        cluster = self.cell

        envars = self.exportEnvironmentVars()
        
        log.debug("sge.CreateCell.openSgePorts    qmasterport; %s", qmasterport)
        log.debug("sge.CreateCell.openSgePorts    execdport; %s", execdport)
        log.debug("sge.CreateCell.openSgePorts    envars; %s", envars)

        #### SET EC2 KEY FILE ENVIRONMENT VARIABLES
        ec2vars = "export EC2_PRIVATE_KEY=" + self.privatekey + "; "
        ec2vars += "export EC2_CERT=" + self.publiccert + "; "
        
        # HEAD NODE (I.E., NOT MASTER OR NODE)
        commands = [
            ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + execdport + ' -P tcp',
            ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + execdport + ' -P udp',
            ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + qmasterport + ' -P tcp',
            ec2vars + 'ec2-authorize @sc-' + cluster + ' -p ' + qmasterport + ' -P udp',
            ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + execdport + ' -P tcp',
            ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + execdport + ' -P udp',
            ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + qmasterport + ' -P tcp',
            ec2vars + 'ec2-authorize ' + self.headgroup + ' -p ' + qmasterport + ' -P udp'
        ]
        
        for command in commands:
            self.runSystemCommand(command)
Example #24
 def execute(self, args):
     if len(args) != 3:
         self.parser.error('you must specify an instance-id, image name, and bucket')
     instanceid, image_name, bucket = args
     self.bucket = bucket
     self.image_name = image_name
     cfg = self.cfg
     ec2 = cfg.get_easy_ec2()
     i = ec2.get_instance(instanceid)
     if not self.opts.confirm:
         for group in i.groups:
             if group.id.startswith(static.SECURITY_GROUP_PREFIX):
                 log.warn("Instance %s is a StarCluster instance" % i.id)
                 print
                 log.warn("Creating an image from a StarCluster instance " + \
                 "can lead to problems when attempting to use the resulting " + \
                 "image with StarCluster later on")
                 print
                 log.warn(
                 "The recommended way to re-image a StarCluster AMI is " + \
                 "to launch a single instance using either ElasticFox, the " +\
                 "EC2 command line tools, or the AWS management console. " +\
                 "Then login to the instance, modify it, and use this " + \
                 "command to create a new AMI from it.")
                 print
                 resp = raw_input("Continue anyway (y/n)? ")
                 if resp not in ['y','Y','yes']:
                     log.info("Aborting...")
                     sys.exit(1)
                 break
     self.catch_ctrl_c()
     ami_id = image.create_image(instanceid, image_name, bucket, cfg,
                        **self.specified_options_dict)
     log.info("Your new AMI id is: %s" % ami_id)
Example #25
    def update_dns(self, host_name, ip_address):
        ttl = 10
        host_name = ".".join([host_name, self.domain])
        conn = boto.connect_route53()

        response = conn.get_all_rrsets(self.hosted_zone_id, 'A', host_name, maxitems=1)
        if len(response):
            response = response[0]
            comment = "Starcluster route53 plugin deleted record for %s"%(host_name)
            changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
            change1 = changes.add_change("DELETE", host_name, 'A', response.ttl)
            for old_value in response.resource_records:
                change1.add_value(old_value)
            try:
                changes.commit()
                log.info(comment)
            except Exception as e:
                log.warning(e)

        comment = "Starcluster route53 plugin updated record for %s to %s"%(host_name, ip_address)
        changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
        change2 = changes.add_change("CREATE", host_name, 'A', ttl)
        change2.add_value(ip_address)
        try:
            changes.commit()
            log.info(comment)
        except Exception as e:
            log.warning(e)
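A brief usage sketch, assuming the plugin's `__init__` has already set `self.domain` and `self.hosted_zone_id` (the host name and address below are illustrative):

# Hypothetical call: point master.<domain> at the master node's public IP
self.update_dns("master", master.ip_address)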
Example #26
 def run(self, nodes, master, user, user_shell, volumes):
     self._check_ipython_installed(master)
     user_home = master.getpwnam(user).pw_dir
     profile_dir = posixpath.join(user_home, '.ipython', 'profile_default')
     master.ssh.switch_user(user)
     self._write_config(master, user, profile_dir)
     # Start the cluster and some engines on the master (leave 1
     # processor free to handle cluster house keeping)
     cfile, n_engines_master = self._start_cluster(master, profile_dir)
     # Start engines on each of the non-master nodes
     non_master_nodes = [node for node in nodes if not node.is_master()]
     for node in non_master_nodes:
         self.pool.simple_job(
             _start_engines, (node, user, node.num_processors),
             jobid=node.alias)
     n_engines_non_master = sum(node.num_processors
                                for node in non_master_nodes)
     if len(non_master_nodes) > 0:
         log.info("Adding %d engines on %d nodes",
                  n_engines_non_master, len(non_master_nodes))
         self.pool.wait(len(non_master_nodes))
     if self.enable_notebook:
         self._start_notebook(master, user, profile_dir)
     n_engines_total = n_engines_master + n_engines_non_master
     log.info(STARTED_MSG % dict(cluster=master.parent_cluster,
                                 user=user, connector_file=cfile,
                                 key_location=master.key_location,
                                 n_engines=n_engines_total,
                                 n_nodes=len(nodes)))
     master.ssh.switch_user('root')
Example #27
 def execute(self, args):
     if len(args) != 1:
         self.parser.error("please specify a <tag_name> for this cluster")
     cfg = self.cfg
     use_experimental = cfg.globals.get('enable_experimental')
     if self.opts.spot_bid is not None and not use_experimental:
         raise exception.ExperimentalFeature('Using spot instances')
     tag = self.tag = args[0]
     template = self.opts.cluster_template
     if not template:
         template = cfg.get_default_cluster_template(tag)
         log.info("Using default cluster template: %s" % template)
     cluster_exists = cluster.cluster_exists(tag, cfg)
     create = not self.opts.no_create
     if not cluster_exists and not create:
         raise exception.ClusterDoesNotExist(tag)
     scluster = cfg.get_cluster_template(template, tag)
     scluster.update(self.specified_options_dict)
     validate_running = self.opts.no_create
     validate_only = self.opts.validate_only
     try:
         scluster._validate(validate_running=validate_running)
         if validate_only:
             return
      except exception.ClusterValidationError, e:
         log.error('settings for cluster template "%s" are not valid:' % template)
         raise
Example #28
    def setMasterEtcHosts(self, master):
        log.info("Adding master hostname to own /etc/hosts")
        
        envars = self.exportEnvironmentVars()
        command = "cat /etc/hosts"  
        log.debug("sge.CreateCell.setMasterEtcHosts     command: %s" % command)
        etchosts = etchosts_template
        
        ip_address  = master.ip_address
        dns_name    = master.dns_name

        insert = master.private_ip_address
        insert += "\t"
        insert += self.getHostname(master)
        insert += "\t"
        insert += "localhost"
        etchosts += insert + "\n"

        log.debug("sge.CreateCell.setMasterEtcHosts    AFTER etchosts: %s", etchosts)

        etchosts_file = master.ssh.remote_file("/etc/hosts")
        print >> etchosts_file, etchosts
        etchosts_file.close()
        
        # DEPRECATED:
        #command = "/etc/init.d/networking restart"
        command = "sh -c \"ifdown eth0 && ifup eth0\""
        log.debug("sge.CreateCell.setMasterEtcHosts    command: %s", command)
        result = master.ssh.execute(command)
        log.debug("sge.CreateCell.setMasterEtcHosts    result: %s", result)
Example #29
 def run(self, nodes, master, user, user_shell, volumes):
     # set up some paths
     repo_dir = get_repo_dir(user)
     setup_script = get_setup_script(user)
     for node in nodes:
         # NOTE: nodes includes master
         log.info("Installing %s as root on %s" % (project_name, node.alias))
         #
         cmd_strs = [
             # FIXME: do this somewhere else
             'pip install pyparsing==2.0.1',
             'pip install patsy',
             'pip install statsmodels',
             'rm -rf %s' % repo_dir,
             'git clone %s %s' % (repo_url, repo_dir),
             'python %s develop' % setup_script,
             # 'python %s build_ext --inplace' % setup_script,
             'chown -R %s %s' % (user, repo_dir),
         ]
          for cmd_str in cmd_strs:
              node.ssh.execute(cmd_str + ' >out 2>err')
     for node in nodes:
         log.info("Setting up %s as %s on %s" % (project_name, user, node.alias))
         #
         cmd_strs = [
             'mkdir -p ~/.matplotlib',
             'echo backend: Agg > ~/.matplotlib/matplotlibrc',
         ]
          for cmd_str in cmd_strs:
              node.shell(user=user, command=cmd_str)
Example #30
    def enableSchedulingInfo(self):
        """
            Enable job scheduling info output for 'qstat -j'
        """
        log.info("Enabling job scheduling info")

        envars = self.exportEnvironmentVars()
        log.debug(envars + self.rootpath + "/qconf -ssconf")
        queue_template = subprocess.Popen(envars + self.rootpath + "/qconf -ssconf", stdout=subprocess.PIPE, shell=True).stdout.read()
        log.debug("sge.CreateCell.enableSchedulingInfo    BEFORE queue_template: %s", queue_template)

        match = "schedd_job_info                   false"
        insert = "schedd_job_info                   true"
        queue_template = string.replace(queue_template, match, insert)
        log.debug("sge.CreateCell.enableSchedulingInfo    AFTER queue_template: %s", queue_template)

        pid = os.getpid()
        filename = "/tmp/queue-" + str(pid) + ".txt"
        queue_file = open(filename, 'w')
        print >> queue_file, queue_template
        queue_file.close()
        
        cmd = envars + self.rootpath + "/qconf -Msconf " + filename
        log.debug(cmd)
        os.system(cmd)
        remove = "rm -fr " + filename
        log.debug(remove)
        os.system(remove)
Example #31
 def run(self, cluster):
     """
     This function will loop indefinitely, using SGELoadBalancer.get_stats()
     to get the clusters status. It looks at the job queue and tries to
     decide whether to add or remove a node.  It should later look at job
     durations (currently doesn't)
     """
     self._cluster = cluster
     if self.max_nodes is None:
         self.max_nodes = cluster.cluster_size
     use_default_stats_file = self.dump_stats and not self.stats_file
     use_default_plots_dir = self.plot_stats and not self.plot_output_dir
     if use_default_stats_file or use_default_plots_dir:
         self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
     if not self.stats_file:
         self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
     if not self.plot_output_dir:
         self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
     if not cluster.is_cluster_up():
         raise exception.ClusterNotRunning(cluster.cluster_tag)
     if self.dump_stats:
         if os.path.isdir(self.stats_file):
             raise exception.BaseException("stats file destination '%s' is"
                                           " a directory" % self.stats_file)
         sfdir = os.path.dirname(os.path.abspath(self.stats_file))
         self._validate_dir(sfdir, msg_prefix="stats file destination")
     if self.plot_stats:
         if os.path.isfile(self.plot_output_dir):
             raise exception.BaseException("plot output destination '%s' "
                                           "is a file" %
                                           self.plot_output_dir)
         self._validate_dir(self.plot_output_dir,
                            msg_prefix="plot output destination")
     raw = dict(__raw__=True)
     log.info("Starting load balancer (Use ctrl-c to exit)")
     log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
     log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
     log.info("Cluster growth rate: %d nodes/iteration\n" %
              self.add_nodes_per_iteration,
              extra=raw)
     if self.dump_stats:
         log.info("Writing stats to file: %s" % self.stats_file)
     if self.plot_stats:
         log.info("Plotting stats to directory: %s" % self.plot_output_dir)
     while (self._keep_polling):
         if not cluster.is_cluster_up():
             log.info("Waiting for all nodes to come up...")
             time.sleep(self.polling_interval)
             continue
         self.get_stats()
         log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
         log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                  extra=raw)
         oldest_queued_job_age = self.stat.oldest_queued_job_age()
         if oldest_queued_job_age:
             log.info("Oldest queued job: %s" % oldest_queued_job_age,
                      extra=raw)
         log.info("Avg job duration: %d secs" %
                  self.stat.avg_job_duration(),
                  extra=raw)
         log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                  extra=raw)
         log.info("Last cluster modification time: %s" %
                  self.__last_cluster_mod_time.strftime("%Y-%m-%d %X"),
                  extra=dict(__raw__=True))
         #evaluate if nodes need to be added
         self._eval_add_node()
         #evaluate if nodes need to be removed
         self._eval_remove_node()
         if self.dump_stats or self.plot_stats:
             self.stat.write_stats_to_csv(self.stats_file)
         #call the visualizer
         if self.plot_stats:
             try:
                 self.visualizer.graph_all()
             except IOError, e:
                 raise exception.BaseException(str(e))
         #evaluate if cluster should be terminated
         if self.kill_cluster:
             if self._eval_terminate_cluster():
                 log.info("Terminating cluster and exiting...")
                 return self._cluster.terminate_cluster()
         log.info("Sleeping...(looping again in %d secs)\n" %
                  self.polling_interval)
         time.sleep(self.polling_interval)
Example #32
 def _validate_required_progs(self, progs):
     log.info("Checking for required remote commands...")
     self._instance.ssh.check_required(progs)
Example #33
 def _attach_volume(self, vol, instance_id, device):
     log.info("Attaching volume %s to instance %s..." %
              (vol.id, instance_id))
     vol.attach(instance_id, device)
     self.ec2.wait_for_volume(vol, state='attached')
     return self._volume
Example #34
 def _create_snapshot(self, volume):
     snap = self.ec2.create_snapshot(volume, wait_for_snapshot=True)
     log.info("New snapshot id: %s" % snap.id)
     self._snapshot = snap
     return snap
Example #35
 def _create_volume(self, size, zone, snapshot_id=None):
     vol = self.ec2.create_volume(size, zone, snapshot_id)
     self._volume = vol
     log.info("New volume id: %s" % vol.id)
     self.ec2.wait_for_volume(vol, status='available')
     return vol
Example #36
    def generate_key_for_user(self,
                              username,
                              ignore_existing=False,
                              auth_new_key=False,
                              auth_conn_key=False):
        """
        Generates an id_rsa/id_rsa.pub keypair combo for a user on the remote
        machine.

        ignore_existing - if False, any existing key combos will be used rather
        than generating a new RSA key

        auth_new_key - if True, add the newly generated public key to the
        remote user's authorized_keys file

        auth_conn_key - if True, add the public key used to establish this ssh
        connection to the remote user's authorized_keys
        """
        user = self.getpwnam(username)
        home_folder = user.pw_dir
        ssh_folder = posixpath.join(home_folder, '.ssh')
        if not self.ssh.isdir(ssh_folder):
            self.ssh.mkdir(ssh_folder)
        private_key = posixpath.join(ssh_folder, 'id_rsa')
        public_key = private_key + '.pub'
        authorized_keys = posixpath.join(ssh_folder, 'authorized_keys')
        key_exists = self.ssh.isfile(private_key)
        if key_exists and not ignore_existing:
            log.info("Using existing key: %s" % private_key)
            key = self.ssh.load_remote_rsa_key(private_key)
        else:
            key = self.ssh.generate_rsa_key()
        pubkey_contents = self.ssh.get_public_key(key)
        if not key_exists or ignore_existing:
            # copy public key to remote machine
            pub_key = self.ssh.remote_file(public_key, 'w')
            pub_key.write(pubkey_contents)
            pub_key.chown(user.pw_uid, user.pw_gid)
            pub_key.chmod(0400)
            pub_key.close()
            # copy private key to remote machine
            priv_key = self.ssh.remote_file(private_key, 'w')
            key.write_private_key(priv_key)
            priv_key.chown(user.pw_uid, user.pw_gid)
            priv_key.chmod(0400)
            priv_key.close()
        if not auth_new_key or not auth_conn_key:
            return key
        auth_keys_contents = ''
        if self.ssh.isfile(authorized_keys):
            auth_keys = self.ssh.remote_file(authorized_keys, 'r')
            auth_keys_contents = auth_keys.read()
            auth_keys.close()
        auth_keys = self.ssh.remote_file(authorized_keys, 'a')
        if auth_new_key:
            # add newly generated public key to user's authorized_keys
            if pubkey_contents not in auth_keys_contents:
                log.debug("adding auth_key_contents")
                auth_keys.write('%s\n' % pubkey_contents)
        if auth_conn_key and self.ssh._pkey:
            # add public key used to create the connection to user's
            # authorized_keys
            conn_key = self.ssh._pkey
            conn_pubkey_contents = self.ssh.get_public_key(conn_key)
            if conn_pubkey_contents not in auth_keys_contents:
                log.debug("adding conn_pubkey_contents")
                auth_keys.write('%s\n' % conn_pubkey_contents)
        auth_keys.chown(user.pw_uid, user.pw_gid)
        auth_keys.chmod(0600)
        auth_keys.close()
        return key
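A short usage sketch based on the docstring above, assuming the method is called on a remote-node object (the username is illustrative):

# Create (or reuse) an RSA key for the cluster user and authorize both the
# newly generated key and the key used for this SSH connection
key = node.generate_key_for_user('sgeadmin', auth_new_key=True,
                                 auth_conn_key=True)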
Example #37
    def clean_cluster(self, nodes, master, user, user_shell, volumes):
        """
        Run qhost to find nodes that are present in OGS but not in the cluster
        in order to remove them.
        """
        self._master = master
        self._nodes = nodes

        qhost_xml = master.ssh.execute("qhost -xml", source_profile=True)
        qhost_et = ET.fromstringlist(qhost_xml)
        qhosts = []
        for host in qhost_et:
            h_name = host.attrib['name']
            if h_name != 'global':
                qhosts.append(h_name)

        if len(qhosts) == 0:
            log.info("Nothing to clean")

        alive_nodes = [node.alias for node in nodes]

        cleaned = []
        # find dead hosts
        for node_alias in qhosts:
            if node_alias not in alive_nodes:
                cleaned.append(node_alias)

        # find jobs running in dead hosts
        qstats_xml = self._master.ssh.execute("qstat -u \"*\" -xml",
                                              source_profile=True)
        qstats_xml = qstats_xml[1:]  # remove first line
        qstats_et = ET.fromstringlist(qstats_xml)
        to_delete = []
        to_repair = []
        cleaned_queue = []  # not a lambda function to allow pickling
        for c in cleaned:
            cleaned_queue.append("all.q@" + c)
        for job_list in qstats_et.find("queue_info").findall("job_list"):
            if job_list.find("queue_name").text in cleaned_queue:
                job_number = job_list.find("JB_job_number").text
                to_delete.append(job_number)
        for job_list in qstats_et.find("job_info").findall("job_list"):
            if job_list.find("state").text == "Eqw":
                job_number = job_list.find("JB_job_number").text
                to_repair.append(job_number)
        # delete the jobs
        if to_delete:
            log.info("Stopping jobs: " + str(to_delete))
            self._master.ssh.execute("qdel -f " + " ".join(to_delete))
            time.sleep(3)  # otherwise might provoke LOST QRSH if on last job
        if to_repair:
            log.error("Reseting jobs: " + str(to_repair))
            self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                     ignore_exit_status=True)

        # stuck qrsh issue
        ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
        qstat_wc = int(self._master.ssh.execute("qstat -u \"*\" | wc -l")[0])
        if qstat_wc == 0 and ps_wc > 2:
            log.error("LOST QRSH??")
            log.error("pkill -9 qrsh")
            self._master.ssh.execute("pkill -9 qrsh", ignore_exit_status=True)
        # ----------------------------------

        # delete the host config
        for c in cleaned:
            log.info("Cleaning node " + c)
            if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
                log.warn(c + " is missing from /etc/hosts, creating a dummy "
                         "entry 1.1.1.1")
                rfile = master.ssh.remote_file("/etc/hosts", 'a')
                rfile.write("1.1.1.1 " + c + "\n")
                rfile.close()

            try:
                self._remove_from_sge(DeadNode(c), only_clean_master=True)
            except RemoteCommandFailed:
                log.warning("Failed to remove node {} from sge."
                            .format(c), exc_info=True)

        # fix to allow pickling
        self._master = None
        self._nodes = None
Example #38
 def run(self, nodes, master, user, shell, volumes):
     secretword = self._generate_secretword()
     aliases = map(lambda x: x.alias, nodes)
     for node in nodes:
         log.info("Installing mpich2 on node %s" % node.alias)
         node.ssh.execute("apt-get -y install mpich2")
         log.info("Configuring %s on node %s" %
                  (self.MPD_HOSTS, node.alias))
         mpd_hosts = node.ssh.remote_file(self.MPD_HOSTS, 'w')
         mpd_hosts.write('\n'.join(aliases))
         mpd_hosts.close()
         log.info("Configuring %s on node %s for root" %
                  (self.MPD_CONF, node.alias))
         mpd_conf = node.ssh.remote_file(self.MPD_CONF, 'w')
         mpd_conf.write("secretword=%s\n" % secretword)
         mpd_conf.chmod(0600)
         mpd_conf.close()
     user_home = master.getpwnam(user).pw_dir
     user_mpd_conf = posixpath.join(user_home, '.mpd.conf')
     log.info("Configuring %s for user %s" % (user_mpd_conf, user))
     secretword = self._generate_secretword()
      umpdconf = master.ssh.remote_file(user_mpd_conf)
     umpdconf.write("secretword=%s\n" % secretword)
     umpdconf.chmod(0600)
     umpdconf.close()
     log.info("Launching mpdboot for root")
     master.ssh.execute('mpdboot -f %s -n %d' %
                        (self.MPD_HOSTS, len(nodes)))
     log.info("Launching mpdboot for user %s" % user)
     master.ssh.execute("su -l -c 'mpdboot -f %s -n %d' %s" % \
                        (self.MPD_HOSTS, len(nodes), user))
Example #39
 def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
     log.info("Removing %s from ipcluster" % node.alias)
     less_nodes = filter(lambda x: x.id != node.id, nodes)
     self._create_cluster_file(master, less_nodes)
     node.ssh.execute('pkill ipengine')
Example #40
    def run(self, nodes, master, user, shell, volumes):
        aliases = [n.alias for n in nodes]

        log.info("Installing SBT")
        for node in nodes:
            self.pool.simple_job(self._install_sbt, (node), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing S3 and Boto")
        for node in nodes:
            self.pool.simple_job(self._install_s3_and_boto, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing Cython, IPython Notebook, py4j and Matplotlib")
        for node in nodes:
            self.pool.simple_job(self._install_ipython_notebook, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("installing GEOS")
        for node in nodes:
            self.pool.simple_job(self._install_geos, (node), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing Basemap")
        for node in nodes:
            self.pool.simple_job(self._install_basemap, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing PyProj")
        for node in nodes:
            self.pool.simple_job(self._install_pyproj, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing PyGrib")
        for node in nodes:
            self.pool.simple_job(self._install_pygrib, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing Pydoop")
        for node in nodes:
            self.pool.simple_job(self._install_pydoop, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing HDF5 and Libcurl")
        for node in nodes:
            self.pool.simple_job(self._install_hdf5, (node), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing NetCDF-4-C")
        for node in nodes:
            self.pool.simple_job(self._install_netcdf, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Installing PyNetCDF4")
        for node in nodes:
            self.pool.simple_job(self._install_pynetcdf4, (node),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        log.info("Configuring IPython PySpark profile and startup script")
        self._configure_ipython(master)

        log.info("Opening port for IPython Notebook")
        self._open_ports(master)

        log.info("Don't forget to configure s3cmd with s3cmd --configure!")
Example #41
 def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
     self._master = master
     log.info("No need to remove %s from EFS" % node.alias)
Example #42
 def _generate_secretword(self):
     log.info("Generating MPICH secretword")
     secretword = map(lambda x: x, string.ascii_lowercase + string.digits)
     random.shuffle(secretword)
     return ''.join(secretword)
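Note that shuffling the fixed alphabet produces a 36-character secretword containing each of a-z and 0-9 exactly once (a permutation, not sampling with replacement); an illustrative check:

# Illustrative only: verify the shape of the generated secretword
word = self._generate_secretword()
assert len(word) == 36
assert sorted(word) == sorted(string.ascii_lowercase + string.digits)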
Example #43
class CmdStart(ClusterCompleter):
    """
    start [options] <cluster_tag>

    Start a new cluster

    Example:

        $ starcluster start mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the default cluster template defined in the configuration file. The
    default cluster template is specified by the 'default_template' option in
    the [global] section of the config. To use another template besides the
    default use the -c (--cluster-template) option:

        $ starcluster start -c largecluster mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the "largecluster" cluster template instead of the default template.
    """
    names = ['start']

    def addopts(self, parser):
        templates = []
        if self.cfg:
            templates = self.cfg.clusters.keys()
        parser.add_option("-x",
                          "--no-create",
                          dest="no_create",
                          action="store_true",
                          default=False,
                          help="do not launch new EC2 instances when "
                          "starting cluster (use existing instances instead)")
        parser.add_option("-o",
                          "--create-only",
                          dest="create_only",
                          action="store_true",
                          default=False,
                          help="only launch/start EC2 instances, "
                          "do not perform any setup routines")
        parser.add_option("-v",
                          "--validate-only",
                          dest="validate_only",
                          action="store_true",
                          default=False,
                          help="only validate cluster settings, do "
                          "not start a cluster")
        parser.add_option("-V",
                          "--skip-validation",
                          dest="validate",
                          action="store_false",
                          default=True,
                          help="do not validate cluster settings")
        parser.add_option("-l",
                          "--login-master",
                          dest="login_master",
                          action="store_true",
                          default=False,
                          help="login to master node after launch")
        parser.add_option("-q",
                          "--disable-queue",
                          dest="disable_queue",
                          action="store_true",
                          default=None,
                          help="do not configure a queueing system (SGE)")
        parser.add_option("-Q",
                          "--enable-queue",
                          dest="disable_queue",
                          action="store_false",
                          default=None,
                          help="configure a queueing system (SGE) (default)")
        parser.add_option("--force-spot-master",
                          dest="force_spot_master",
                          action="store_true",
                          default=None,
                          help="when creating a spot cluster "
                          "the default is to launch the master as "
                          "a flat-rate instance for stability. this option "
                          "forces launching the master node as a spot "
                          "instance when a spot cluster is requested.")
        parser.add_option("--no-spot-master",
                          dest="force_spot_master",
                          action="store_false",
                          default=None,
                          help="Do not launch the master node as a spot "
                          "instance when a spot cluster is requested. "
                          "(default)")
        parser.add_option("--public-ips",
                          dest="public_ips",
                          default=None,
                          action='store_true',
                          help="Assign public IPs to all VPC nodes "
                          "(VPC clusters only)"),
        parser.add_option("--no-public-ips",
                          dest="public_ips",
                          default=None,
                          action='store_false',
                          help="Do NOT assign public ips to all VPC nodes "
                          "(VPC clusters only) (default)"),
        opt = parser.add_option("-c",
                                "--cluster-template",
                                action="store",
                                dest="cluster_template",
                                choices=templates,
                                default=None,
                                help="cluster template to use "
                                "from the config file")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-r",
                          "--refresh-interval",
                          dest="refresh_interval",
                          type="int",
                          action="callback",
                          default=None,
                          callback=self._positive_int,
                          help="refresh interval when waiting for cluster "
                          "nodes to come up (default: 30)")
        parser.add_option("-b",
                          "--bid",
                          dest="spot_bid",
                          action="store",
                          type="float",
                          default=None,
                          help="requests spot instances instead of flat "
                          "rate instances. Uses SPOT_BID as max bid for "
                          "the request.")
        parser.add_option("-d",
                          "--description",
                          dest="cluster_description",
                          action="store",
                          type="string",
                          default="Cluster requested at %s" %
                          time.strftime("%Y%m%d%H%M"),
                          help="brief description of cluster")
        parser.add_option("-s",
                          "--cluster-size",
                          dest="cluster_size",
                          action="callback",
                          type="int",
                          default=None,
                          callback=self._positive_int,
                          help="number of ec2 instances to launch")
        parser.add_option("-u",
                          "--cluster-user",
                          dest="cluster_user",
                          action="store",
                          type="string",
                          default=None,
                          help="name of user to create on cluster "
                          "(defaults to sgeadmin)")
        opt = parser.add_option("-S",
                                "--cluster-shell",
                                dest="cluster_shell",
                                action="store",
                                choices=static.AVAILABLE_SHELLS.keys(),
                                default=None,
                                help="shell for cluster user "
                                "(defaults to bash)")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-m",
                          "--master-image-id",
                          dest="master_image_id",
                          action="store",
                          type="string",
                          default=None,
                          help="AMI to use when launching master")
        parser.add_option("-n",
                          "--node-image-id",
                          dest="node_image_id",
                          action="store",
                          type="string",
                          default=None,
                          help="AMI to use when launching nodes")
        parser.add_option("-I",
                          "--master-instance-type",
                          dest="master_instance_type",
                          action="store",
                          choices=sorted(static.INSTANCE_TYPES.keys()),
                          default=None,
                          help="instance type for the master "
                          "instance")
        opt = parser.add_option("-i",
                                "--node-instance-type",
                                dest="node_instance_type",
                                action="store",
                                choices=sorted(static.INSTANCE_TYPES.keys()),
                                default=None,
                                help="instance type for the node instances")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-a",
                          "--availability-zone",
                          dest="availability_zone",
                          action="store",
                          type="string",
                          default=None,
                          help="availability zone to launch instances in")
        parser.add_option("-k",
                          "--keyname",
                          dest="keyname",
                          action="store",
                          type="string",
                          default=None,
                          help="name of the keypair to use when "
                          "launching the cluster")
        parser.add_option("-K",
                          "--key-location",
                          dest="key_location",
                          action="store",
                          type="string",
                          default=None,
                          metavar="FILE",
                          help="path to an ssh private key that matches the "
                          "cluster keypair")
        parser.add_option("-U",
                          "--userdata-script",
                          dest="userdata_scripts",
                          action="append",
                          default=None,
                          metavar="FILE",
                          help="Path to userdata script that will run on "
                          "each node on start-up. Can be used multiple times.")
        parser.add_option("-P",
                          "--dns-prefix",
                          dest="dns_prefix",
                          action='store_true',
                          help="Prefix dns names of all nodes in the cluster "
                          "with the cluster tag")
        parser.add_option("-p",
                          "--no-dns-prefix",
                          dest="dns_prefix",
                          action='store_false',
                          help="Do NOT prefix dns names of all nodes in the "
                          "cluster with the cluster tag (default)")
        # This option is disabled because we need to use nargs='+' which is
        # supported by argparse but not optparse. Use cluster template
        # configuration key SUBNET_IDS instead.
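        # (In the cluster template section of the config this might look
        #  something like: SUBNET_IDS = subnet-xxxxxxxx -- illustrative
        #  value only)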
        # parser.add_option("-N", "--subnet-id", dest="subnet_id",
        #                   action="store", type="string",
        #                   help=("Launch cluster into a VPC subnet"))
        parser.add_option("--config-on-master",
                          default=False,
                          action='store_true',
                          help="Store the config on the "
                          "master node rather than into the security group "
                          "tags")
        parser.add_option("--dns-sufix",
                          action='store_true',
                          help="Sufix dns names of all nodes in the cluster "
                          "with the cluster tag.")

    def execute(self, args):
        if len(args) != 1:
            self.parser.error("please specify a <cluster_tag>")
        tag = args[0]
        if tag.find("master") > -1:
            # Because of Node.is_master
            raise exception.ClusterValidationError("Cluster name cannot "
                                                   "contain master")

        create = not self.opts.no_create
        scluster = self.cm.get_cluster_group_or_none(tag)
        if scluster and create:
            scluster = self.cm.get_cluster(tag,
                                           group=scluster,
                                           load_receipt=False,
                                           require_keys=False)
            stopped_ebs = scluster.is_cluster_stopped()
            is_ebs = False
            if not stopped_ebs:
                is_ebs = scluster.is_ebs_cluster()
            raise exception.ClusterExists(tag,
                                          is_ebs=is_ebs,
                                          stopped_ebs=stopped_ebs)
        if not create and not scluster:
            raise exception.ClusterDoesNotExist(tag)
        create_only = self.opts.create_only
        validate = self.opts.validate
        validate_running = self.opts.no_create
        validate_only = self.opts.validate_only
        config_on_master = self.opts.config_on_master

        if scluster:
            if config_on_master:
                scluster = self.cm.get_cluster(tag,
                                               group=scluster,
                                               load_receipt=False)
                validate_running = False
            else:
                scluster = self.cm.get_cluster(tag, group=scluster)
                validate_running = True
        else:
            template = self.opts.cluster_template
            if not template:
                try:
                    template = self.cm.get_default_cluster_template()
                except exception.NoDefaultTemplateFound, e:
                    try:
                        ctmpl = e.options[0]
                    except IndexError:
                        ctmpl = "smallcluster"
                    e.msg += " \n\nAlternatively, you can specify a cluster "
                    e.msg += "template to use by passing the '-c' option to "
                    e.msg += "the 'start' command, e.g.:\n\n"
                    e.msg += "    $ starcluster start -c %s %s" % (ctmpl, tag)
                    raise e
                log.info("Using default cluster template: %s" % template)
            scluster = self.cm.get_cluster_template(template, tag)
        scluster.update(self.specified_options_dict)
        if self.opts.keyname and not self.opts.key_location:
            key = self.cfg.get_key(self.opts.keyname)
            scluster.key_location = key.key_location
        if not self.opts.refresh_interval:
            interval = self.cfg.globals.get("refresh_interval")
            if interval is not None:
                scluster.refresh_interval = interval
        if self.opts.spot_bid is not None and not self.opts.no_create:
            msg = user_msgs.spotmsg % {
                'size': scluster.cluster_size,
                'tag': tag
            }
            if not validate_only and not create_only:
                self.warn_experimental(msg, num_secs=5)
        if self.opts.dns_prefix:
            if tag.find(".") > -1:
                raise exception.ClusterValidationError(
                    "Cannot use --dns-prefix when the cluster tag contains "
                    "a dot.")
            scluster.dns_prefix = tag
        if self.opts.dns_sufix:
            scluster.dns_sufix = tag
        if config_on_master:
            scluster.config_on_master = True
            if self.opts.no_create:
                validate = False
                log.warning("Cannot start a cluster when its config is "
                            "stored on the master node using StarCluster. "
                            "You should start it manually and then use "
                            "the recovery options.")
                return
        try:
            scluster.start(create=create,
                           create_only=create_only,
                           validate=validate,
                           validate_only=validate_only,
                           validate_running=validate_running,
                           save_config_on_master=self.opts.config_on_master)
        except KeyboardInterrupt:
            if validate_only:
                raise
            else:
                raise exception.CancelledStartRequest(tag)
        if validate_only:
            return
        if not create_only and not self.opts.login_master:
            log.info(user_msgs.cluster_started_msg %
                     dict(tag=scluster.cluster_tag),
                     extra=dict(__textwrap__=True, __raw__=True))
        if self.opts.login_master:
            scluster.ssh_to_master()
Example #44
0
 def run(self, nodes, master, user, user_shell, volumes):
     self._create_cluster_file(master, nodes)
     log.info("Starting ipcluster...")
     master.ssh.execute(
         "su - %s -c 'screen -d -m ipcluster ssh --clusterfile %s'" % \
         (user, self.cluster_file))
Example #45
0
class CmdStart(ClusterCompleter):
    """
    start [options] <cluster_tag>

    Start a new cluster

    Example:

        $ starcluster start mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the default cluster template defined in the configuration file. The
    default cluster template is specified by the 'default_template' option in
    the [global] section of the config. To use another template besides the
    default use the -c (--cluster-template) option:

        $ starcluster start -c largecluster mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the "largecluster" cluster template instead of the default template.
    """
    names = ['start']

    tag = None

    def addopts(self, parser):
        templates = []
        if self.cfg:
            templates = self.cfg.get_cluster_names().keys()
        parser.add_option("-x",
                          "--no-create",
                          dest="no_create",
                          action="store_true",
                          default=False,
                          help="do not launch new EC2 instances when "
                          "starting cluster (use existing instances instead)")
        parser.add_option("-o",
                          "--create-only",
                          dest="create_only",
                          action="store_true",
                          default=False,
                          help="only launch/start EC2 instances, "
                          "do not perform any setup routines")
        parser.add_option("-v",
                          "--validate-only",
                          dest="validate_only",
                          action="store_true",
                          default=False,
                          help="only validate cluster settings, do "
                          "not start a cluster")
        parser.add_option("-V",
                          "--skip-validation",
                          dest="validate",
                          action="store_false",
                          default=True,
                          help="do not validate cluster settings")
        parser.add_option("-l",
                          "--login-master",
                          dest="login_master",
                          action="store_true",
                          default=False,
                          help="login to master node after launch")
        parser.add_option("-q",
                          "--disable-queue",
                          dest="disable_queue",
                          action="store_true",
                          default=None,
                          help="do not configure a queueing system (SGE)")
        parser.add_option("--force-spot-master",
                          dest="force_spot_master",
                          action="store_true",
                          default=None,
                          help="when creating a spot cluster "
                          "the default is to launch the master as "
                          "a flat-rate instance for stability. this option "
                          "forces launching the master node as a spot "
                          "instance when a spot cluster is requested.")
        opt = parser.add_option("-c",
                                "--cluster-template",
                                action="store",
                                dest="cluster_template",
                                choices=templates,
                                default=None,
                                help="cluster template to use "
                                "from the config file")
        if optcomplete:
            opt.completer = optcomplete.ListCompleter(opt.choices)
        parser.add_option("-r",
                          "--refresh-interval",
                          dest="refresh_interval",
                          type="int",
                          action="callback",
                          default=None,
                          callback=self._positive_int,
                          help="refresh interval when waiting for cluster "
                          "nodes to come up (default: 30)")
        parser.add_option("-b",
                          "--bid",
                          dest="spot_bid",
                          action="store",
                          type="float",
                          default=None,
                          help="requests spot instances instead of flat "
                          "rate instances. Uses SPOT_BID as max bid for "
                          "the request.")
        parser.add_option("-d",
                          "--description",
                          dest="cluster_description",
                          action="store",
                          type="string",
                          default="Cluster requested at %s" %
                          time.strftime("%Y%m%d%H%M"),
                          help="brief description of cluster")
        parser.add_option("-s",
                          "--cluster-size",
                          dest="cluster_size",
                          action="callback",
                          type="int",
                          default=None,
                          callback=self._positive_int,
                          help="number of ec2 instances to launch")
        parser.add_option("-u",
                          "--cluster-user",
                          dest="cluster_user",
                          action="store",
                          type="string",
                          default=None,
                          help="name of user to create on cluster "
                          "(defaults to sgeadmin)")
        opt = parser.add_option("-S",
                                "--cluster-shell",
                                dest="cluster_shell",
                                action="store",
                                choices=static.AVAILABLE_SHELLS.keys(),
                                default=None,
                                help="shell for cluster user "
                                "(defaults to bash)")
        if optcomplete:
            opt.completer = optcomplete.ListCompleter(opt.choices)
        parser.add_option("-m",
                          "--master-image-id",
                          dest="master_image_id",
                          action="store",
                          type="string",
                          default=None,
                          help="AMI to use when launching master")
        parser.add_option("-n",
                          "--node-image-id",
                          dest="node_image_id",
                          action="store",
                          type="string",
                          default=None,
                          help="AMI to use when launching nodes")
        parser.add_option("-I",
                          "--master-instance-type",
                          dest="master_instance_type",
                          action="store",
                          choices=static.INSTANCE_TYPES.keys(),
                          default=None,
                          help="instance type for the master instance")
        opt = parser.add_option("-i",
                                "--node-instance-type",
                                dest="node_instance_type",
                                action="store",
                                choices=static.INSTANCE_TYPES.keys(),
                                default=None,
                                help="instance type for the node instances")
        if optcomplete:
            opt.completer = optcomplete.ListCompleter(opt.choices)
        parser.add_option("-a",
                          "--availability-zone",
                          dest="availability_zone",
                          action="store",
                          type="string",
                          default=None,
                          help="availability zone to launch instances in")
        parser.add_option("-k",
                          "--keyname",
                          dest="keyname",
                          action="store",
                          type="string",
                          default=None,
                          help="name of the keypair to use when "
                          "launching the cluster")
        parser.add_option("-K",
                          "--key-location",
                          dest="key_location",
                          action="store",
                          type="string",
                          default=None,
                          metavar="FILE",
                          help="path to an ssh private key that matches the "
                          "cluster keypair")

    def cancel_command(self, signum, frame):
        raise exception.CancelledStartRequest(self.tag)

    def execute(self, args):
        if len(args) != 1:
            self.parser.error("please specify a <cluster_tag>")
        tag = self.tag = args[0]
        create = not self.opts.no_create
        create_only = self.opts.create_only
        scluster = self.cm.get_cluster_or_none(tag, require_keys=False)
        validate = self.opts.validate
        validate_running = self.opts.no_create
        validate_only = self.opts.validate_only
        if scluster and create:
            stopped_ebs = scluster.is_cluster_stopped()
            is_ebs = False
            if not stopped_ebs:
                is_ebs = scluster.is_ebs_cluster()
            raise exception.ClusterExists(tag,
                                          is_ebs=is_ebs,
                                          stopped_ebs=stopped_ebs)
        if not scluster and not create:
            raise exception.ClusterDoesNotExist(tag)
        elif scluster:
            validate_running = True
        else:
            template = self.opts.cluster_template
            if not template:
                try:
                    template = self.cm.get_default_cluster_template()
                except exception.NoDefaultTemplateFound, e:
                    try:
                        ctmpl = e.options[0]
                    except IndexError:
                        ctmpl = "smallcluster"
                    e.msg += " \n\nAlternatively, you can specify a cluster "
                    e.msg += "template to use by passing the '-c' option to "
                    e.msg += "the 'start' command, e.g.:\n\n"
                    e.msg += "    $ starcluster start -c %s %s" % (ctmpl, tag)
                    raise e
                log.info("Using default cluster template: %s" % template)
            scluster = self.cm.get_cluster_template(template, tag)
        scluster.update(self.specified_options_dict)
        if self.opts.keyname and not self.opts.key_location:
            key = self.cfg.get_key(self.opts.keyname)
            scluster.key_location = key.key_location
        if not self.opts.refresh_interval:
            interval = self.cfg.globals.get("refresh_interval")
            if interval is not None:
                scluster.refresh_interval = interval
        if self.opts.spot_bid is not None and not self.opts.no_create:
            msg = user_msgs.spotmsg % {
                'size': scluster.cluster_size,
                'tag': tag
            }
            if not validate_only and not create_only:
                self.warn_experimental(msg, num_secs=5)
        self.catch_ctrl_c()
        scluster.start(create=create,
                       create_only=create_only,
                       validate=validate,
                       validate_only=validate_only,
                       validate_running=validate_running)
        if validate_only:
            return
        if not create_only and not self.opts.login_master:
            log.info(user_msgs.cluster_started_msg %
                     dict(tag=scluster.cluster_tag),
                     extra=dict(__textwrap__=True, __raw__=True))
        if self.opts.login_master:
            scluster.ssh_to_master()
Example #46
0
    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        self._master = master
        self._new_security_group = node.cluster_groups[0].id

        log.info("Adding %s to EFS" % node.alias)
        self._install_efs_on_node(node)
Example #47
0
 def on_add_node(self, node, nodes, master, user, user_shell, volumes):
     log.info("Adding %s to TMUX Control Center" % node.alias)
     self._add_to_tmuxcc(master, node, user='******')
     self._add_to_tmuxcc(master, node, user=user)
Example #48
0
class CmdShell(CmdBase):
    """
    shell

    Load an interactive IPython shell configured for starcluster development

    The following objects are automatically available at the prompt:

        cfg - starcluster.config.StarClusterConfig instance
        cm - starcluster.cluster.ClusterManager instance
        ec2 - starcluster.awsutils.EasyEC2 instance
        s3 - starcluster.awsutils.EasyS3 instance

    All StarCluster modules are automatically imported in the IPython session
    along with all StarCluster dependencies (e.g. boto, ssh, etc.)

    If the --ipcluster=CLUSTER (-p) option is passed, the IPython session will
    automatically be configured to connect to the remote CLUSTER using
    IPython's parallel interface (requires IPython 0.11+). In this mode you
    will have the following additional objects available at the prompt:

        ipcluster - starcluster.cluster.Cluster instance for the cluster
        ipclient - IPython.parallel.Client instance for the cluster
        ipview - IPython.parallel.client.view.DirectView for the cluster

    Here's an example of how to run a parallel map across all nodes in the
    cluster:

        [~]> ipclient.ids
        [0, 1, 2, 3]
        [~]> res = ipview.map_async(lambda x: x**30, range(8))
        [~]> print res.get()
        [0,
         1,
         1073741824,
         205891132094649L,
         1152921504606846976L,
         931322574615478515625L,
         221073919720733357899776L,
         22539340290692258087863249L]

    See IPython parallel docs for more details
    (http://ipython.org/ipython-doc/stable/parallel)
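
    Example (assuming a running cluster tagged 'mycluster'):

        $ starcluster shell -p mycluster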
    """

    names = ['shell', 'sh']

    def _add_to_known_hosts(self, node):
        log.info("Configuring local known_hosts file")
        user_home = os.path.expanduser('~')
        khosts = os.path.join(user_home, '.ssh', 'known_hosts')
        if not os.path.isfile(khosts):
            log.warn("Unable to configure known_hosts: file does not exist")
            return
        contents = open(khosts).read()
        if node.dns_name not in contents:
            server_pkey = node.ssh.get_server_public_key()
            khostsf = open(khosts, 'a')
            if contents and not contents.endswith('\n'):
                khostsf.write('\n')
            name_entry = '%s,%s' % (node.dns_name, node.ip_address)
            khostsf.write(' '.join([
                name_entry,
                server_pkey.get_name(),
                base64.b64encode(str(server_pkey)), '\n'
            ]))
            khostsf.close()

    def addopts(self, parser):
        parser.add_option("-p",
                          "--ipcluster",
                          dest="ipcluster",
                          action="store",
                          type="string",
                          default=None,
                          metavar="CLUSTER",
                          help="configure a parallel "
                          "IPython session on CLUSTER")

    def execute(self, args):
        local_ns = dict(cfg=self.cfg,
                        ec2=self.ec2,
                        s3=self.s3,
                        cm=self.cm,
                        starcluster=starcluster,
                        log=log)
        if self.opts.ipcluster:
            log.info("Loading parallel IPython library")
            try:
                from IPython.parallel import Client
            except ImportError, e:
                self.parser.error(
                    "Error loading parallel IPython:"
                    "\n\n%s\n\n"
                    "NOTE: IPython 0.11+ must be installed to use -p" % e)
            tag = self.opts.ipcluster
            cl = self.cm.get_cluster(tag)
            region = cl.master_node.region.name
            ipcluster_dir = os.path.join(static.STARCLUSTER_CFG_DIR,
                                         'ipcluster')
            local_json = os.path.join(ipcluster_dir,
                                      "%s-%s.json" % (tag, region))
            if not os.path.exists(local_json):
                user_home = cl.master_node.getpwnam(cl.cluster_user).pw_dir
                profile_dir = posixpath.join(user_home, '.ipython',
                                             'profile_default')
                json = posixpath.join(profile_dir, 'security',
                                      'ipcontroller-client.json')
                if cl.master_node.ssh.isfile(json):
                    log.info("Fetching connector file from cluster...")
                    if not os.path.exists(ipcluster_dir):
                        os.makedirs(ipcluster_dir)
                    cl.master_node.ssh.get(json, local_json)
                else:
                    self.parser.error(
                        "IPython json file %s does not exist locally or on "
                        "the cluster. Make sure the ipcluster plugin has "
                        "been executed and completed successfully.")
            key_location = cl.master_node.key_location
            self._add_to_known_hosts(cl.master_node)
            log.info("Loading parallel IPython client and view")
            rc = Client(local_json, sshkey=key_location)
            local_ns['Client'] = Client
            local_ns['ipcluster'] = cl
            local_ns['ipclient'] = rc
            local_ns['ipview'] = rc[:]
        modules = [(starcluster.__name__ + '.' + module, module)
                   for module in starcluster.__all__]
        modules += [('boto', 'boto'), ('paramiko', 'paramiko'),
                    ('workerpool', 'workerpool'), ('jinja2', 'jinja2'),
                    ('pyasn1', 'pyasn1'), ('iptools', 'iptools')]
        for fullname, modname in modules:
            log.info('Importing module %s' % modname)
            try:
                __import__(fullname)
                local_ns[modname] = sys.modules[fullname]
            except ImportError, e:
                log.error("Error loading module %s: %s" % (modname, e))
Example #49
0
    def _configure_hadoop(self, master, nodes, user):
        log.info("Configuring Hadoop...")
        log.info("Adding user %s to hadoop group" % user)
        for node in nodes:
            self.pool.simple_job(self._setup_hadoop_user, (node, user),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        node_aliases = map(lambda n: n.alias, nodes)
        cfg = {'master': master.alias,
               'replication': 3,
               'node_type': 'namenode',
               'hadoop_tmpdir': posixpath.join(self.hadoop_tmpdir,
                                               'hadoop-${user.name}')}
        log.info("Installing configuration templates...")
        # for node in nodes:
        #     self.pool.simple_job(self._install_empty_conf, (node,),
        #                          jobid=node.alias)
        # self.pool.wait(numtasks=len(nodes))
        log.info("Configuring environment...")
        for node in nodes:
            self.pool.simple_job(self._configure_env, (node,),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring Core Site...")
        for node in nodes:
            self.pool.simple_job(self._configure_core, (node, cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring YARN Site...")
        for node in nodes:
            self.pool.simple_job(self._configure_yarn, (node, cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring MapReduce Site...")
        for node in nodes:
            self.pool.simple_job(self._configure_mapreduce_site, (node, cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring HDFS Site...")
        for node in nodes:
            # use a per-node copy so marking a worker as a datanode does not
            # mutate the shared config used for the master's namenode job
            node_cfg = dict(cfg)
            if not node.is_master():
                node_cfg['node_type'] = 'datanode'
            self.pool.simple_job(self._configure_hdfs_site, (node, node_cfg),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring masters file...")
        for node in nodes:
            self.pool.simple_job(self._configure_masters, (node, master),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
        log.info("Configuring slaves file...")
        for node in nodes:
            self.pool.simple_job(self._configure_slaves, (node, node_aliases),
                                 jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))

        # log.info("Configuring HDFS...")
        # for node in nodes:
        #     self.pool.simple_job(self._setup_hdfs, (node, user),
        #                          jobid=node.alias)
        # self.pool.wait(numtasks=len(nodes))
        log.info("Configuring dumbo...")
        for node in nodes:
            self.pool.simple_job(self._setup_dumbo, (node,), jobid=node.alias)
        self.pool.wait(numtasks=len(nodes))
Example #50
0
 def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
     log.info("Removing %s from TMUX Control Center" % node.alias)
     self._remove_from_tmuxcc(master, node, user='******')
     self._remove_from_tmuxcc(master, node, user=user)
Example #51
0
 def _setup_hadoop_user(self, node, user):
     log.info("Skipping setup-hadoop-user...")
Example #52
0
 def run(self, nodes, master, user, user_shell, volumes):
     #self._configure_hadoop(master, nodes, user)
     #self._start_hadoop(master, nodes)
     #self._open_ports(master)
     log.info("Job tracker status: http://%s:54311" % master.dns_name)
     log.info("Namenode status: http://%s:50070" % master.dns_name)
Example #53
0
 def _format_volume(self):
     log.info("Formatting volume...")
     self._instance.ssh.execute('%s -F %s' % (self._mkfs_cmd, self._device),
                                silent=False)
Example #54
0
 def _install_empty_conf(self, node):
     log.info("Skipping install-empty-conf...")
Example #55
0
 def _setup_ebs_volumes(self):
     """
     Mount the EBS volumes specified in ~/.starcluster/config on the master
     node (e.g. to /home)
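
     A [volume] section in the config might look roughly like the following
     (illustrative values; the keys correspond to the volume_id, device,
     partition and mount_path settings read below):

         [volume mydata]
         VOLUME_ID = vol-xxxxxxxx
         DEVICE = /dev/sdz
         MOUNT_PATH = /data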
     """
     # setup /etc/fstab on master to use block device if specified
     master = self._master
     devs = master.ssh.ls('/dev')
     for vol in self._volumes:
         vol = self._volumes[vol]
         vol_id = vol.get("volume_id")
         mount_path = vol.get('mount_path')
         device = vol.get("device")
         volume_partition = vol.get('partition')
         if not (vol_id and device and mount_path):
             log.error("missing required settings for vol %s" % vol)
             continue
         dev_exists = master.ssh.path_exists(device)
         if not dev_exists and device.startswith('/dev/sd'):
             # check for "correct" device in unpatched kernels
             device = device.replace('/dev/sd', '/dev/xvd')
             dev_exists = master.ssh.path_exists(device)
         if not dev_exists:
             log.warn("Cannot find device %s for volume %s" %
                      (device, vol_id))
             log.warn("Not mounting %s on %s" % (vol_id, mount_path))
             log.warn("This usually means there was a problem "
                      "attaching the EBS volume to the master node")
             continue
         if not volume_partition:
             partitions = filter(lambda x: x.startswith(device), devs)
             if len(partitions) == 1:
                 volume_partition = device
             elif len(partitions) == 2:
                 volume_partition = device + '1'
             else:
                 log.error(
                     "volume has more than one partition, please specify "
                     "which partition to use (e.g. partition=0, "
                     "partition=1, etc.) in the volume's config")
                 continue
         elif not master.ssh.path_exists(volume_partition):
             log.warn("Cannot find partition %s on volume %s" %
                      (volume_partition, vol_id))
             log.warn("Not mounting %s on %s" % (vol_id, mount_path))
             log.warn("This either means that the volume has not "
                      "been partitioned or that the partition"
                      "specified does not exist on the volume")
             continue
         log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
         mount_map = self._master.get_mount_map()
         dev = mount_map.get(volume_partition)
         if dev:
             path, fstype, options = dev
             if path != mount_path:
                 log.error("Volume %s is mounted on %s, not on %s" %
                           (vol_id, path, mount_path))
             else:
                 log.info("Volume %s already mounted on %s...skipping" %
                          (vol_id, mount_path))
             continue
         self._master.mount_device(volume_partition, mount_path)
Example #56
0
 def on_remove_node(self, remove_node, nodes, master, user, user_shell,
                    volumes):
     log.info("Removing %s from MPICH2 hosts file" % remove_node.alias)
     master.ssh.remove_lines_from_file(self.MPICH2_HOSTS, remove_node.alias)
Example #57
0
 def on_add_node(self, node, nodes, master, user, user_shell, volumes):
     log.info("Adding %s to TMUX Control Center" % node.alias)
Example #58
0
 def _eval_add_node(self):
     """
     This function inspects the current state of the SGE queue and decides
     whether or not to add nodes to the cluster. Returns the number of nodes
     to add.
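
     Roughly: if queued jobs need more slots than are available and the
     oldest queued job has waited longer than longest_allowed_queue_time,
     this requests about qw_slots / slots_per_host new nodes. For example,
     16 queued slots on hosts providing 8 slots each would request 2 nodes,
     capped by add_nodes_per_iteration and max_nodes.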
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum([int(j['slots']) for j in running_jobs])
     qw_slots = sum([int(j['slots']) for j in queued_jobs])
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum (%d)" % self.min_nodes)
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         if age_delta.seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (age_delta.seconds, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 need_to_add = qw_slots / slots_per_host
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(utils.get_utc_now())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = utils.get_utc_now()
             log.info("Done adding nodes at %s" %
                      str(self.__last_cluster_mod_time))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
Example #59
0
 def do_shutdown(self):
     log.info("Shutting down server...")
     self.send_response(200)
     self.send_header('Content-type', 'text/html')
     self.end_headers()
     self.server.stop = True
Example #60
0
 def on_remove_node(self, node, nodes, master, user, user_shell, volumes):
     log.info("Removing %s from TMUX Control Center" % node.alias)