Example #1
0
    def _setup_slurm_conf(self):
        """
        Set up the ``slurm.conf`` configuration file
        """
        def _worker_nodes_conf():
            """
            Compose the conf lines pertaining to the worker nodes. Return two
            strings: the node specifications, one per line (e.g.,
            ``NodeName=w1 NodeAddr=<private_IP> Weight=5 State=UNKNOWN``), and
            a comma-prefixed list of node names (e.g., ``,w1,w2``).
            Note that only nodes in status ``Ready`` or ``Startup`` will be
            included.
            """
            wnc = ''
            wnn = ''
            for i, w in enumerate(self.app.manager.worker_instances):
                if w.worker_status in ['Ready', 'Startup']:
                    wnc += ('NodeName={0} NodeAddr={1} CPUs={2} RealMemory={3} Weight=5 State=UNKNOWN\n'
                            .format(w.alias, w.private_ip, w.num_cpus,
                                    max(1, w.total_memory / 1024)))
                    wnn += ',{0}'.format(w.alias)
            log.debug("Worker node names to include in slurm.conf: {0}".format(wnn[1:]))
            return wnc, wnn

        def _build_slurm_conf():
            log.debug("Setting slurm.conf parameters")
            # Make sure the slurm root dir exists and is owned by slurm user
            misc.make_dir(self.app.path_resolver.slurm_root_tmp)
            os.chown(self.app.path_resolver.slurm_root_tmp,
                     pwd.getpwnam("slurm")[2], grp.getgrnam("slurm")[2])
            worker_nodes, worker_names = _worker_nodes_conf()
            slurm_conf_template = conf_manager.load_conf_template(conf_manager.SLURM_CONF_TEMPLATE)
            slurm_conf_params = {
                "master_hostname": misc.get_hostname(),
                "num_cpus": max(self.app.manager.num_cpus - 1, 1),  # Reserve 1 CPU
                "total_memory": max(1, self.app.manager.total_memory / 1024),
                "slurm_root_tmp": self.app.path_resolver.slurm_root_tmp,
                "worker_nodes": worker_nodes,
                "worker_names": worker_names
            }
            return slurm_conf_template.substitute(slurm_conf_params)

        if not os.path.exists(self.app.path_resolver.slurm_root_nfs):
            misc.make_dir(self.app.path_resolver.slurm_root_nfs)
        nfs_slurm_conf = self.app.path_resolver.slurm_conf_nfs
        local_slurm_conf = self.app.path_resolver.slurm_conf_local
        # Occasionally, the NFS file is unavailable, so try a few times
        for i in range(5):
            with flock(self.slurm_lock_file):
                log.debug("Setting up {0} (attempt {1}/5)".format(nfs_slurm_conf, i))
                try:
                    with open(nfs_slurm_conf, 'w') as f:
                        print >> f, _build_slurm_conf()
                    log.debug("Created slurm.conf as {0}".format(nfs_slurm_conf))
                    break
                except IOError, e:
                    log.error("Trouble creating {0}: {1}".format(nfs_slurm_conf, e))
                    time.sleep(2)
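The ``slurm_conf_template.substitute(...)`` call above relies on Python's ``string.Template``. A minimal sketch of that substitution, using a hypothetical template fragment rather than the project's actual ``SLURM_CONF_TEMPLATE``, shows how the comma-prefixed ``worker_names`` string slots into a partition definition:

    from string import Template

    # Hypothetical slurm.conf fragment; the real template is loaded via
    # conf_manager.load_conf_template(conf_manager.SLURM_CONF_TEMPLATE).
    SLURM_CONF_FRAGMENT = Template(
        "ControlMachine=${master_hostname}\n"
        "NodeName=master CPUs=${num_cpus} RealMemory=${total_memory}\n"
        "${worker_nodes}"
        "PartitionName=main Nodes=master${worker_names} Default=YES\n")

    print(SLURM_CONF_FRAGMENT.substitute({
        "master_hostname": "ip-10-0-0-1",
        "num_cpus": 3,
        "total_memory": 7,
        "worker_nodes": ("NodeName=w1 NodeAddr=10.0.0.2 CPUs=4 RealMemory=7 "
                         "Weight=5 State=UNKNOWN\n"),
        "worker_names": ",w1",
    }))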
Example #3
0
    def reload_nfs_exports(force=False):
        """
        Reloads NFS exports if required (dirty flag is set).

        :type force: bool
        :param force: Force reload, even if dirty flag is not set. Default is False.
        """
        if NFSExport.nfs_dirty or force:
            with flock(NFSExport.nfs_lock_file):
                if run("/etc/init.d/nfs-kernel-server restart", "Error restarting NFS server",
                        "Successfully restarted NFS server"):
                    NFSExport.nfs_dirty = False
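Every snippet here serializes file access through a ``flock`` context manager that is not part of the excerpts. A minimal sketch of what an ``fcntl``-based implementation could look like (the name and behaviour are assumptions, not the project's actual helper):

    import fcntl
    from contextlib import contextmanager

    @contextmanager
    def flock(lock_file):
        """Hold an exclusive advisory lock on ``lock_file`` for the duration of the block."""
        with open(lock_file, 'w') as fh:
            fcntl.flock(fh, fcntl.LOCK_EX)      # Blocks until the lock is acquired
            try:
                yield
            finally:
                fcntl.flock(fh, fcntl.LOCK_UN)  # Release before the file is closed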
Example #4
0
 def status(self):
     """
     Do a status update for the current file system, checking
     if the file system is mounted to a location based on its name.
     Set state to RUNNING if the file system is accessible, otherwise
     set state to ERROR.
     """
     # log.debug("Updating service '%s-%s' status; current state: %s" \
     #   % (self.svc_type, self.name, self.state))
     if self.dirty:
         # First check if the NFS server needs to be restarted but do it one thread at a time
         with flock(self.nfs_lock_file):
             if run("/etc/init.d/nfs-kernel-server restart", "Error restarting NFS server", \
                 "As part of %s filesystem update, successfully restarted NFS server" \
                 % self.name):
                 self.dirty = False
     # Transient storage file system has its own process for checking status
     if len(self.transient_storage) > 0:
         for ts in self.transient_storage:
             ts.status()
         return
     # TODO: Move volume-specific checks into volume.py
     if self._service_transitioning():
         pass
     elif self._service_starting():
         pass
     elif self.mount_point is not None:
         mnt_location = commands.getstatusoutput("cat /proc/mounts | grep %s | cut -d' ' -f1,2" \
             % self.mount_point)
         if mnt_location[0] == 0 and mnt_location[1] != '':
             try:
                 device, mnt_path = mnt_location[1].split(' ')
                 # Check volume(s) if part of the file system (but because boto
                 # filtering works only on ec2, for now, do this check only on ec2)
                 if len(self.volumes) > 0:
                     self.check_and_update_volume(self._get_attach_device_from_device(device))
                 # Check mount point
                 if mnt_path == self.mount_point:
                     self.state = service_states.RUNNING
                     self._update_size()
                 else:
                     log.error("STATUS CHECK: Retrieved mount path '%s' does not match "
                         "expected path '%s'" % (mnt_location[1], self.mount_point))
                     self.state = service_states.ERROR
             except Exception, e:
                 log.error("STATUS CHECK: Exception checking status of FS '%s': %s" % (self.name, e))
                 self.state = service_states.ERROR
                 log.debug(mnt_location)
         else:
             log.error("STATUS CHECK: File system named '%s' is not mounted. Error code %s" \
                 % (self.name, mnt_location[0]))
             self.state = service_states.ERROR
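The ``cat /proc/mounts | grep ... | cut`` pipeline above can also match entries whose device or path merely contains the mount point as a substring. A hedged alternative (not part of the original code) that reads ``/proc/mounts`` directly and compares the mount path exactly:

    def find_mount_entry(mount_point, mounts_file='/proc/mounts'):
        """Return (device, mount_path) for ``mount_point``, or None if it is not mounted."""
        with open(mounts_file) as f:
            for line in f:
                fields = line.split()
                # /proc/mounts format: device mount_point fstype options dump pass
                if len(fields) >= 2 and fields[1] == mount_point:
                    return fields[0], fields[1]
        return None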
Example #5
0
    def reload_nfs_exports(force=False):
        """
        Reloads NFS exports if required (dirty flag is set).

        :type force: bool
        :param force: Force reload, even if dirty flag is not set. Default is False.
        """
        if NFSExport.nfs_dirty or force:
            with flock(NFSExport.nfs_lock_file):
                if run("/etc/init.d/nfs-kernel-server restart",
                       "Error restarting NFS server",
                       "Successfully restarted NFS server"):
                    NFSExport.nfs_dirty = False
Example #6
0
    def add_nfs_share(mount_point, permissions='rw'):
        """
        Share the given/current file system/mount point over NFS. Note that
        if the given mount point already exists in /etc/exports, replace
        the existing line with the line composed within this method.

        :type mount_point: string
        :param mount_point: The mount point to add to the NFS share

        :type permissions: string
        :param permissions: Choose the type of permissions for the hosts
                            mounting this NFS mount point. Use: 'rw' for
                            read-write (default) or 'ro' for read-only
        """
        log.debug("Will attempt to share mount point {0} over NFS.".format(
            mount_point))
        try:
            if not mount_point:
                raise Exception("add_nfs_share: No mount point provided")
            # Compose the line that will be put into /etc/exports
            # NOTE: with Spot instances, should we use 'async' vs. 'sync' option?
            # See: http://linux.die.net/man/5/exports
            ee_line = "{mp}\t*({perms},sync,no_root_squash,no_subtree_check)\n"\
                .format(mp=mount_point, perms=permissions)
            # Make sure only a single process manipulates ee_file at a time
            with flock(NFSExport.nfs_lock_file):
                # Determine if the given mount point is already shared
                with open(NFSExport.ee_file) as f:
                    shared_paths = f.readlines()

                mp_line_no = NFSExport.find_mount_point_entry(mount_point)
                # If the mount point is already in /etc/exports, replace the existing
                # entry with the newly composed ee_line (thus supporting change of
                # permissions). Otherwise, append ee_line to the end of the
                # file.
                if mp_line_no > -1:
                    shared_paths[mp_line_no] = ee_line
                else:
                    shared_paths.append(ee_line)
                # Write out the newly composed file
                with open(NFSExport.ee_file, 'w') as f:
                    f.writelines(shared_paths)
                log.debug("Added '{0}' line to NFS file {1}".format(
                    ee_line.strip(), NFSExport.ee_file))
            # Mark the NFS server as being in need of a restart
            NFSExport.nfs_dirty = True
            return True
        except Exception, e:
            log.error("Error configuring {0} file for NFS: {1}".format(
                NFSExport.ee_file, e))
            return False
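``NFSExport.find_mount_point_entry`` is referenced above but not shown. A plausible sketch, assuming it scans ``/etc/exports`` for a line whose exported path equals the mount point and returns that line's index (or -1 when absent):

    def find_mount_point_entry(mount_point, ee_file='/etc/exports'):
        """Return the index of the line in ``ee_file`` exporting ``mount_point``, or -1."""
        with open(ee_file) as f:
            for i, line in enumerate(f):
                fields = line.split()
                # An exports entry starts with the exported path, followed by host options
                if fields and fields[0] == mount_point:
                    return i
        return -1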
Example #7
0
    def add_nfs_share(mount_point, permissions='rw'):
        """
        Share the given/current file system/mount point over NFS. Note that
        if the given mount point already exists in /etc/exports, replace
        the existing line with the line composed within this method.

        :type mount_point: string
        :param mount_point: The mount point to add to the NFS share

        :type permissions: string
        :param permissions: Choose the type of permissions for the hosts
                            mounting this NFS mount point. Use: 'rw' for
                            read-write (default) or 'ro' for read-only
        """
        log.debug("Will attempt to share mount point {0} over NFS.".format(mount_point))
        try:
            if not mount_point:
                raise Exception("add_nfs_share: No mount point provided")
            # Compose the line that will be put into /etc/exports
            # NOTE: with Spot instances, should we use 'async' vs. 'sync' option?
            # See: http://linux.die.net/man/5/exports
            ee_line = "{mp}\t*({perms},sync,no_root_squash,no_subtree_check)\n"\
                .format(mp=mount_point, perms=permissions)
            # Make sure only a single process manipulates ee_file at a time
            with flock(NFSExport.nfs_lock_file):
                # Determine if the given mount point is already shared
                with open(NFSExport.ee_file) as f:
                    shared_paths = f.readlines()

                mp_line_no = NFSExport.find_mount_point_entry(mount_point)
                # If the mount point is already in /etc/exports, replace the existing
                # entry with the newly composed ee_line (thus supporting change of
                # permissions). Otherwise, append ee_line to the end of the
                # file.
                if mp_line_no > -1:
                    shared_paths[mp_line_no] = ee_line
                else:
                    shared_paths.append(ee_line)
                # Write out the newly composed file
                with open(NFSExport.ee_file, 'w') as f:
                    f.writelines(shared_paths)
                log.debug("Added '{0}' line to NFS file {1}".format(
                    ee_line.strip(), NFSExport.ee_file))
            # Mark the NFS server as being in need of a restart
            NFSExport.nfs_dirty = True
            return True
        except Exception, e:
            log.error(
                "Error configuring {0} file for NFS: {1}".format(NFSExport.ee_file, e))
            return False
Example #8
0
 def unmount(self, mount_point):
     """
     Unmount the file system from the specified mount point, removing it from
     NFS in the process.
     """
     try:
         mp = mount_point.replace('/', '\/')  # Escape slashes for sed
         # Because we're unmounting the file systems in separate threads, use a lock file
         with flock(self.fs.nfs_lock_file):
             if run("/bin/sed -i 's/^%s/#%s/' /etc/exports" % (mp, mp),
                     "Error removing '%s' from '/etc/exports'" % mount_point,
                     "Successfully removed '%s' from '/etc/exports'" % mount_point):
                 self.fs.dirty = True
     except Exception, e:
         log.debug("Problems configuring NFS or /etc/exports: '%s'" % e)
         return False
Example #9
0
 def remove_nfs_share(self, mount_point=None):
     """
     Remove the given/current file system/mount point from being shared
     over NFS. The method removes the file system's ``mount_point`` from
      ``/etc/exports`` and indicates that the NFS server needs restarting.
     """
     try:
         ee_file = '/etc/exports'
         if mount_point is None:
             mount_point = self.mount_point
         mount_point = mount_point.replace('/', '\/') # Escape slashes for sed
         cmd = "sed -i '/^{0}/d' {1}".format(mount_point, ee_file)
         log.debug("Removing NSF share for mount point {0}; cmd: {1}".format(mount_point, cmd))
         # To avoid race conditions between threads, use a lock file
         with flock(self.nfs_lock_file):
             run(cmd)
         self.dirty = True
         return True
     except Exception, e:
         log.error("Error removing FS {0} share from NFS: {1}".format(mount_point, e))
         return False
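The ``run`` helper used throughout (a shell command plus optional error and success messages) is likewise external to these excerpts. A minimal sketch of the assumed behaviour, built on ``subprocess`` (not the project's actual implementation):

    import logging
    import subprocess

    log = logging.getLogger(__name__)

    def run(cmd, err_msg=None, ok_msg=None):
        """Run ``cmd`` through the shell; return True on exit code 0, False otherwise."""
        ret = subprocess.call(cmd, shell=True)
        if ret == 0:
            if ok_msg:
                log.debug(ok_msg)
            return True
        if err_msg:
            log.error("%s (exit code %s)", err_msg, ret)
        return False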
Example #10
0
 def remove_nfs_share(mount_point):
     """
     Remove the given/current file system/mount point from being shared
     over NFS. The method removes the file system's ``mount_point`` from
      ``/etc/exports`` and indicates that the NFS server needs restarting.
     """
     try:
         if not mount_point:
             raise Exception("remove_nfs_share: No mount point provided")
         mount_point = mount_point.replace(
             '/', '\/')  # Escape slashes for sed
         cmd = "sed -i '/^{0}/d' {1}".format(mount_point, NFSExport.ee_file)
         log.debug("Removing NSF share for mount point {0}; cmd: {1}".format(
             mount_point, cmd))
         # To avoid race conditions between threads, use a lock file
         with flock(NFSExport.nfs_lock_file):
             run(cmd)
         NFSExport.nfs_dirty = True
         return True
     except Exception, e:
         log.error("Error removing FS {0} share from NFS: {1}".format(
             mount_point, e))
         return False
Example #11
0
 def remove_nfs_share(mount_point):
     """
     Remove the given/current file system/mount point from being shared
     over NFS. The method removes the file system's ``mount_point`` from
      ``/etc/exports`` and indicates that the NFS server needs restarting.
     """
     try:
         if not mount_point:
             raise Exception("remove_nfs_share: No mount point provided")
         mount_point = mount_point.replace('/',
                                           '\/')  # Escape slashes for sed
         cmd = "sed -i '/^{0}/d' {1}".format(mount_point, NFSExport.ee_file)
         log.debug(
             "Removing NSF share for mount point {0}; cmd: {1}".format(
                 mount_point, cmd))
         # To avoid race conditions between threads, use a lock file
         with flock(NFSExport.nfs_lock_file):
             run(cmd)
         NFSExport.nfs_dirty = True
         return True
     except Exception, e:
         log.error("Error removing FS {0} share from NFS: {1}".format(
             mount_point, e))
         return False