def _setup_slurm_conf(self):
    """
    Compose ``slurm.conf`` and write it to the NFS-shared location.

    Builds the config from the project template, including one node line
    per worker in ``Ready``/``Startup`` state, and retries the (occasionally
    unavailable) NFS write up to 5 times, serialized via the slurm lock file.
    """
    def _worker_nodes_conf():
        """
        Compose the conf text pertaining to the worker nodes.

        Return two strings: one-per-line node specifications (eg,
        ``NodeName=w1 NodeAddr=<private_IP> Weight=5 State=UNKNOWN``) and a
        comma-prefixed list of node names (eg, ``,w1,w2``). Note that only
        nodes in status ``Ready`` or ``Startup`` will be included.
        """
        node_specs = []
        node_names = []
        for w in self.app.manager.worker_instances:
            if w.worker_status in ['Ready', 'Startup']:
                node_specs.append(
                    'NodeName={0} NodeAddr={1} CPUs={2} RealMemory={3} '
                    'Weight=5 State=UNKNOWN\n'.format(
                        w.alias, w.private_ip, w.num_cpus,
                        max(1, w.total_memory / 1024)))
                node_names.append(w.alias)
        log.debug("Worker node names to include in slurm.conf: {0}".format(
            ','.join(node_names)))
        # Preserve the historical leading-comma format of the names string
        worker_names = ''.join(',{0}'.format(n) for n in node_names)
        return ''.join(node_specs), worker_names

    def _build_slurm_conf():
        """Fill in the slurm.conf template; return the config as a string."""
        log.debug("Setting slurm.conf parameters")
        # Make sure the slurm root dir exists and is owned by slurm user
        misc.make_dir(self.app.path_resolver.slurm_root_tmp)
        os.chown(self.app.path_resolver.slurm_root_tmp,
                 pwd.getpwnam("slurm")[2], grp.getgrnam("slurm")[2])
        worker_nodes, worker_names = _worker_nodes_conf()
        slurm_conf_template = conf_manager.load_conf_template(
            conf_manager.SLURM_CONF_TEMPLATE)
        slurm_conf_params = {
            "master_hostname": misc.get_hostname(),
            "num_cpus": max(self.app.manager.num_cpus - 1, 1),  # Reserve 1 CPU
            "total_memory": max(1, self.app.manager.total_memory / 1024),
            "slurm_root_tmp": self.app.path_resolver.slurm_root_tmp,
            "worker_nodes": worker_nodes,
            "worker_names": worker_names
        }
        return slurm_conf_template.substitute(slurm_conf_params)

    if not os.path.exists(self.app.path_resolver.slurm_root_nfs):
        misc.make_dir(self.app.path_resolver.slurm_root_nfs)
    nfs_slurm_conf = self.app.path_resolver.slurm_conf_nfs
    # NOTE(review): local_slurm_conf is not used within this method; kept in
    # case later (unseen) code relies on the attribute access — TODO confirm.
    local_slurm_conf = self.app.path_resolver.slurm_conf_local
    # Occasionally, the NFS file is unavailable so try a few times.
    # Count attempts from 1 so the log reads 1/5..5/5 (was 0/5..4/5).
    for attempt in range(1, 6):
        with flock(self.slurm_lock_file):
            log.debug("Setting up {0} (attempt {1}/5)".format(
                nfs_slurm_conf, attempt))
            try:
                with open(nfs_slurm_conf, 'w') as f:
                    # Py2/Py3-compatible replacement for `print >> f, ...`
                    f.write(_build_slurm_conf() + '\n')
                log.debug("Created slurm.conf as {0}".format(nfs_slurm_conf))
                break
            except IOError as e:
                log.error("Trouble creating {0}: {1}".format(
                    nfs_slurm_conf, e))
                time.sleep(2)
def reload_nfs_exports(force=False):
    """
    Restart the NFS server so changes to ``/etc/exports`` take effect.

    The restart is performed only when the ``NFSExport.nfs_dirty`` flag is
    set, unless ``force`` is ``True``; on a successful restart the dirty
    flag is cleared.

    :type force: bool
    :param force: Force reload, even if dirty flag is not set. Default is
                  False.
    """
    if not (NFSExport.nfs_dirty or force):
        return
    # Serialize restarts across threads/processes via the lock file
    with flock(NFSExport.nfs_lock_file):
        restarted = run("/etc/init.d/nfs-kernel-server restart",
                        "Error restarting NFS server",
                        "Successfully restarted NFS server")
        if restarted:
            NFSExport.nfs_dirty = False
def status(self):
    """
    Do a status update for the current file system, checking
    if the file system is mounted to a location based on its name.
    Set state to RUNNING if the file system is accessible, otherwise
    set state to ERROR.
    """
    # log.debug("Updating service '%s-%s' status; current state: %s" \
    #     % (self.svc_type, self.name, self.state))
    if self.dirty:
        # First check if the NFS server needs to be restarted but do it one thread at a time
        with flock(self.nfs_lock_file):
            if run("/etc/init.d/nfs-kernel-server restart", "Error restarting NFS server", \
                   "As part of %s filesystem update, successfully restarted NFS server" \
                   % self.name):
                self.dirty = False
    # Transient storage file system has its own process for checking status
    if len(self.transient_storage) > 0:
        for ts in self.transient_storage:
            ts.status()
        return
    # TODO: Move volume-specific checks into volume.py
    if self._service_transitioning():
        # Service is mid-transition; leave state as-is this cycle
        pass
    elif self._service_starting():
        # Service is still starting; leave state as-is this cycle
        pass
    elif self.mount_point is not None:
        # Look up "<device> <mount path>" for this mount point in /proc/mounts.
        # NOTE: `commands` is Python 2 only (removed in py3 in favor of subprocess).
        mnt_location = commands.getstatusoutput("cat /proc/mounts | grep %s | cut -d' ' -f1,2" \
            % self.mount_point)
        if mnt_location[0] == 0 and mnt_location[1] != '':
            try:
                # mnt_location[1] is expected to be a single "device path" pair;
                # a multi-line grep match would make this split raise below.
                device, mnt_path = mnt_location[1].split(' ')
                # Check volume(s) if part of the file system (but because boto
                # filtering works only on ec2, for now, do this check only on ec2)
                if len(self.volumes) > 0:
                    self.check_and_update_volume(self._get_attach_device_from_device(device))
                # Check mount point
                if mnt_path == self.mount_point:
                    self.state = service_states.RUNNING
                    self._update_size()
                else:
                    log.error("STATUS CHECK: Retrieved mount path '%s' does not match "
                              "expected path '%s'" % (mnt_location[1], self.mount_point))
                    self.state = service_states.ERROR
            except Exception, e:
                log.error("STATUS CHECK: Exception checking status of FS '%s': %s" % (self.name, e))
                self.state = service_states.ERROR
                log.debug(mnt_location)
        else:
            # grep found nothing (or the command failed): the FS is not mounted
            log.error("STATUS CHECK: File system named '%s' is not mounted. Error code %s" \
                % (self.name, mnt_location[0]))
            self.state = service_states.ERROR
def add_nfs_share(mount_point, permissions='rw'):
    """
    Share the given/current file system/mount point over NFS. Note that if
    the given mount point already exists in /etc/exports, replace the
    existing line with the line composed within this method.

    :type mount_point: string
    :param mount_point: The mount point to add to the NFS share

    :type permissions: string
    :param permissions: Choose the type of permissions for the hosts
                        mounting this NFS mount point. Use: 'rw' for
                        read-write (default) or 'ro' for read-only
    """
    log.debug("Will attempt to share mount point {0} over NFS.".format(
        mount_point))
    try:
        if not mount_point:
            raise Exception("add_nfs_share: No mount point provided")
        # The exports(5) entry for this mount point.
        # NOTE: with Spot instances, should we use 'async' vs. 'sync' option?
        # See: http://linux.die.net/man/5/exports
        export_entry = "{mp}\t*({perms},sync,no_root_squash,no_subtree_check)\n"\
            .format(mp=mount_point, perms=permissions)
        # Only one process at a time may manipulate the exports file
        with flock(NFSExport.nfs_lock_file):
            with open(NFSExport.ee_file) as exports:
                entries = exports.readlines()
            existing_line = NFSExport.find_mount_point_entry(mount_point)
            # Replace an existing entry for this mount point (this supports
            # changing its permissions); otherwise append a new one.
            if existing_line > -1:
                entries[existing_line] = export_entry
            else:
                entries.append(export_entry)
            # Write out the newly composed file
            with open(NFSExport.ee_file, 'w') as exports:
                exports.writelines(entries)
            log.debug("Added '{0}' line to NFS file {1}".format(
                export_entry.strip(), NFSExport.ee_file))
            # Mark the NFS server as being in need of a restart
            NFSExport.nfs_dirty = True
        return True
    except Exception as e:
        log.error("Error configuring {0} file for NFS: {1}".format(
            NFSExport.ee_file, e))
        return False
def add_nfs_share(mount_point, permissions='rw'):
    """
    Share the given/current file system/mount point over NFS. Note that if
    the given mount point already exists in /etc/exports, replace the
    existing line with the line composed within this method.

    :type mount_point: string
    :param mount_point: The mount point to add to the NFS share

    :type permissions: string
    :param permissions: Choose the type of permissions for the hosts
                        mounting this NFS mount point. Use: 'rw' for
                        read-write (default) or 'ro' for read-only
    """
    log.debug("Will attempt to share mount point {0} over NFS.".format(mount_point))
    try:
        if not mount_point:
            raise Exception("add_nfs_share: No mount point provided")
        # Build the /etc/exports line for this mount point.
        # NOTE: with Spot instances, should we use 'async' vs. 'sync' option?
        # See: http://linux.die.net/man/5/exports
        line = ("{mp}\t*({perms},sync,no_root_squash,no_subtree_check)\n"
                .format(mp=mount_point, perms=permissions))
        # Guard the exports file against concurrent edits
        with flock(NFSExport.nfs_lock_file):
            with open(NFSExport.ee_file) as f:
                lines = f.readlines()
            idx = NFSExport.find_mount_point_entry(mount_point)
            if idx > -1:
                # Already exported: overwrite in place so a permissions
                # change takes effect
                lines[idx] = line
            else:
                lines.append(line)
            with open(NFSExport.ee_file, 'w') as f:
                f.writelines(lines)
            log.debug("Added '{0}' line to NFS file {1}".format(
                line.strip(), NFSExport.ee_file))
            # The NFS server must be restarted for the change to apply
            NFSExport.nfs_dirty = True
        return True
    except Exception as e:
        log.error(
            "Error configuring {0} file for NFS: {1}".format(NFSExport.ee_file, e))
        return False
def unmount(self, mount_point):
    """
    Unmount the file system from the specified mount point, removing
    it from NFS in the process (by commenting out its /etc/exports line).
    """
    try:
        escaped = mount_point.replace('/', '\/')  # sed needs escaped slashes
        sed_cmd = "/bin/sed -i 's/^%s/#%s/' /etc/exports" % (escaped, escaped)
        # File systems are unmounted from separate threads, so serialize
        # edits to /etc/exports with the lock file.
        with flock(self.fs.nfs_lock_file):
            commented_out = run(
                sed_cmd,
                "Error removing '%s' from '/etc/exports'" % mount_point,
                "Successfully removed '%s' from '/etc/exports'" % mount_point)
            if commented_out:
                # The NFS server needs restarting to drop the share
                self.fs.dirty = True
    except Exception as e:
        log.debug("Problems configuring NFS or /etc/exports: '%s'" % e)
        return False
def remove_nfs_share(self, mount_point=None):
    """
    Remove the given/current file system/mount point from being shared
    over NFS. The method removes the file system's ``mount_point`` from
    ``/etc/exports`` and indicates that the NFS server needs restarting.

    :type mount_point: string
    :param mount_point: The mount point to stop sharing; defaults to this
                        file system's own ``mount_point``.

    :rtype: bool
    :return: ``True`` if the exports entry was removed; ``False`` on error.
    """
    try:
        ee_file = '/etc/exports'
        if mount_point is None:
            mount_point = self.mount_point
        mount_point = mount_point.replace('/', '\/')  # Escape slashes for sed
        cmd = "sed -i '/^{0}/d' {1}".format(mount_point, ee_file)
        # Fixed log typo: previously read 'NSF share'
        log.debug("Removing NFS share for mount point {0}; cmd: {1}".format(
            mount_point, cmd))
        # To avoid race conditions between threads, use a lock file
        with flock(self.nfs_lock_file):
            run(cmd)
            # Flag the NFS server for a restart so the removal takes effect
            self.dirty = True
        return True
    except Exception as e:
        log.error("Error removing FS {0} share from NFS: {1}".format(
            mount_point, e))
        return False
def remove_nfs_share(mount_point):
    """
    Remove the given/current file system/mount point from being shared
    over NFS. The method removes the file system's ``mount_point`` from
    ``/etc/exports`` and indicates that the NFS server needs restarting.

    :type mount_point: string
    :param mount_point: The mount point to stop sharing.

    :rtype: bool
    :return: ``True`` if the exports entry was removed; ``False`` on error.
    """
    try:
        if not mount_point:
            raise Exception("remove_nfs_share: No mount point provided")
        mount_point = mount_point.replace('/', '\/')  # Escape slashes for sed
        cmd = "sed -i '/^{0}/d' {1}".format(mount_point, NFSExport.ee_file)
        # Fixed log typo: previously read 'NSF share'
        log.debug("Removing NFS share for mount point {0}; cmd: {1}".format(
            mount_point, cmd))
        # To avoid race conditions between threads, use a lock file
        with flock(NFSExport.nfs_lock_file):
            run(cmd)
            # Flag the NFS server for a restart so the removal takes effect
            NFSExport.nfs_dirty = True
        return True
    except Exception as e:
        log.error("Error removing FS {0} share from NFS: {1}".format(
            mount_point, e))
        return False
def remove_nfs_share(mount_point):
    """
    Remove the given/current file system/mount point from being shared
    over NFS. The method removes the file system's ``mount_point`` from
    ``/etc/exports`` and indicates that the NFS server needs restarting.

    :type mount_point: string
    :param mount_point: The mount point to stop sharing.

    :rtype: bool
    :return: ``True`` if the exports entry was removed; ``False`` on error.
    """
    try:
        if not mount_point:
            raise Exception("remove_nfs_share: No mount point provided")
        mount_point = mount_point.replace('/', '\/')  # Escape slashes for sed
        cmd = "sed -i '/^{0}/d' {1}".format(mount_point, NFSExport.ee_file)
        # Fixed log typo: previously read 'NSF share'
        log.debug(
            "Removing NFS share for mount point {0}; cmd: {1}".format(
                mount_point, cmd))
        # To avoid race conditions between threads, use a lock file
        with flock(NFSExport.nfs_lock_file):
            run(cmd)
            # Flag the NFS server for a restart so the removal takes effect
            NFSExport.nfs_dirty = True
        return True
    except Exception as e:
        log.error("Error removing FS {0} share from NFS: {1}".format(
            mount_point, e))
        return False