def __init__(self, vo_id, storage=None, rest_client=None, host_institute=GENT):
    """Initialise"""
    super(VscTier2AccountpageVo, self).__init__(vo_id, rest_client)

    self.vo_id = vo_id
    self.vsc = VSC()
    self.host_institute = host_institute

    if not storage:
        self.storage = VscStorage()
    else:
        self.storage = storage

    self.gpfs = GpfsOperations()
    self.posix = PosixOperations()

    self.dry_run = False

    self._vo_data_quota_cache = None
    self._vo_data_shared_quota_cache = None
    self._vo_scratch_quota_cache = None
    self._institute_quota_cache = None
    self._sharing_group_cache = None
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem])
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit",
                                filesystem, len(cfs))
            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    backend = opts.options.backend
    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # bail out early: logging.exception belongs in an except block, and
            # falling through here would leave storage_backend unbound
            logger.error("Backend %s not supported", backend)
            raise ValueError("Backend %s not supported" % backend)

        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "%s_quota_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]).encode())
                zipfile.close()
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining %s quota", backend)
        opts.critical("Failure to obtain %s quota information" % backend)

    opts.epilogue("Logged %s quota" % backend, stats)
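# A minimal sketch (not part of the original scripts) showing how the gzipped
# JSON quota logs written above could be read back for inspection. Only the
# file-naming convention is taken from the code above; the example path is
# hypothetical.
import gzip
import json

def load_quota_log(path):
    """Load a gzipped JSON quota log as written by the logging script above."""
    with gzip.open(path, 'rb') as infile:
        return json.loads(infile.read().decode())

# Hypothetical filename following the "<backend>_quota_<timestamp>_<fs>.gz" pattern:
# quota = load_quota_log('/var/log/quota/gpfs_quota_20240101-12:00_scratchfs.gz')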
def main():
    storage_settings = VscStorage()

    local_storage_conf = configparser.ConfigParser()  # SafeConfigParser is a deprecated alias
    local_storage_conf.read(QUOTA_CONF_FILE)

    gpfs = GpfsOperations()
    gpfs.list_filesystems()
    gpfs.list_filesets()

    for storage_name in local_storage_conf.get('MAIN', 'storage').split(','):
        filesystem_name = storage_settings[storage_name].filesystem
        filesystem_info = gpfs.get_filesystem_info(filesystem_name)

        if storage_name in ('VSC_HOME',):  # trailing comma: ('VSC_HOME') is just a string
            set_up_filesystem(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name)
            set_up_apps(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name)
        else:
            set_up_filesystem(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name,
                              vo_support=True)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS quota", stats)
def __init__(self, user_id, storage=None, pickle_storage=None, rest_client=None,
             account=None, pubkeys=None, host_institute=None, use_user_cache=False):
    """
    Initialisation.
    @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
    """
    super(VscTier2AccountpageUser, self).__init__(user_id, rest_client, account=account,
                                                  pubkeys=pubkeys, use_user_cache=use_user_cache)

    # Move to vsc-config?
    default_pickle_storage = {
        GENT: VSC_SCRATCH_KYUKON,
        BRUSSEL: VSC_SCRATCH_THEIA,
    }

    if host_institute is None:
        host_institute = GENT
    self.host_institute = host_institute

    if pickle_storage is None:
        pickle_storage = default_pickle_storage[host_institute]
    self.pickle_storage = pickle_storage

    if storage is None:
        storage = VscStorage()

    self.institute_path_templates = storage.path_templates[self.host_institute]
    self.institute_storage = storage[self.host_institute]

    self.vsc = VSC()
    self.gpfs = GpfsOperations()  # Only used when needed
    self.posix = PosixOperations()
def __init__(self, user_id, storage=None, pickle_storage='VSC_SCRATCH_KYUKON', rest_client=None,
             account=None, pubkeys=None, host_institute=None, use_user_cache=False):
    """
    Initialisation.
    @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
    """
    super(VscTier2AccountpageUser, self).__init__(user_id, rest_client, account=account,
                                                  pubkeys=pubkeys, use_user_cache=use_user_cache)

    self.pickle_storage = pickle_storage
    if not storage:
        self.storage = VscStorage()
    else:
        self.storage = storage

    self.vsc = VSC()
    self.gpfs = GpfsOperations()  # Only used when needed
    self.posix = PosixOperations()
    self.host_institute = host_institute
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    filesystem_error = 0
    filesystem_ok = 0
    error = False

    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """
    Main script.
    - process the users and VOs
    - write the new timestamp if everything went OK
    - write the nagios check file
    """

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        storage_settings = VscStorage()

        gpfs = GpfsOperations()
        gpfs.list_filesystems()
        gpfs.list_filesets()

        for storage_name in opts.options.storage:
            filesystem_name = storage_settings[storage_name].filesystem
            filesystem_info = gpfs.get_filesystem_info(filesystem_name)

            set_up_filesystem(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name,
                              vo_support=True, dry_run=opts.options.dry_run)
    except Exception as err:
        logging.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios-check-filename': ('filename of where the nagios check data is stored', str, 'store',
                                  NAGIOS_CHECK_FILENAME),
        'nagios-check-interval-threshold': ('threshold of nagios checks timing out', None, 'store',
                                            NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }

    opts = simple_option(options)

    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)

    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(QUOTA_LOG_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting quota_log run")

    filesystem_error = 0
    filesystem_ok = 0
    error = False

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        for key in quota:
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()
                filesystem_ok += 1
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                logger.exception("Failed storing quota information for FS %s", key)
                filesystem_error += 1
    except Exception:
        logger.exception("Failure obtaining GPFS quota")
        error = True
class VscTier2AccountpageUser(VscAccountPageUser):
    """
    A user on each of our Tier-2 systems, using the account page REST API
    to retrieve its information.
    """

    def __init__(self, user_id, storage=None, pickle_storage='VSC_SCRATCH_KYUKON', rest_client=None,
                 account=None, pubkeys=None, host_institute=None, use_user_cache=False):
        """
        Initialisation.
        @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
        """
        super(VscTier2AccountpageUser, self).__init__(user_id, rest_client, account=account,
                                                      pubkeys=pubkeys, use_user_cache=use_user_cache)

        self.pickle_storage = pickle_storage
        if not storage:
            self.storage = VscStorage()
        else:
            self.storage = storage

        self.vsc = VSC()
        self.gpfs = GpfsOperations()  # Only used when needed
        self.posix = PosixOperations()
        self.host_institute = host_institute

    def _init_cache(self, **kwargs):
        super(VscTier2AccountpageUser, self)._init_cache(**kwargs)
        self._cache['quota'] = {}

    @property
    def user_home_quota(self):
        if not self._cache['quota']:
            self._init_quota_cache()
        return self._cache['quota']['home']

    @property
    def user_data_quota(self):
        if not self._cache['quota']:
            self._init_quota_cache()
        return self._cache['quota']['data']

    @property
    def user_scratch_quota(self):
        if not self._cache['quota']:
            self._init_quota_cache()
        return self._cache['quota']['scratch']

    @property
    def vo_data_quota(self):
        if not self._cache['quota']:
            self._init_quota_cache()
        return self._cache['quota']['vo']['data']

    @property
    def vo_scratch_quota(self):
        if not self._cache['quota']:
            self._init_quota_cache()
        return self._cache['quota']['vo']['scratch']

    def _init_quota_cache(self):
        if self.host_institute is None:
            logging.warning("_init_quota_cache with host_institute None")
        all_quota = [mkVscUserSizeQuota(q) for q in self.rest_client.account[self.user_id].quota.get()[1]]
        # we no longer set defaults, since we do not want to accidentally revert people to some default
        # that is lower than their actual quota if the accountpage goes down in between retrieving the users
        # and fetching the quota
        institute_quota = [q for q in all_quota if q.storage['institute'] == self.host_institute]
        fileset_name = self.vsc.user_grouping_fileset(self.account.vsc_id)

        def user_proposition(quota, storage_type):
            return quota.fileset == fileset_name and quota.storage['storage_type'] == storage_type

        # Non-UGent users who have quota in Gent, e.g., in a VO, should not have these set
        if self.person.institute['site'] == self.host_institute:
            self._cache['quota']['home'] = [q.hard for q in institute_quota
                                            if user_proposition(q, 'home')][0]
            self._cache['quota']['data'] = [q.hard for q in institute_quota
                                            if user_proposition(q, 'data')
                                            and not q.storage['name'].endswith('SHARED')][0]
            # a list, not a lazy filter object, since the result is indexed later on
            self._cache['quota']['scratch'] = [q for q in institute_quota if user_proposition(q, 'scratch')]
        else:
            self._cache['quota']['home'] = None
            self._cache['quota']['data'] = None
            self._cache['quota']['scratch'] = None

        fileset_name = 'gvo'

        def user_vo_proposition(quota, storage_type):
            return quota.fileset.startswith(fileset_name) and quota.storage['storage_type'] == storage_type

        self._cache['quota']['vo'] = {}
        self._cache['quota']['vo']['data'] = [q for q in institute_quota if user_vo_proposition(q, 'data')]
        self._cache['quota']['vo']['scratch'] = [q for q in institute_quota if user_vo_proposition(q, 'scratch')]

    def pickle_path(self):
        """Provide the location where to store pickle files for this user.

        This location is the user's path on the pickle_storage specified when creating
        a VscTier2AccountpageUser instance.
        """
        (path, _) = self.storage.path_templates[GENT][self.pickle_storage]['user'](self.account.vsc_id)
        return os.path.join(self.storage[self.pickle_storage].gpfs_mount_point, path)

    def _create_grouping_fileset(self, filesystem_name, path, fileset_name):
        """Create a fileset for a group of 100 user accounts

        - creates the fileset if it does not already exist
        """
        self.gpfs.list_filesets()
        logging.info("Trying to create the grouping fileset %s with link path %s", fileset_name, path)

        if not self.gpfs.get_fileset_info(filesystem_name, fileset_name):
            logging.info("Creating new fileset on %s with name %s and path %s",
                         filesystem_name, fileset_name, path)
            base_dir_hierarchy = os.path.dirname(path)
            self.gpfs.make_dir(base_dir_hierarchy)
            self.gpfs.make_fileset(path, fileset_name)
        else:
            logging.info("Fileset %s already exists for user group of %s ... not creating again.",
                         fileset_name, self.account.vsc_id)

        self.gpfs.chmod(0o755, path)

    def _get_mount_path(self, storage_name, mount_point):
        """Get the mount point for the location we're running"""
        if mount_point == "login":
            mount_path = self.storage[storage_name].login_mount_point
        elif mount_point == "gpfs":
            mount_path = self.storage[storage_name].gpfs_mount_point
        else:
            logging.error("mount_point (%s) is not login or gpfs", mount_point)
            raise Exception("mount_point (%s) is not designated as gpfs or login" % (mount_point,))

        return mount_path

    def _get_path(self, storage_name, mount_point="gpfs"):
        """Get the path for the (if any) user directory on the given storage_name."""
        (path, _) = self.storage.path_templates[GENT][storage_name]['user'](self.account.vsc_id)
        return os.path.join(self._get_mount_path(storage_name, mount_point), path)

    def _get_grouping_path(self, storage_name, mount_point="gpfs"):
        """Get the path and the fileset for the user group directory (and associated fileset)."""
        (path, fileset) = self.storage.path_templates[GENT][storage_name]['user'](self.account.vsc_id)
        return (os.path.join(self._get_mount_path(storage_name, mount_point), os.path.dirname(path)), fileset)

    def _home_path(self, mount_point="gpfs"):
        """Return the path to the home dir."""
        return self._get_path(VSC_HOME, mount_point)

    def _data_path(self, mount_point="gpfs"):
        """Return the path to the data dir."""
        return self._get_path(VSC_DATA, mount_point)

    def _scratch_path(self, storage_name, mount_point="gpfs"):
        """Return the path to the scratch dir."""
        return self._get_path(storage_name, mount_point)

    def _grouping_home_path(self, mount_point="gpfs"):
        """Return the path to the grouping fileset for the users on home."""
        return self._get_grouping_path(VSC_HOME, mount_point)

    def _grouping_data_path(self, mount_point="gpfs"):
        """Return the path to the grouping fileset for the users on data."""
        return self._get_grouping_path(VSC_DATA, mount_point)

    def _grouping_scratch_path(self, storage_name, mount_point="gpfs"):
        """Return the path to the grouping fileset for the users on the given scratch filesystem."""
        return self._get_grouping_path(storage_name, mount_point)

    def _create_user_dir(self, grouping_f, path_f, storage_name):
        """Create the directories and files for some user location.

        @type grouping_f: function that yields the grouping path for the location.
        @type path_f: function that yields the actual path for the location.
        """
        path = None  # keep the exception log below meaningful if grouping_f() raises
        try:
            (grouping_path, fileset) = grouping_f()
            self._create_grouping_fileset(self.storage[storage_name].filesystem, grouping_path, fileset)

            path = path_f()
            if self.gpfs.is_symlink(path):
                logging.warning("Trying to make a user dir, but a symlink already exists at %s", path)
                return

            create_stat_directory(
                path,
                0o700,
                int(self.account.vsc_id_number),
                int(self.usergroup.vsc_id_number),
                self.gpfs
            )
        except Exception:
            logging.exception("Could not create dir %s for user %s", path, self.account.vsc_id)
            raise

    def create_home_dir(self):
        """Create all required files in the (future) user's home directory."""
        self._create_user_dir(self._grouping_home_path, self._home_path, VSC_HOME)

    def create_data_dir(self):
        """Create the user's directory on the HPC data filesystem."""
        self._create_user_dir(self._grouping_data_path, self._data_path, VSC_DATA)

    def create_scratch_dir(self, storage_name):
        """Create the user's directory on the given scratch filesystem."""
        self._create_user_dir(
            lambda: self._grouping_scratch_path(storage_name),
            lambda: self._scratch_path(storage_name),
            storage_name)

    def _set_quota(self, storage_name, path, hard):
        """Set the given quota on the target path.

        @type path: path into a GPFS mount
        @type hard: hard limit
        """
        if not hard:
            logging.error("No user quota set for %s", storage_name)
            return

        # LDAP information is expressed in KiB, GPFS wants bytes.
        quota = hard * 1024 * self.storage[storage_name].data_replication_factor
        soft = int(self.vsc.quota_soft_fraction * quota)

        logging.info("Setting quota for %s on %s to %d", storage_name, path, quota)

        self.gpfs.set_user_quota(soft, int(self.account.vsc_id_number), path, quota)
        self.gpfs.set_user_grace(path, self.vsc.user_storage_grace_time)  # 7 days

    def set_home_quota(self):
        """Set USR quota on the home FS in the user fileset."""
        path = self._home_path()
        hard = self.user_home_quota
        self._set_quota(VSC_HOME, path, hard)

    def set_data_quota(self):
        """Set USR quota on the data FS in the user fileset."""
        (path, _) = self._grouping_data_path()
        hard = self.user_data_quota
        self._set_quota(VSC_DATA, path, hard)

    def set_scratch_quota(self, storage_name):
        """Set USR quota on the scratch FS in the user fileset."""
        # a list comprehension, not a lazy filter object, since the result is indexed below
        quota = [q for q in self.user_scratch_quota if q.storage['name'] in (storage_name,)]
        if not quota:
            logging.error("No scratch quota information available for %s", storage_name)
            return

        if self.storage[storage_name].user_grouping_fileset:
            (path, _) = self._grouping_scratch_path(storage_name)
        else:
            # Hack; this should actually become the link path of the fileset
            # that contains the path (the file, not the followed symlink)
            path = os.path.normpath(os.path.join(self._scratch_path(storage_name), '..'))

        self._set_quota(storage_name, path, quota[0].hard)

    def populate_home_dir(self):
        """Store the required files in the user's home directory.

        Does not overwrite files that may contain user defined content.
        """
        path = self._home_path()
        self.gpfs.populate_home_dir(int(self.account.vsc_id_number),
                                    int(self.usergroup.vsc_id_number),
                                    path,
                                    [p.pubkey for p in self.pubkeys])

    def __setattr__(self, name, value):
        """Override the setting of an attribute:

        - dry_run: set this here and in the gpfs and posix instance fields.
        - otherwise, call super's __setattr__()
        """
        if name == 'dry_run':
            self.gpfs.dry_run = value
            self.posix.dry_run = value
        super(VscTier2AccountpageUser, self).__setattr__(name, value)
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'account_page_url': ('Base URL of the account page', None, 'store',
                             'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
    }
    opts = ExtendedSimpleOption(options)

    try:
        opener = urllib2.build_opener(urllib2.HTTPHandler)
        access_token = opts.options.access_token

        user_id_map = map_uids_to_names()  # is this really necessary?
        LdapQuery(VscConfiguration())
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s", filesystems)

        filesets = gpfs.list_filesets()
        logger.debug("Found the following GPFS filesets: %s", filesets)

        quota = gpfs.list_quota()
        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:
            logger.info("Processing quota for storage_name %s", storage_name)
            filesystem = storage[storage_name].filesystem

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s", filesystem)
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]", storage_name, filesystem)
                continue

            quota_storage_map = get_mmrepquota_maps(quota[filesystem], storage_name, filesystem, filesets)

            exceeding_filesets[storage_name] = process_fileset_quota(storage, gpfs, storage_name, filesystem,
                                                                     quota_storage_map['FILESET'], opener,
                                                                     opts.options.account_page_url, access_token,
                                                                     opts.options.dry_run)
            exceeding_users[storage_name] = process_user_quota(storage, gpfs, storage_name, filesystem,
                                                               quota_storage_map['USR'], user_id_map, opener,
                                                               opts.options.account_page_url, access_token,
                                                               opts.options.dry_run)

            stats["%s_fileset_critical" % (storage_name,)] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name,)] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota",
                               storage_name, len(exceeding_filesets[storage_name]))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s", e_fileset, str(e_quota))
            else:
                stats["%s_fileset" % (storage_name,)] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota", storage_name)

            notify_exceeding_filesets(gpfs=gpfs, storage=storage_name, filesystem=filesystem,
                                      exceeding_items=exceeding_filesets[storage_name],
                                      dry_run=opts.options.dry_run)

            stats["%s_users_warning" % (storage_name,)] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name,)] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name,)] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota",
                               storage_name, len(exceeding_users[storage_name]))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s", e_user_id, str(e_quota))
            else:
                stats["%s_users" % (storage_name,)] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota", storage_name)

            notify_exceeding_users(gpfs=gpfs, storage=storage_name, filesystem=filesystem,
                                   exceeding_items=exceeding_users[storage_name],
                                   dry_run=opts.options.dry_run)
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'account_page_url': ('the URL at which the account page resides', None, 'store', None),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {'master': master, 'path': showq_path}
        logger.debug("clusters = %s", clusters)

        showq = SshShowq(opts.options.target_master,
                         opts.options.target_user,
                         clusters,
                         cache_pickle=True,
                         dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, _, _) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s", active_users)
        logger.debug("Queue information: %s", queue_information)

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information, rest_client)
        (target_users, target_queue_information, user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user, rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq", (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point, ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s", user)
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    backend = opts.options.backend
    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # bail out early: falling through would leave storage_backend unbound
            logger.error("Backend %s not supported", backend)
            raise ValueError("Backend %s not supported" % backend)

        filesets = storage_backend.list_filesets()
        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "%s_inodes_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]).encode())  # gzip in 'wb' mode needs bytes
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9,
                                                 storage=backend)
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit",
                                filesystem, len(cfs))
            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets,
                        dry_run=opts.options.dry_run,
                        host_institute=opts.options.host_institute)
    except Exception:
        logger.exception("Failure obtaining %s inodes", backend)
        opts.critical("Failure to obtain %s inodes information" % backend)

    opts.epilogue("Logged %s inodes" % backend, stats)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]).encode())  # gzip in 'wb' mode needs bytes
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9)
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit",
                                filesystem, len(cfs))
            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)
    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """Main script"""

    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios-check-filename': ('filename of where the nagios check data is stored', str, 'store',
                                  NAGIOS_CHECK_FILENAME),
        'nagios-check-interval-threshold': ('threshold of nagios checks timing out', None, 'store',
                                            NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    logger.info('started GPFS quota check run.')

    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)

    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    lockfile = TimestampedPidLockfile(QUOTA_CHECK_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    try:
        user_id_map = map_uids_to_names()  # is this really necessary?
        LdapQuery(VscConfiguration())
        gpfs = GpfsOperations()

        filesystems = gpfs.list_filesystems().keys()
        logger.debug("Found the following GPFS filesystems: %s", filesystems)

        filesets = gpfs.list_filesets()
        logger.debug("Found the following GPFS filesets: %s", filesets)

        quota = gpfs.list_quota()

        for storage in opts.options.storage:
            logger.info("Processing quota for storage %s", storage)
            filesystem = opts.configfile_parser.get(storage, 'filesystem')

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s", filesystem)
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage %s [%s]", storage, filesystem)
                continue

            quota_storage_map = get_mmrepquota_maps(quota[filesystem], storage, filesystem, filesets)

            exceeding_filesets = process_fileset_quota(gpfs, storage, filesystem, quota_storage_map['FILESET'])
            exceeding_users = process_user_quota(gpfs, storage, filesystem, quota_storage_map['USR'], user_id_map)

            logger.warning("storage %s found %d filesets that are exceeding their quota: %s",
                           storage, len(exceeding_filesets), exceeding_filesets)
            logger.warning("storage %s found %d users who are exceeding their quota: %s",
                           storage, len(exceeding_users), exceeding_users)

            notify_exceeding_filesets(gpfs=gpfs, storage=storage, filesystem=filesystem,
                                      exceeding_items=exceeding_filesets, dry_run=opts.options.dry_run)
            notify_exceeding_users(gpfs=gpfs, storage=storage, filesystem=filesystem,
                                   exceeding_items=exceeding_users, dry_run=opts.options.dry_run)

        sys.exit(1)
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        if not opts.options.dry_run:
            nagios_reporter.cache(NAGIOS_EXIT_CRITICAL, NagiosResult("CRITICAL script failed - %s" % (err,)))
        if not opts.options.dry_run:
            lockfile.release()
        sys.exit(1)
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'account_page_url': ('', None, 'store', None),
        'target_master': ('the master used to execute checkjob commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {'master': master, 'path': checkjob_path}

        checkjob = SshCheckjob(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        (job_information, _, _) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s", active_users)
        logger.debug("Checkjob information: %s", job_information)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information, gpfs,
                              login_mount_point, gpfs_mount_point, ".checkjob.json.gz",
                              opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s", user)
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
class VscTier2AccountpageVo(VscAccountPageVo):
    """Class representing a VO in the VSC.

    A VO is a special kind of group, identified mainly by its name.
    """

    def __init__(self, vo_id, storage=None, rest_client=None, host_institute=GENT):
        """Initialise"""
        super(VscTier2AccountpageVo, self).__init__(vo_id, rest_client)

        self.vo_id = vo_id
        self.vsc = VSC()
        self.host_institute = host_institute

        if not storage:
            self.storage = VscStorage()
        else:
            self.storage = storage

        self.gpfs = GpfsOperations()
        self.posix = PosixOperations()

        self.dry_run = False

        self._vo_data_quota_cache = None
        self._vo_data_shared_quota_cache = None
        self._vo_scratch_quota_cache = None
        self._institute_quota_cache = None
        self._sharing_group_cache = None

    @property
    def _institute_quota(self):
        if not self._institute_quota_cache:
            all_quota = [mkVscVoSizeQuota(q) for q in
                         whenHTTPErrorRaise(self.rest_client.vo[self.vo.vsc_id].quota.get,
                                            "Could not get quota from accountpage for VO %s" % self.vo.vsc_id)[1]]
            self._institute_quota_cache = [q for q in all_quota if q.storage['institute'] == self.host_institute]
        return self._institute_quota_cache

    def _get_institute_data_quota(self):
        return [q for q in self._institute_quota if q.storage['storage_type'] == DATA_KEY]

    def _get_institute_non_shared_data_quota(self):
        return [q.hard for q in self._get_institute_data_quota()
                if not q.storage['name'].endswith(STORAGE_SHARED_SUFFIX)]

    def _get_institute_shared_data_quota(self):
        return [q.hard for q in self._get_institute_data_quota()
                if q.storage['name'].endswith(STORAGE_SHARED_SUFFIX)]

    @property
    def vo_data_quota(self):
        if not self._vo_data_quota_cache:
            self._vo_data_quota_cache = self._get_institute_non_shared_data_quota()
            if not self._vo_data_quota_cache:
                self._vo_data_quota_cache = [self.storage[VSC_DATA].quota_vo]

        return self._vo_data_quota_cache[0]  # there can be only one

    @property
    def vo_data_shared_quota(self):
        if not self._vo_data_shared_quota_cache:
            try:
                self._vo_data_shared_quota_cache = self._get_institute_shared_data_quota()[0]
            except IndexError:
                return None
        return self._vo_data_shared_quota_cache

    @property
    def vo_scratch_quota(self):
        if not self._vo_scratch_quota_cache:
            self._vo_scratch_quota_cache = [q for q in self._institute_quota
                                            if q.storage['storage_type'] == SCRATCH_KEY]
        return self._vo_scratch_quota_cache

    @property
    def sharing_group(self):
        if not self.data_sharing:
            return None

        if not self._sharing_group_cache:
            group_name = self.vo.vsc_id.replace(VO_PREFIX_BY_INSTITUTE[self.vo.institute['name']],
                                                VO_SHARED_PREFIX_BY_INSTITUTE[self.vo.institute['name']])
            self._sharing_group_cache = mkVscAutogroup(
                whenHTTPErrorRaise(self.rest_client.autogroup[group_name].get,
                                   "Could not get autogroup %s details" % group_name)[1])

        return self._sharing_group_cache

    @property
    def data_sharing(self):
        return self.vo_data_shared_quota is not None

    def members(self):
        """Return a list with all the VO members in it."""
        return self.vo.members

    def _get_path(self, storage, mount_point="gpfs"):
        """Get the path for the (if any) VO directory on the given storage."""
        (path, _) = self.storage.path_templates[self.host_institute][storage]['vo'](self.vo.vsc_id)
        if mount_point == "login":
            mount_path = self.storage[self.host_institute][storage].login_mount_point
        elif mount_point == "gpfs":
            mount_path = self.storage[self.host_institute][storage].gpfs_mount_point
        else:
            logging.error("mount_point (%s) is not login or gpfs", mount_point)
            raise Exception("mount_point (%s) is not designated as gpfs or login" % (mount_point,))

        return os.path.join(mount_path, path)

    def _data_path(self, mount_point="gpfs"):
        """Return the path to the VO data fileset on GPFS"""
        return self._get_path(VSC_DATA, mount_point)

    def _data_shared_path(self, mount_point="gpfs"):
        """Return the path to the VO shared data fileset on GPFS"""
        return self._get_path(VSC_DATA_SHARED, mount_point)

    def _scratch_path(self, storage, mount_point="gpfs"):
        """Return the path to the VO scratch fileset on GPFS.

        @type storage: string
        @param storage: name of the storage we are looking at.
        """
        return self._get_path(storage, mount_point)

    def _create_fileset(self, filesystem_name, path, parent_fileset=None, fileset_name=None, group_owner_id=None):
        """Create a fileset for the VO on the data filesystem.

        - creates the fileset if it does not already exist
        - sets ownership to the first (active) VO moderator, or to nobody if there is no moderator
        - sets group ownership to the supplied value (group_owner_id) or, if that is missing,
          to the vsc_id of the VO owning the fileset

        The parent_fileset is used to support older (< 3.5.x) GPFS setups still present in our system
        """
        self.gpfs.list_filesets()
        if not fileset_name:
            fileset_name = self.vo.vsc_id

        if group_owner_id:
            fileset_group_owner_id = group_owner_id
        else:
            fileset_group_owner_id = self.vo.vsc_id_number

        if not self.gpfs.get_fileset_info(filesystem_name, fileset_name):
            logging.info("Creating new fileset on %s with name %s and path %s",
                         filesystem_name, fileset_name, path)
            base_dir_hierarchy = os.path.dirname(path)
            self.gpfs.make_dir(base_dir_hierarchy)

            # HACK to support versions older than 3.5 in our setup
            if parent_fileset is None:
                self.gpfs.make_fileset(path, fileset_name)
            else:
                self.gpfs.make_fileset(path, fileset_name, parent_fileset)
        else:
            logging.info("Fileset %s already exists for VO %s ... not creating again.",
                         fileset_name, self.vo.vsc_id)

        self.gpfs.chmod(0o770, path)

        try:
            moderator = mkVscAccount(self.rest_client.account[self.vo.moderators[0]].get()[1])
        except HTTPError:
            logging.exception("Cannot obtain moderator information from account page, setting ownership to nobody")
            self.gpfs.chown(pwd.getpwnam('nobody').pw_uid, fileset_group_owner_id, path)
        except IndexError:
            logging.error("There is no moderator available for VO %s", self.vo.vsc_id)
            self.gpfs.chown(pwd.getpwnam('nobody').pw_uid, fileset_group_owner_id, path)
        else:
            self.gpfs.chown(moderator.vsc_id_number, fileset_group_owner_id, path)

    def create_data_fileset(self):
        """Create the VO's directory on the HPC data filesystem. Always set the quota."""
        path = self._data_path()
        try:
            fs = self.storage[self.host_institute][VSC_DATA].filesystem
        except AttributeError:
            logging.exception("Trying to access non-existent attribute 'filesystem' in the data storage instance")
            raise  # continuing would hit an unbound 'fs' below
        except KeyError:
            logging.exception("Trying to access non-existent field %s in the data storage dictionary", VSC_DATA)
            raise  # continuing would hit an unbound 'fs' below
        self._create_fileset(fs, path)

    def create_data_shared_fileset(self):
        """Create a VO directory for sharing data on the HPC data filesystem. Always set the quota."""
        path = self._data_shared_path()
        msg = "Trying to access non-existent"
        try:
            fs = self.storage[self.host_institute][VSC_DATA_SHARED].filesystem
        except AttributeError:
            logging.exception("%s attribute 'filesystem' in the shared data storage instance", msg)
            raise  # continuing would hit an unbound 'fs' below
        except KeyError:
            logging.exception("%s field %s in the shared data storage dictionary", msg, VSC_DATA_SHARED)
            raise  # continuing would hit an unbound 'fs' below
        self._create_fileset(fs,
                             path,
                             fileset_name=self.sharing_group.vsc_id,
                             group_owner_id=self.sharing_group.vsc_id_number)

    def create_scratch_fileset(self, storage_name):
        """Create the VO's directory on the HPC scratch filesystem. Always set the quota."""
        msg = "Trying to access non-existent"
        try:
            path = self._scratch_path(storage_name)
            if self.storage[self.host_institute][storage_name].version >= (3, 5, 0, 0):
                self._create_fileset(self.storage[self.host_institute][storage_name].filesystem, path)
            else:
                self._create_fileset(self.storage[self.host_institute][storage_name].filesystem, path, 'root')
        except AttributeError:
            logging.exception("%s attribute 'filesystem' in the scratch storage instance", msg)
        except KeyError:
            logging.exception("%s field %s in the scratch storage dictionary", msg, storage_name)

    def _create_vo_dir(self, path):
        """Create a user owned directory on the GPFS."""
        self.gpfs.make_dir(path)

    def _set_quota(self, storage_name, path, quota, fileset_name=None):
        """Set FILESET quota on the FS for the VO fileset.

        @type quota: int
        @param quota: quota limit expressed in KiB
        """
        if not fileset_name:
            fileset_name = self.vo.vsc_id

        try:
            # expressed in bytes, retrieved in KiB from the backend
            hard = quota * 1024 * self.storage[self.host_institute][storage_name].data_replication_factor
            soft = int(hard * self.vsc.quota_soft_fraction)

            # LDAP information is expressed in KiB, GPFS wants bytes.
            self.gpfs.set_fileset_quota(soft, path, fileset_name, hard)
            self.gpfs.set_fileset_grace(path, self.vsc.vo_storage_grace_time)  # 7 days
        except GpfsOperationError:
            logging.exception("Unable to set quota on path %s", path)
            raise

    def set_data_quota(self):
        """Set FILESET quota on the data FS for the VO fileset."""
        if self.vo_data_quota:
            self._set_quota(VSC_DATA, self._data_path(), int(self.vo_data_quota))
        else:
            self._set_quota(VSC_DATA, self._data_path(), 16 * 1024)

    def set_data_shared_quota(self):
        """Set FILESET quota on the shared data FS for the VO fileset."""
        if self.vo_data_shared_quota:
            self._set_quota(
                VSC_DATA_SHARED,
                self._data_shared_path(),
                int(self.vo_data_shared_quota),
                fileset_name=self.vo.vsc_id.replace(
                    VO_PREFIX_BY_INSTITUTE[self.vo.institute["name"]],
                    VO_SHARED_PREFIX_BY_INSTITUTE[self.vo.institute["name"]],
                ),
            )

    def set_scratch_quota(self, storage_name):
        """Set FILESET quota on the scratch FS for the VO fileset."""
        quota = [q for q in self.vo_scratch_quota if q.storage['name'] in (storage_name,)]

        if not quota:
            logging.error("No VO %s scratch quota information available for %s", self.vo.vsc_id, storage_name)
            logging.info("Setting default VO %s scratch quota on storage %s to %d",
                         self.vo.vsc_id, storage_name, self.storage[storage_name].quota_vo)
            self._set_quota(storage_name, self._scratch_path(storage_name), self.storage[storage_name].quota_vo)
            return
        elif len(quota) > 1:
            # logging.exception is reserved for except blocks; log an error and raise explicitly
            logging.error("Cannot set scratch quota for %s with multiple quota instances %s",
                          storage_name, quota)
            raise ValueError("Multiple scratch quota instances for %s" % storage_name)

        logging.info("Setting VO %s quota on storage %s to %d", self.vo.vsc_id, storage_name, quota[0].hard)
        self._set_quota(storage_name, self._scratch_path(storage_name), quota[0].hard)

    def _set_member_quota(self, storage_name, path, member, quota):
        """Set USER quota on the FS for the VO fileset

        @type member: VscTier2AccountpageUser
        @type quota: integer (hard value)
        """
        try:
            hard = quota * 1024 * self.storage[self.host_institute][storage_name].data_replication_factor
            soft = int(hard * self.vsc.quota_soft_fraction)

            self.gpfs.set_user_quota(soft=soft, user=int(member.account.vsc_id_number), obj=path, hard=hard)
        except GpfsOperationError:
            logging.exception("Unable to set USR quota for member %s on path %s", member.account.vsc_id, path)
            raise

    def set_member_data_quota(self, member):
        """Set the quota on the data FS for the member in the VO fileset.

        @type member: VscTier2AccountpageUser instance

        The user can have up to half of the VO quota.
        FIXME: This should probably be some variable in a config setting instance
        """
        if not self.vo_data_quota:
            logging.warning("Not setting VO %s member %s data quota: no VO data quota info available",
                            self.vo.vsc_id, member.account.vsc_id)
            return

        if self.vo.vsc_id in DEFAULT_VOS_ALL:
            logging.warning("Not setting VO %s member %s data quota: No VO member quota for this VO",
                            member.account.vsc_id, self.vo.vsc_id)
            return

        if member.vo_data_quota:
            # users having belonged to multiple VOs have multiple quota on VSC_DATA, so we
            # only need to deploy the quota for the VO the user currently belongs to.
            quota = [q for q in member.vo_data_quota
                     if q.fileset == self.vo.vsc_id and not q.storage['name'].endswith(STORAGE_SHARED_SUFFIX)]
            if len(quota) > 1:
                # logging.exception is reserved for except blocks; log an error and raise explicitly
                logging.error("Cannot set data quota for member %s with multiple quota instances %s",
                              member, quota)
                raise ValueError("Multiple data quota instances for member %s" % member)
            else:
                logging.info("Setting the data quota for VO %s member %s to %d KiB",
                             self.vo.vsc_id, member.account.vsc_id, quota[0].hard)
                self._set_member_quota(VSC_DATA, self._data_path(), member, quota[0].hard)
        else:
            logging.error("No VO %s data quota set for member %s", self.vo.vsc_id, member.account.vsc_id)

    def set_member_scratch_quota(self, storage_name, member):
        """Set the quota on the scratch FS for the member in the VO fileset.

        @type member: VscTier2AccountpageUser instance

        The user can have up to half of the VO quota.
        FIXME: This should probably be some variable in a config setting instance
        """
        if not self.vo_scratch_quota:
            logging.warning("Not setting VO %s member %s scratch quota: no VO quota info available",
                            self.vo.vsc_id, member.account.vsc_id)
            return

        if self.vo.vsc_id in DEFAULT_VOS_ALL:
            logging.warning("Not setting VO %s member %s scratch quota: No VO member quota for this VO",
                            member.account.vsc_id, self.vo.vsc_id)
            return

        if member.vo_scratch_quota:
            quota = [q for q in member.vo_scratch_quota
                     if q.storage['name'] in (storage_name,) and q.fileset in (self.vo_id,)]
            if quota:
                logging.info("Setting the scratch quota for VO %s member %s to %d GiB on %s",
                             self.vo.vsc_id, member.account.vsc_id, quota[0].hard // 1024 // 1024, storage_name)
                self._set_member_quota(storage_name, self._scratch_path(storage_name), member, quota[0].hard)
            else:
                logging.error("No VO %s scratch quota for member %s on %s after filter (all %s)",
                              self.vo.vsc_id, member.account.vsc_id, storage_name, member.vo_scratch_quota)
        else:
            logging.error("No VO %s scratch quota set for member %s on %s",
                          self.vo.vsc_id, member.account.vsc_id, storage_name)

    def _create_member_dir(self, member, target):
        """Create a member-owned directory in the VO fileset."""
        self.gpfs.create_stat_directory(
            target,
            0o700,
            int(member.account.vsc_id_number),
            int(member.usergroup.vsc_id_number),
            # we should not override permissions on an existing dir where users may have changed them
            override_permissions=False
        )

    def create_member_data_dir(self, member):
        """Create a directory on data in the VO fileset that is owned
        by the member with name $VSC_DATA_VO/<vscid>."""
        target = os.path.join(self._data_path(), member.user_id)
        self._create_member_dir(member, target)

    def create_member_scratch_dir(self, storage_name, member):
        """Create a directory on scratch in the VO fileset that is owned
        by the member with name $VSC_SCRATCH_VO/<vscid>."""
        target = os.path.join(self._scratch_path(storage_name), member.user_id)
        self._create_member_dir(member, target)

    def __setattr__(self, name, value):
        """Override the setting of an attribute:

        - dry_run: set this here and in the gpfs and posix instance fields.
        - otherwise, call super's __setattr__()
        """
        if name == 'dry_run':
            self.gpfs.dry_run = value
            self.posix.dry_run = value
        super(VscTier2AccountpageVo, self).__setattr__(name, value)
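# A hedged usage sketch (not from the original source) for VscTier2AccountpageVo,
# mirroring the user class above. The import path is an assumption based on the
# VSC tooling; the token and VO ID are hypothetical.
from vsc.accountpage.client import AccountpageClient  # assumed import path

client = AccountpageClient(token="<OAuth2 token>")  # hypothetical token
vo = VscTier2AccountpageVo("gvo00002", rest_client=client)  # hypothetical VO ID
vo.dry_run = True  # propagated to the gpfs and posix handles via __setattr__ above
vo.create_data_fileset()
vo.set_data_quota()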
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None, 'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store',
                             'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s", filesystems)

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s", filesets)

        quota = gpfs.list_quota(devices=target_filesystems)
        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:
            logger.info("Processing quota for storage_name %s", storage_name)
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s", filesystem)
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]", storage_name, filesystem)
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem],
                storage_name,
                filesystem,
                filesets,
                replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage,
                gpfs,
                storage_name,
                filesystem,
                quota_storage_map['FILESET'],
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            exceeding_users[storage_name] = process_user_quota(
                storage,
                gpfs,
                storage_name,
                None,
                quota_storage_map['USR'],
                user_id_map,
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            stats["%s_fileset_critical" % (storage_name,)] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name,)] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota",
                               storage_name, len(exceeding_filesets[storage_name]))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s", e_fileset, str(e_quota))
            else:
                stats["%s_fileset" % (storage_name,)] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota", storage_name)

            stats["%s_users_warning" % (storage_name,)] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name,)] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name,)] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota",
                               storage_name, len(exceeding_users[storage_name]))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s", e_user_id, str(e_quota))
            else:
                stats["%s_users" % (storage_name,)] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota", storage_name)
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")

    opts.epilogue("quota check completed", stats)