Esempio n. 1
0
    def __init__(self,
                 vo_id,
                 storage=None,
                 rest_client=None,
                 host_institute=GENT):
        """
        Set up the VO wrapper together with the helper objects it works with.

        @param vo_id: identifier of the VO; handed to the parent class and kept on the instance
        @param storage: storage description to use; a new VscStorage is created when this is falsy
        @param rest_client: client that is passed on to the parent class constructor
        @param host_institute: institute kept on the instance (GENT unless specified)
        """
        super(VscTier2AccountpageVo, self).__init__(vo_id, rest_client)

        self.vo_id = vo_id
        self.vsc = VSC()
        self.host_institute = host_institute

        # Any falsy value (not only None) is replaced by a default VscStorage.
        self.storage = storage if storage else VscStorage()

        self.gpfs = GpfsOperations()
        self.posix = PosixOperations()

        # Nothing is cached at construction time.
        self._sharing_group_cache = None
        self._institute_quota_cache = None
        self._vo_scratch_quota_cache = None
        self._vo_data_shared_quota_cache = None
        self._vo_data_quota_cache = None

        self.dry_run = False
def main():
    """Set up every storage listed in the local quota configuration file.

    The comma-separated ``storage`` option of the ``MAIN`` section in
    QUOTA_CONF_FILE names the storages to handle. For ``VSC_HOME`` both
    ``set_up_filesystem`` and ``set_up_apps`` are called; every other storage
    is set up through ``set_up_filesystem`` with ``vo_support=True``.
    """

    storage_settings = VscStorage()

    # SafeConfigParser no longer exists in Python 3.12+; use it where it is
    # still available and fall back to ConfigParser otherwise.
    parser_class = getattr(configparser, 'SafeConfigParser',
                           configparser.ConfigParser)
    local_storage_conf = parser_class()
    local_storage_conf.read(QUOTA_CONF_FILE)

    gpfs = GpfsOperations()
    gpfs.list_filesystems()
    gpfs.list_filesets()

    for storage_name in local_storage_conf.get('MAIN', 'storage').split(','):

        filesystem_name = storage_settings[storage_name].filesystem
        filesystem_info = gpfs.get_filesystem_info(filesystem_name)

        # Compare for equality: `in ('VSC_HOME')` is a substring test on a
        # plain string (the parentheses do not make a tuple), so names such as
        # 'VSC' or 'HOME' would have matched as well.
        if storage_name == 'VSC_HOME':
            set_up_filesystem(gpfs, storage_settings, storage_name,
                              filesystem_info, filesystem_name)
            set_up_apps(gpfs, storage_settings, storage_name, filesystem_info,
                        filesystem_name)
        else:
            set_up_filesystem(gpfs,
                              storage_settings,
                              storage_name,
                              filesystem_info,
                              filesystem_name,
                              vo_support=True)
Esempio n. 3
0
def main():
    """Store a gzipped JSON dump of the quota information per filesystem.

    The ``backend`` option selects the storage backend (``gpfs`` or
    ``lustre``). For every key returned by the backend's ``list_quota()`` one
    file is written below the directory given by the ``location`` option. The
    outcome per key (0 on success, 1 on failure) is collected in the stats
    that are passed to ``opts.epilogue``.
    """

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    backend = opts.options.backend
    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # Raise a meaningful error instead of logging and then tripping
            # over the unbound storage_backend name a few lines further down.
            # The handler at the bottom of this function reports it.
            raise ValueError("Backend %s not supported" % backend)

        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key, )] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "%s_quota_%s_%s.gz" % (
                    backend, time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                # The context manager closes the file even if the write fails.
                with gzip.open(path, 'wb', 9) as gz_file:  # Compress to the max
                    gz_file.write(json.dumps(quota[key]).encode())
                stats["%s_quota_log" % (key, )] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                # A failure for one filesystem must not prevent the others
                # from being stored; it is flagged through the stats instead.
                stats["%s_quota_log" % (key, )] = 1
                logger.exception("Failed storing quota information for FS %s",
                                 key)
    except Exception:
        logger.exception("Failure obtaining %s quota", backend)
        opts.critical("Failure to obtain %s quota information" % backend)

    opts.epilogue("Logged %s quota" % backend, stats)
def main():
    """Store a gzipped JSON dump of the GPFS quota information per filesystem.

    For every key returned by ``GpfsOperations().list_quota()`` one file is
    written below the directory given by the ``location`` option. The outcome
    per key (0 on success, 1 on failure) is collected in the stats that are
    passed to ``opts.epilogue``. When the quota cannot be obtained at all the
    script exits with NAGIOS_EXIT_CRITICAL.
    """

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            # 0o755 is accepted by Python 2.6+ and Python 3; the old 0755
            # spelling is a syntax error on Python 3.
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key, )] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (
                    time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                # The context manager closes the file even if the write fails.
                with gzip.open(path, 'wb', 9) as gz_file:  # Compress to the max
                    # The file is opened in binary mode, so hand it bytes.
                    gz_file.write(json.dumps(quota[key]).encode())
                stats["%s_quota_log" % (key, )] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                # A failure for one filesystem must not prevent the others
                # from being stored; it is flagged through the stats instead.
                stats["%s_quota_log" % (key, )] = 1
                logger.exception("Failed storing quota information for FS %s",
                                 key)
    except Exception:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS quota", stats)
Esempio n. 5
0
    def __init__(self,
                 user_id,
                 storage=None,
                 pickle_storage=None,
                 rest_client=None,
                 account=None,
                 pubkeys=None,
                 host_institute=None,
                 use_user_cache=False):
        """
        Initialisation.

        @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
        @param storage: storage description; a new VscStorage is created when None
        @param pickle_storage: name of the pickle storage; when None the default for the host institute is used
        @param rest_client: client that is passed on to the parent class constructor
        @param account: passed on to the parent class constructor
        @param pubkeys: passed on to the parent class constructor
        @param host_institute: institute this user is handled for; GENT when None
        @param use_user_cache: passed on to the parent class constructor
        """
        parent = super(VscTier2AccountpageUser, self)
        parent.__init__(user_id,
                        rest_client,
                        account=account,
                        pubkeys=pubkeys,
                        use_user_cache=use_user_cache)

        # Move to vsc-config?
        default_pickle_storage = {
            GENT: VSC_SCRATCH_KYUKON,
            BRUSSEL: VSC_SCRATCH_THEIA,
        }

        institute = GENT if host_institute is None else host_institute
        self.host_institute = institute

        # Only consult the per-institute defaults when the caller made no choice.
        if pickle_storage is None:
            self.pickle_storage = default_pickle_storage[institute]
        else:
            self.pickle_storage = pickle_storage

        vsc_storage = VscStorage() if storage is None else storage

        # Keep only the parts of the storage description that belong to the host institute.
        self.institute_path_templates = vsc_storage.path_templates[
            self.host_institute]
        self.institute_storage = vsc_storage[self.host_institute]

        self.vsc = VSC()
        self.gpfs = GpfsOperations()  # Only used when needed
        self.posix = PosixOperations()
Esempio n. 6
0
def main():
    """
    Main script.

    Calls set_up_filesystem (with VO support) for every storage system passed
    through the ``storage`` option. Any exception is logged, reported as
    critical and ends the script with NAGIOS_EXIT_CRITICAL; otherwise the
    (empty) stats are handed to the epilogue.
    """

    storage_option = ('storage systems on which to deploy users and vos', None,
                      'extend', [])
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': storage_option,
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        vsc_storage = VscStorage()
        gpfs_ops = GpfsOperations()
        gpfs_ops.list_filesystems()
        gpfs_ops.list_filesets()

        for name in opts.options.storage:
            fs_name = vsc_storage[name].filesystem
            fs_info = gpfs_ops.get_filesystem_info(fs_name)

            set_up_filesystem(gpfs_ops, vsc_storage, name, fs_info, fs_name,
                              vo_support=True,
                              dry_run=opts.options.dry_run)

    except Exception as exc:
        logging.exception("critical exception caught: %s", exc)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
Esempio n. 7
0
    def __init__(self, user_id, storage=None, pickle_storage='VSC_SCRATCH_KYUKON', rest_client=None,
                 account=None, pubkeys=None, host_institute=None, use_user_cache=False):
        """
        Initialisation.

        @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
        @param storage: storage description; a new VscStorage is created when this is falsy
        @param pickle_storage: name of the pickle storage kept on the instance
        @param rest_client: client that is passed on to the parent class constructor
        @param account: passed on to the parent class constructor
        @param pubkeys: passed on to the parent class constructor
        @param host_institute: institute kept on the instance (may be None)
        @param use_user_cache: passed on to the parent class constructor
        """
        parent = super(VscTier2AccountpageUser, self)
        parent.__init__(user_id, rest_client, account=account, pubkeys=pubkeys,
                        use_user_cache=use_user_cache)

        self.host_institute = host_institute
        self.pickle_storage = pickle_storage

        # Any falsy value (not only None) is replaced by a default VscStorage.
        self.storage = storage if storage else VscStorage()

        self.vsc = VSC()
        self.gpfs = GpfsOperations()  # Only used when needed
        self.posix = PosixOperations()
Esempio n. 8
0
def main():
    """Check the quota usage on the requested storage systems.

    For every storage passed through the ``storage`` option the GPFS quota
    information is split per fileset and per user (``get_mmrepquota_maps``)
    and handed to ``process_fileset_quota`` and ``process_user_quota``. The
    results are logged and summarised in the stats given to ``opts.epilogue``:
    a 0/1 flag for exceeding filesets and a count for exceeding users.
    Storages whose filesystem is unknown to GPFS or has no quota information
    are skipped with an error message.
    """

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script',
                    None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None,
                        'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store',
                             'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API',
                         None, 'store', None),
        'host_institute':
        ('Name of the institute where this script is being run', str, 'store',
         GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    # Bound before the try block so the epilogue at the bottom always has
    # something to report, also when the set-up code below raises early.
    stats = {}

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [
            storage[s].filesystem for s in opts.options.storage
        ]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s", filesystems)

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s", filesets)

        quota = gpfs.list_quota(devices=target_filesystems)
        exceeding_filesets = {}
        exceeding_users = {}

        for storage_name in opts.options.storage:

            logger.info("Processing quota for storage_name %s", storage_name)
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s", filesystem)
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]",
                             storage_name, filesystem)
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem],
                storage_name,
                filesystem,
                filesets,
                replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage,
                gpfs,
                storage_name,
                filesystem,
                quota_storage_map['FILESET'],
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            exceeding_users[storage_name] = process_user_quota(
                storage,
                gpfs,
                storage_name,
                None,
                quota_storage_map['USR'],
                user_id_map,
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            stats["%s_fileset_critical" %
                  (storage_name, )] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name, )] = 1
                # Count the filesets of this storage only; the length of the
                # enclosing dict is the number of storages handled so far.
                logger.warning(
                    "storage_name %s found %d filesets that are exceeding their quota",
                    storage_name, len(exceeding_filesets[storage_name]))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s", e_fileset, str(e_quota))
            else:
                stats["%s_fileset" % (storage_name, )] = 0
                logger.debug(
                    "storage_name %s found no filesets that are exceeding their quota",
                    storage_name)

            stats["%s_users_warning" % (storage_name, )] = QUOTA_USERS_WARNING
            stats["%s_users_critical" %
                  (storage_name, )] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name, )] = len(
                    exceeding_users[storage_name])
                logger.warning(
                    "storage_name %s found %d users who are exceeding their quota",
                    storage_name, len(exceeding_users[storage_name]))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s", e_user_id, str(e_quota))
            else:
                stats["%s_users" % (storage_name, )] = 0
                logger.debug(
                    "storage_name %s found no users who are exceeding their quota",
                    storage_name)

    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")

    opts.epilogue("quota check completed", stats)
def main():
    """Store the GPFS fileset (inode) information and flag filesets near their limit.

    For every filesystem returned by ``list_filesets()`` a gzipped JSON dump is
    written below the directory given by the ``location`` option, after which
    ``process_inodes_information`` is run with a threshold of 0.9 on the
    fileset and fileset-quota data. Filesystems for which that call returns a
    non-empty result are collected and passed to ``mail_admins``. The outcome
    per filesystem (0 on success, 1 on failure) ends up in the stats handed to
    ``opts.epilogue``.
    """

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" %
                  (filesystem, )] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (
                    time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # The context manager closes the file even if the write fails.
                with gzip.open(path, 'wb', 9) as gz_file:  # Compress to the max
                    # The file is opened in binary mode, so hand it bytes.
                    gz_file.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem, )] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9)
                logger.info("Processed inodes information for filesystem %s",
                            filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info(
                        "Filesystem %s has at least %d filesets reaching the limit",
                        filesystem, len(cfs))

            except Exception:
                # A failure for one filesystem must not prevent the others
                # from being handled; it is flagged through the stats instead.
                stats["%s_inodes_log" % (filesystem, )] = 1
                logger.exception(
                    "Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)

        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")

    opts.epilogue("Logged GPFS inodes", stats)
Esempio n. 10
0
def main():
    """Collect checkjob information and store it per active user.

    The job information is obtained through ``SshCheckjob`` for the clusters
    named in the ``hosts`` option (master and checkjob path are read from the
    configuration file). For every user that appears in that information the
    user's own part is written with ``store_on_gpfs``. The number of stored
    and failed users is collected in ``stats``. Any other exception is logged,
    reported as critical and ends the script with NAGIOS_EXIT_CRITICAL.
    """
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk',
                     str, 'store', 'delcatty'),
        'access_token':
        ('the token that will allow authentication against the account page',
         None, 'store', None),
        'account_page_url': ('', None, 'store', None),
        'target_master':
        ('the master used to execute showq commands', None, 'store', None),
        'target_user':
        ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {'master': master, 'path': checkjob_path}

        checkjob = SshCheckjob(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        (job_information, _, _) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s", active_users)
        logger.debug("Checkjob information: %s", job_information)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo(
                    {user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information,
                              gpfs, login_mount_point, gpfs_mount_point,
                              ".checkjob.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                # One user failing must not stop the others; count it instead.
                logger.exception("Could not store cache file for user %s",
                                 user)
                nagios_no_store += 1
        # NOTE(review): stats is filled here but not reported anywhere in the
        # visible part of this function -- confirm whether an epilogue call
        # is missing.
        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        # "except Exception, err" is Python 2-only syntax; the "as" form is
        # accepted by Python 2.6+ and Python 3.
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 11
0
def main():
    """Collect showq information and store it per target user.

    The queue information is obtained through ``SshShowq`` for the clusters
    named in the ``hosts`` option (master and showq path are read from the
    configuration file). ``determine_target_information`` decides, based on
    the ``information`` option, which users get a file and what they get to
    see; each of those is written with ``store_on_gpfs``. The number of stored
    and failed users is collected in ``stats``. Any other exception is logged,
    reported as critical and ends the script with NAGIOS_EXIT_CRITICAL.
    """
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project',
                        None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk',
                     str, 'store', 'delcatty'),
        'account_page_url':
        ('the URL at which the account page resides', None, 'store', None),
        'access_token':
        ('the token that will allow authentication against the account page',
         None, 'store', None),
        'target_master':
        ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store',
                        None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {'master': master, 'path': showq_path}

        logger.debug("clusters = %s", clusters)
        showq = SshShowq(opts.options.target_master,
                         opts.options.target_user,
                         clusters,
                         cache_pickle=True,
                         dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, _, _) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s", active_users)
        logger.debug("Queue information: %s", queue_information)

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information,
               rest_client)
        (target_users, target_queue_information,
         user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user,
                                       rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq",
                              (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point,
                              ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                # One user failing must not stop the others; count it instead.
                # logger.exception keeps the traceback of the failure.
                logger.exception("Could not store pickle file for user %s",
                                 user)
                nagios_no_store += 1

        # NOTE(review): stats is filled here but not reported anywhere in the
        # visible part of this function -- confirm whether an epilogue call
        # is missing.
        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        # "except Exception, err" is Python 2-only syntax; the "as" form is
        # accepted by Python 2.6+ and Python 3.
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 12
0
def main():
    """Store the fileset (inode) information and flag filesets near their limit.

    The ``backend`` option selects the storage backend (``gpfs`` or
    ``lustre``). For every filesystem returned by the backend's
    ``list_filesets()`` a gzipped JSON dump is written below the directory
    given by the ``location`` option, after which
    ``process_inodes_information`` is run with a threshold of 0.9 on the
    fileset and fileset-quota data. Filesystems for which that call returns a
    non-empty result are collected and passed to ``mail_admins``. The outcome
    per filesystem (0 on success, 1 on failure) ends up in the stats handed to
    ``opts.epilogue``.
    """

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
        'host_institute':
        ('Name of the institute where this script is being run', str, 'store',
         GENT),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    backend = opts.options.backend
    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # Raise a meaningful error instead of logging and then tripping
            # over the unbound storage_backend name a few lines further down.
            # The handler at the bottom of this function reports it.
            raise ValueError("Backend %s not supported" % backend)

        filesets = storage_backend.list_filesets()
        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" %
                  (filesystem, )] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "%s_inodes_%s_%s.gz" % (
                    backend, time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # The context manager closes the file even if the write fails.
                with gzip.open(path, 'wb', 9) as gz_file:  # Compress to the max
                    # The file is opened in binary mode, so hand it bytes.
                    gz_file.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem, )] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9,
                                                 storage=backend)
                logger.info("Processed inodes information for filesystem %s",
                            filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info(
                        "Filesystem %s has at least %d filesets reaching the limit",
                        filesystem, len(cfs))

            except Exception:
                # A failure for one filesystem must not prevent the others
                # from being handled; it is flagged through the stats instead.
                stats["%s_inodes_log" % (filesystem, )] = 1
                logger.exception(
                    "Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)

        if critical_filesets:
            mail_admins(critical_filesets,
                        dry_run=opts.options.dry_run,
                        host_institute=opts.options.host_institute)

    except Exception:
        logger.exception("Failure obtaining %s inodes", backend)
        opts.critical("Failure to obtain %s inodes information" % backend)

    opts.epilogue("Logged %s inodes" % backend, stats)