def main(args):
    """Main script."""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        vsc_config = VscConfiguration()
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, opts.options.dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, opts.options.dry_run)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 2
0
def main():
    """Main function"""
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'cache':
        ('the location to store the cache with previous release hold data',
         None, 'store', RELEASEJOB_CACHE_FILE)
    }

    opts = ExtendedSimpleOption(options)

    try:
        # parse config file
        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            mjobctl_path = opts.configfile_parser.get(host, "mjobctl_path")
            clusters[host] = {
                'master': master,
                'spath': showq_path,
                'mpath': mjobctl_path,
            }

        # process the new and previous data
        released_jobids, stats = process_hold(clusters,
                                              dry_run=opts.options.dry_run)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 3
0
def main():
    """Like, the main."""

    options = {
        'check':
        ('Gather information for a nagios check', None, 'store_true', False),
        'detailed':
        ('Report detailed information', None, 'store_true', False, 'D'),
        'groups': ('Report for groups', None, "extend", [], 'g'),
        'users': ('Report for users', None, "extend", [], 'u'),
        'show':
        ('Show details: %s' % ','.join(SHOW_LIST), "strlist", "store", None),
        'jobs': ("Jobid(s)", "strlist", "store", None),
    }

    global go
    go = ExtendedSimpleOption(options)

    try:

        if go.options.show:
            show_individual(
            )  # does not need to affect the cached nagios result?
            sys.exit(0)
        else:
            msg = show_summary()

    except Exception, err:
        go.log.exception("critical exception caught: %s" % err)
        go.critical("Script failed in a horrible way: %s" % err)
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 4
0
def main():
    """Like, the main."""

    options = {
        'check': ('Gather information for a nagios check', None, 'store_true', False),
        'detailed': ('Report detailed information', None, 'store_true', False, 'D'),
        'groups': ('Report for groups', None, "extend", [], 'g'),
        'users': ('Report for users', None, "extend", [], 'u'),
        'show': ('Show details: %s' % ','.join(SHOW_LIST), "strlist", "store", None),
        'jobs': ("Jobid(s)", "strlist", "store", None),
    }

    global go
    go = ExtendedSimpleOption(options)

    try:

        if go.options.show:
            show_individual()  # does not need to affect the cached nagios result?
            sys.exit(0)
        else:
            msg = show_summary()

    except Exception, err:
        go.log.exception("critical exception caught: %s" % err)
        go.critical("Script failed in a horrible way: %s" % err)
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 5
0
def main():
    """Main function"""
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'cache': ('the location to store the cache with previous release hold data', None, 'store',
                  RELEASEJOB_CACHE_FILE)
    }

    opts = ExtendedSimpleOption(options)

    try:
        # parse config file
        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            mjobctl_path = opts.configfile_parser.get(host, "mjobctl_path")
            clusters[host] = {
                'master': master,
                'spath': showq_path,
                'mpath': mjobctl_path,
            }

        # process the new and previous data
        released_jobids, stats = process_hold(clusters, dry_run=opts.options.dry_run)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 6
0
def main():
    """Main script."""

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report':
        ('mail a report to the hpc-admin list with job list for gracing or inactive users',
         None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        vsc_config = VscConfiguration(VSC_CONF_DEFAULT_FILENAME)
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 7
0
def main():
    """The script"""

    options = {
        'detailed': ('Report detailed information', None, 'store_true', False, 'D'),
        'moabxml': ('File containing moab XML data (only for testing)', None, 'store', None),
        'max-retries': ('Maximum number retries prior to going critical', 'int', 'store', 2),
        'retry-interval': ('Seconds in between retries', 'int', 'store', 60),
    }

    opts = ExtendedSimpleOption(options)

    msg = "show_stats completed (%d tries)"
    try:
        if opts.options.moabxml:
            try:
                moabxml = open(opts.options.moabxml).read()
            except (IOError, OSError):
                logger.raiseException('Failed to read moab xml from %s' % opts.options.moabxml)
        else:
            moabxml = None

        retry = 0
        for retry in xrange(0, opts.options.max_retries):
            moab_stats = showstats(xml=moabxml)
            if moab_stats:
                break
            else:
                logger.info("Sleeping after retry %d" % (retry + 1,))
                time.sleep(opts.options.retry_interval)

        if not moab_stats:
            logger.error("Moabs showstats dit not provide useful output after %d, likely timed out." % (retry + 1,))
            opts.critical("Moabs showstats failed running correctly (%d retries)" % (retry + 1,))
            sys.exit(NAGIOS_EXIT_CRITICAL)

        else:
            stats = moab_stats['summary']

            if opts.options.detailed:
                detailed_info_string = """Shortterm/Longterm efficiency %.3f/%.3f
Dedicate/total prochours %s/%s
Active/Total procs %s/%s""" % (stats['STE'], stats['LTE'],
                               stats['DPH'], stats['TPH'],
                               stats['CAP'], stats['CTP'],)
                logger.info("detailed result STE = %s LTE = %s DPH = %s TPH = %s CAP = %s CTP = %s" %
                            (stats['STE'], stats['LTE'],
                             stats['DPH'], stats['TPH'],
                             stats['CAP'], stats['CTP'],))
                print detailed_info_string

            info_string = "short %.3f long %.3f" % (stats['STE'], stats['LTE'])
            logger.info("result: %s" % (info_string,))
            msg = msg % (retry + 1,)
            msg += " %s" % (info_string,)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 8
0
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s" % (filesystem))

                cfs = process_inodes_information(filesets[filesystem])
                logger.info("Processed inodes information for filesystem %s" % (filesystem,))
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit" % (filesystem, len(cfs)))

            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s" % (filesystem))

        logger.info("Critical filesets: %s" % (critical_filesets,))

        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """Main script."""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None)
    }
    opts = ExtendedSimpleOption(options)

    try:
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(days=1)
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        active_users, inactive_users = client.get_accounts()


        grace_users = []
        for a in active_users:
            try:
                if a.expiry_date and datetime.datetime.strptime(a.expiry_date, "%Y-%m-%d") - now < datetime.timedelta(days=7):
                    grace_users.append(a)
            except AttributeError as err:
                logger.debug("Account %s does not have expiry date", a.vsc_id)


        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 10
0
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    filesystem_error = 0
    filesystem_ok = 0
    error = False

    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s" % (key))
            except Exception, err:
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s" % (key))
    except Exception, err:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 11
0
    def test_threshold_default_setting(self, mock_proceed, mock_lock,
                                       mock_lockfile):
        """Test if the default value is set"""
        mock_proceed.return_value = True
        mock_lockfile.return_value = mock.MagicMock()

        opts = ExtendedSimpleOption(options={})
        self.assertEqual(opts.options.nagios_check_interval_threshold,
                         DEFAULT_OPTIONS['nagios-check-interval-threshold'][3])
        self.assertEqual(opts.nagios_reporter._threshold,
                         DEFAULT_OPTIONS['nagios-check-interval-threshold'][3])
        self.assertEqual(opts.nagios_reporter._cache_user, 'nagios')
        self.assertEqual(opts.options.nagios_user, 'nagios')
Esempio n. 12
0
    def test_threshold_custom_setting(self, mock_proceed, mock_lock,
                                      mock_lockfile):
        """Test if a custom value is passed on correctly"""
        mock_proceed.return_value = True
        mock_lockfile.return_value = mock.MagicMock()

        threshold = random.uniform(1, 1000)

        opts = ExtendedSimpleOption({
            'nagios-check-interval-threshold': threshold,
            'nagios-user': '******'
        })
        self.assertEqual(opts.options.nagios_check_interval_threshold,
                         threshold)
        self.assertEqual(opts.options.nagios_user, 'nrpe')
        self.assertEqual(opts.nagios_reporter._cache_user, 'nrpe')
Esempio n. 13
0
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    backend = opts.options.backend
    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            logger.exception("Backend %s not supported", backend)

        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key, )] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "%s_quota_%s_%s.gz" % (
                    backend, time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]).encode())
                zipfile.close()
                stats["%s_quota_log" % (key, )] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                stats["%s_quota_log" % (key, )] = 1
                logger.exception("Failed storing quota information for FS %s",
                                 key)
    except Exception:
        logger.exception("Failure obtaining %s quota", backend)
        opts.critical("Failure to obtain %s quota information" % backend)

    opts.epilogue("Logged %s quota" % backend, stats)
Esempio n. 14
0
def main():
    """
    Main script.
    - build the filter
    - fetches the users
    - process the users
    - write the new timestamp if everything went OK
    - write the nagios check file
    """

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None,
                    'extend', []),
        'user': ('process users', None, 'store_true', False),
        'vo': ('process vos', None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API',
                         None, 'store', None),
        'account_page_url':
        ('URL of the account page where we can find the REST API', None,
         'store', None),
        'host_institute':
        ('Name of the institute where this script is being run', str, 'store',
         GENT),
        'start_timestamp':
        ('Timestamp to start the sync from', str, 'store', None),
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME, start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        client = AccountpageClient(token=opts.options.access_token,
                                   url=opts.options.account_page_url + "/api/")

        institute = opts.options.host_institute

        (users_ok, users_fail) = ([], [])
        (quota_ok, quota_fail) = ([], [])
        if opts.options.user:
            changed_accounts = client.account.institute[institute].modified[
                last_timestamp].get()[1]

            logging.info(
                "Found %d %s accounts that have changed in the accountpage since %s"
                % (len(changed_accounts), institute, last_timestamp))

            accounts = nub([u['vsc_id'] for u in changed_accounts])

            for storage_name in opts.options.storage:
                (users_ok, users_fail) = process_users(opts.options, accounts,
                                                       storage_name, client,
                                                       institute)
                stats["%s_users_sync" % (storage_name, )] = len(users_ok)
                stats["%s_users_sync_fail" %
                      (storage_name, )] = len(users_fail)
                stats["%s_users_sync_fail_warning" %
                      (storage_name, )] = STORAGE_USERS_LIMIT_WARNING
                stats["%s_users_sync_fail_critical" %
                      (storage_name, )] = STORAGE_USERS_LIMIT_CRITICAL

            for storage_name in opts.options.storage:
                storage_changed_quota = [
                    mkVscUserSizeQuota(q) for q in client.quota.user.
                    storage[storage_name].modified[last_timestamp].get()[1]
                ]
                storage_changed_quota = [
                    q for q in storage_changed_quota
                    if q.fileset.startswith('vsc')
                ]
                logging.info(
                    "Found %d accounts that have changed quota on storage %s in the accountpage since %s",
                    len(storage_changed_quota), storage_name, last_timestamp)
                (quota_ok, quota_fail) = process_users_quota(
                    opts.options, storage_changed_quota, storage_name, client,
                    institute)
                stats["%s_quota_sync" % (storage_name, )] = len(quota_ok)
                stats["%s_quota_sync_fail" %
                      (storage_name, )] = len(quota_fail)
                stats["%s_quota_sync_fail_warning" %
                      (storage_name, )] = STORAGE_QUOTA_LIMIT_WARNING
                stats["%s_quota_sync_fail_critical" %
                      (storage_name, )] = STORAGE_QUOTA_LIMIT_CRITICAL

        (vos_ok, vos_fail) = ([], [])
        if opts.options.vo:
            changed_vos = client.vo.institute[institute].modified[
                last_timestamp].get()[1]
            changed_vo_quota = client.quota.vo.modified[last_timestamp].get(
            )[1]

            vos = sorted(
                set([v['vsc_id'] for v in changed_vos] +
                    [v['virtual_organisation'] for v in changed_vo_quota]))

            logging.info(
                "Found %d %s VOs that have changed in the accountpage since %s"
                % (len(changed_vos), institute, last_timestamp))
            logging.info(
                "Found %d %s VOs that have changed quota in the accountpage since %s"
                % (len(changed_vo_quota), institute, last_timestamp))
            logging.debug("Found the following {institute} VOs: {vos}".format(
                institute=institute, vos=vos))

            for storage_name in opts.options.storage:
                (vos_ok, vos_fail) = process_vos(opts.options, vos,
                                                 storage_name, client,
                                                 last_timestamp, institute)
                stats["%s_vos_sync" % (storage_name, )] = len(vos_ok)
                stats["%s_vos_sync_fail" % (storage_name, )] = len(vos_fail)
                stats["%s_vos_sync_fail_warning" %
                      (storage_name, )] = STORAGE_VO_LIMIT_WARNING
                stats["%s_vos_sync_fail_critical" %
                      (storage_name, )] = STORAGE_VO_LIMIT_CRITICAL

        if not (users_fail or quota_fail
                or vos_fail) and not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("%s users and VOs synchronised" % institute, stats)
Esempio n. 15
0
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
        'host_institute':
        ('Name of the institute where this script is being run', str, 'store',
         GENT),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    backend = opts.options.backend
    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            logger.exception("Backend %s not supported" % backend)

        filesets = storage_backend.list_filesets()
        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" %
                  (filesystem, )] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "%s_inodes_%s_%s.gz" % (
                    backend, time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem, )] = 0
                logger.info("Stored inodes information for FS %s" %
                            (filesystem))

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9,
                                                 storage=backend)
                logger.info("Processed inodes information for filesystem %s" %
                            (filesystem, ))
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info(
                        "Filesystem %s has at least %d filesets reaching the limit"
                        % (filesystem, len(cfs)))

            except Exception:
                stats["%s_inodes_log" % (filesystem, )] = 1
                logger.exception(
                    "Failed storing inodes information for FS %s" %
                    (filesystem))

        logger.info("Critical filesets: %s" % (critical_filesets, ))

        if critical_filesets:
            mail_admins(critical_filesets,
                        dry_run=opts.options.dry_run,
                        host_institute=opts.options.host_institute)

    except Exception:
        logger.exception("Failure obtaining %s inodes" % backend)
        opts.critical("Failure to obtain %s inodes information" % backend)

    opts.epilogue("Logged %s inodes" % backend, stats)
Esempio n. 16
0
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project',
                        None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk',
                     str, 'store', 'delcatty'),
        'account_page_url':
        ('the URL at which the account page resides', None, 'store', None),
        'access_token':
        ('the token that will allow authentication against the account page',
         None, 'store', None),
        'target_master':
        ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store',
                        None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {'master': master, 'path': showq_path}

        logger.debug("clusters = %s" % (clusters, ))
        showq = SshShowq(opts.options.target_master,
                         opts.options.target_user,
                         clusters,
                         cache_pickle=True,
                         dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, _, _) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information,
               rest_client)
        (target_users, target_queue_information,
         user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user,
                                       rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq",
                              (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point,
                              ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s" %
                             (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """
    Main script. The usual.
    """

    options = {
        "nagios-check-interval-threshold":
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        "access_token": ("OAuth2 token to access the account page REST API",
                         None, "store", None),
        "account_page_url": (
            "URL of the account page where we can find the REST API",
            str,
            "store",
            "https://apivsc.ugent.be/django",
        ),
        "clusters": (
            "Cluster(s) (comma-separated) to sync for. "
            "Overrides GENT_SLURM_COMPUTE_CLUSTERS that are in production.",
            str,
            "store",
            None,
        ),
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        client = AccountpageClient(token=opts.options.access_token,
                                   url=opts.options.account_page_url + "/api/")

        last_timestamp = "201804010000Z"  # the beginning of time

        logging.info("Last recorded timestamp was %s" % (last_timestamp))

        slurm_account_info = get_slurm_acct_info(SyncTypes.accounts)
        slurm_user_info = get_slurm_acct_info(SyncTypes.users)

        logging.debug("%d accounts found", len(slurm_account_info))
        logging.debug("%d users found", len(slurm_user_info))

        if opts.options.clusters is not None:
            clusters = opts.options.clusters.split(",")
        else:
            clusters = [
                c for c in GENT_SLURM_COMPUTE_CLUSTERS
                if c in GENT_PRODUCTION_COMPUTE_CLUSTERS
            ]

        sacctmgr_commands = []

        # make sure the institutes and the default accounts (VOs) are there for each cluster
        sacctmgr_commands += slurm_institute_accounts(slurm_account_info,
                                                      clusters)

        # All users belong to a VO, so fetching the VOs is necessary/
        account_page_vos = [mkVo(v) for v in client.vo.get()[1]]

        # The VOs do not track active state of users, so we need to fetch all accounts as well
        active_accounts = set(
            [a["vsc_id"] for a in client.account.get()[1] if a["isactive"]])

        # dictionary mapping the VO vsc_id on a tuple with the VO members and the VO itself
        account_page_members = dict([(vo.vsc_id, (set(vo.members), vo))
                                     for vo in account_page_vos])

        # process all regular VOs
        sacctmgr_commands += slurm_vo_accounts(account_page_vos,
                                               slurm_account_info, clusters)

        # process VO members
        sacctmgr_commands += slurm_user_accounts(account_page_members,
                                                 active_accounts,
                                                 slurm_user_info, clusters,
                                                 opts.options.dry_run)

        logging.info("Executing %d commands", len(sacctmgr_commands))

        if opts.options.dry_run:
            print("Commands to be executed:\n")
            print("\n".join([" ".join(c) for c in sacctmgr_commands]))
        else:
            execute_commands(sacctmgr_commands)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    if not opts.options.dry_run:
        opts.epilogue("Accounts synced to slurm", stats)
    else:
        logger.info("Dry run done")
Esempio n. 18
0
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk',
                     str, 'store', 'delcatty'),
        'access_token':
        ('the token that will allow authentication against the account page',
         None, 'store', None),
        'account_page_url': ('', None, 'store', None),
        'target_master':
        ('the master used to execute showq commands', None, 'store', None),
        'target_user':
        ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {'master': master, 'path': checkjob_path}

        checkjob = SshCheckjob(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        (job_information, _, _) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo(
                    {user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information,
                              gpfs, login_mount_point, gpfs_mount_point,
                              ".checkjob.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s" %
                                 (user))
                nagios_no_store += 1
        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 19
0
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'account_page_url': ('Base URL of the account page', None, 'store', 'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
    }
    opts = ExtendedSimpleOption(options)

    try:
        opener = urllib2.build_opener(urllib2.HTTPHandler)
        access_token = opts.options.access_token

        user_id_map = map_uids_to_names()  # is this really necessary?
        LdapQuery(VscConfiguration())
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets()
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota()
        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:

            logger.info("Processing quota for storage_name %s" % (storage_name))
            filesystem = storage[storage_name].filesystem

            if filesystem not in filesystems:
                logger.error("Non-existant filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]" % (storage_name, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(quota[filesystem], storage_name, filesystem, filesets)

            exceeding_filesets[storage_name] = process_fileset_quota(storage,
                                                                     gpfs,
                                                                     storage_name,
                                                                     filesystem,
                                                                     quota_storage_map['FILESET'],
                                                                     opener,
                                                                     opts.options.account_page_url,
                                                                     access_token,
                                                                     opts.options.dry_run)
            exceeding_users[storage_name] = process_user_quota(storage,
                                                               gpfs,
                                                               storage_name,
                                                               filesystem,
                                                               quota_storage_map['USR'],
                                                               user_id_map,
                                                               opener,
                                                               opts.options.account_page_url,
                                                               access_token,
                                                               opts.options.dry_run)

            stats["%s_fileset_critical" % (storage_name,)] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name,)] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota" % (storage_name,
                                                                                                len(exceeding_filesets)))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s" % (e_fileset, str(e_quota)))
            else:
                stats["%s_fileset" % (storage_name,)] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota" % storage_name)

            notify_exceeding_filesets(gpfs=gpfs,
                                      storage=storage_name,
                                      filesystem=filesystem,
                                      exceeding_items=exceeding_filesets[storage_name],
                                      dry_run=opts.options.dry_run)

            stats["%s_users_warning" % (storage_name,)] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name,)] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name,)] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota" %
                               (storage_name, len(exceeding_users[storage_name])))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s" % (e_user_id, str(e_quota)))
            else:
                stats["%s_users" % (storage_name,)] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota" % storage_name)

            notify_exceeding_users(gpfs=gpfs,
                                   storage=storage_name,
                                   filesystem=filesystem,
                                   exceeding_items=exceeding_users[storage_name],
                                   dry_run=opts.options.dry_run)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 20
0
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location':
        ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" %
                  (filesystem, )] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (
                    time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem, )] = 0
                logger.info("Stored inodes information for FS %s" %
                            (filesystem))

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9)
                logger.info("Processed inodes information for filesystem %s" %
                            (filesystem, ))
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info(
                        "Filesystem %s has at least %d filesets reaching the limit"
                        % (filesystem, len(cfs)))

            except Exception:
                stats["%s_inodes_log" % (filesystem, )] = 1
                logger.exception(
                    "Failed storing inodes information for FS %s" %
                    (filesystem))

        logger.info("Critical filesets: %s" % (critical_filesets, ))

        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")

    opts.epilogue("Logged GPFS inodes", stats)
Esempio n. 21
0
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'account_page_url': ('the URL at which the account page resides', None, 'store', None),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {
                'master': master,
                'path': showq_path
            }

        logger.debug("clusters = %s" % (clusters,))
        showq = MasterSshShowq(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information, rest_client)
        (target_users, target_queue_information, user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user, rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq", (user_queue_information, user_map[user]), gpfs, login_mount_point,
                              gpfs_mount_point, ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 22
0
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script',
                    None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None,
                        'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store',
                             'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API',
                         None, 'store', None),
        'host_institute':
        ('Name of the institute where this script is being run', str, 'store',
         GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [
            storage[s].filesystem for s in opts.options.storage
        ]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" %
                     (filesystems))

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota(devices=target_filesystems)
        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:

            logger.info("Processing quota for storage_name %s" %
                        (storage_name))
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]" %
                             (storage_name, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem],
                storage_name,
                filesystem,
                filesets,
                replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage,
                gpfs,
                storage_name,
                filesystem,
                quota_storage_map['FILESET'],
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            exceeding_users[storage_name] = process_user_quota(
                storage,
                gpfs,
                storage_name,
                None,
                quota_storage_map['USR'],
                user_id_map,
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            stats["%s_fileset_critical" %
                  (storage_name, )] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name, )] = 1
                logger.warning(
                    "storage_name %s found %d filesets that are exceeding their quota",
                    storage_name, len(exceeding_filesets))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s" %
                                   (e_fileset, str(e_quota)))
            else:
                stats["%s_fileset" % (storage_name, )] = 0
                logger.debug(
                    "storage_name %s found no filesets that are exceeding their quota"
                    % storage_name)

            stats["%s_users_warning" % (storage_name, )] = QUOTA_USERS_WARNING
            stats["%s_users_critical" %
                  (storage_name, )] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name, )] = len(
                    exceeding_users[storage_name])
                logger.warning(
                    "storage_name %s found %d users who are exceeding their quota"
                    % (storage_name, len(exceeding_users[storage_name])))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s" %
                                   (e_user_id, str(e_quota)))
            else:
                stats["%s_users" % (storage_name, )] = 0
                logger.debug(
                    "storage_name %s found no users who are exceeding their quota"
                    % storage_name)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")

    opts.epilogue("quota check completed", stats)
Esempio n. 23
0
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: home, scratch', str, 'store', 'home'),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'account_page_url': ('',None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {
                'master': master,
                'path': checkjob_path
            }

        checkjob = MasterSshCheckjob(
            opts.options.target_master,
            opts.options.target_user,
            clusters,
            cache_pickle=True,
            dry_run=opts.options.dry_run)

        (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information, gpfs, login_mount_point,
                        gpfs_mount_point, ".checkjob.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s" % (user))
                nagios_no_store += 1
        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Esempio n. 24
0
def main():
    """The script"""

    options = {
        "detailed": ("Report detailed information", None, "store_true", False, "D"),
        "moabxml": ("Use xml moab data from file (for testing)", None, "store", None),
        "max-retries": ("Maximum number retries prior to going critical", "int", "store", 2),
        "retry-interval": ("Seconds in between retries", "int", "store", 60),
    }

    opts = ExtendedSimpleOption(options)

    msg = "show_stats completed (%d tries)"
    try:
        if opts.options.moabxml:
            try:
                moabxml = open(opts.options.moabxml).read()
            except:
                logger.raiseException("Failed to read moab xml from %s" % opts.options.moabxml)
        else:
            moabxml = None

        for retry in xrange(0, opts.options.max_retries):
            moab_stats = showstats(xml=moabxml)
            if moab_stats:
                break
            else:
                logger.info("Sleeping after retry %d" % (retry + 1,))
                time.sleep(opts.options.retry_interval)

        if not moab_stats:
            logger.error("Moabs showstats dit not provide useful output after %d, likely timed out." % (retry + 1,))
            opts.critical("Moabs showstats failed running correctly (%d retries)" % (retry + 1,))
            sys.exit(NAGIOS_EXIT_CRITICAL)

        else:
            stats = moab_stats["summary"]

            if opts.options.detailed:
                detailed_info_string = """Shortterm/Longterm efficiency %.3f/%.3f
Dedicate/total prochours %s/%s
Active/Total procs %s/%s""" % (
                    stats["STE"],
                    stats["LTE"],
                    stats["DPH"],
                    stats["TPH"],
                    stats["CAP"],
                    stats["CTP"],
                )
                logger.info(
                    "detailed result STE = %s LTE = %s DPH = %s TPH = %s CAP = %s CTP = %s"
                    % (stats["STE"], stats["LTE"], stats["DPH"], stats["TPH"], stats["CAP"], stats["CTP"])
                )
                print detailed_info_string

            info_string = "short %.3f long %.3f" % (stats["STE"], stats["LTE"])
            logger.info("result: %s" % (info_string,))
            msg = msg % (retry + 1,)
            msg += " %s" % (info_string,)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)