def main(args):
    """Main script."""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        vsc_config = VscConfiguration()
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, opts.options.dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, opts.options.dry_run)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Main function"""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'cache': ('the location to store the cache with previous release hold data', None, 'store',
                  RELEASEJOB_CACHE_FILE),
    }
    opts = ExtendedSimpleOption(options)

    try:
        # parse config file
        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            mjobctl_path = opts.configfile_parser.get(host, "mjobctl_path")
            clusters[host] = {
                'master': master,
                'spath': showq_path,
                'mpath': mjobctl_path,
            }

        # process the new and previous data
        released_jobids, stats = process_hold(clusters, dry_run=opts.options.dry_run)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Like, the main."""

    options = {
        'check': ('Gather information for a nagios check', None, 'store_true', False),
        'detailed': ('Report detailed information', None, 'store_true', False, 'D'),
        'groups': ('Report for groups', None, "extend", [], 'g'),
        'users': ('Report for users', None, "extend", [], 'u'),
        'show': ('Show details: %s' % ','.join(SHOW_LIST), "strlist", "store", None),
        'jobs': ("Jobid(s)", "strlist", "store", None),
    }

    global go
    go = ExtendedSimpleOption(options)

    try:
        if go.options.show:
            show_individual()  # does not need to affect the cached nagios result?
            sys.exit(0)
        else:
            msg = show_summary()
    except Exception as err:
        go.log.exception("critical exception caught: %s" % err)
        go.critical("Script failed in a horrible way: %s" % err)
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Main script."""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        vsc_config = VscConfiguration(VSC_CONF_DEFAULT_FILENAME)
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """The script"""

    options = {
        'detailed': ('Report detailed information', None, 'store_true', False, 'D'),
        'moabxml': ('File containing moab XML data (only for testing)', None, 'store', None),
        'max-retries': ('Maximum number retries prior to going critical', 'int', 'store', 2),
        'retry-interval': ('Seconds in between retries', 'int', 'store', 60),
    }
    opts = ExtendedSimpleOption(options)

    msg = "show_stats completed (%d tries)"

    try:
        if opts.options.moabxml:
            try:
                moabxml = open(opts.options.moabxml).read()
            except (IOError, OSError):
                logger.raiseException('Failed to read moab xml from %s' % opts.options.moabxml)
        else:
            moabxml = None

        retry = 0
        for retry in range(0, opts.options.max_retries):
            moab_stats = showstats(xml=moabxml)
            if moab_stats:
                break
            else:
                logger.info("Sleeping after retry %d" % (retry + 1,))
                time.sleep(opts.options.retry_interval)

        if not moab_stats:
            logger.error("Moab showstats did not provide useful output after %d retries, likely timed out." %
                         (retry + 1,))
            opts.critical("Moab showstats failed running correctly (%d retries)" % (retry + 1,))
            sys.exit(NAGIOS_EXIT_CRITICAL)
        else:
            stats = moab_stats['summary']

            if opts.options.detailed:
                detailed_info_string = """Shortterm/Longterm efficiency %.3f/%.3f
Dedicate/total prochours %s/%s
Active/Total procs %s/%s""" % (stats['STE'], stats['LTE'],
                               stats['DPH'], stats['TPH'],
                               stats['CAP'], stats['CTP'],)
                logger.info("detailed result STE = %s LTE = %s DPH = %s TPH = %s CAP = %s CTP = %s" %
                            (stats['STE'], stats['LTE'], stats['DPH'], stats['TPH'], stats['CAP'], stats['CTP'],))
                print(detailed_info_string)

            info_string = "short %.3f long %.3f" % (stats['STE'], stats['LTE'])
            logger.info("result: %s" % (info_string,))

            msg = msg % (retry + 1,)
            msg += " %s" % (info_string,)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s" % (filesystem))

                cfs = process_inodes_information(filesets[filesystem])
                logger.info("Processed inodes information for filesystem %s" % (filesystem,))
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit" % (filesystem, len(cfs)))
            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s" % (filesystem))

        logger.info("Critical filesets: %s" % (critical_filesets,))
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """Main script."""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
    }
    opts = ExtendedSimpleOption(options)

    try:
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(days=1)

        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        active_users, inactive_users = client.get_accounts()

        grace_users = []
        for a in active_users:
            try:
                if a.expiry_date and datetime.datetime.strptime(a.expiry_date, "%Y-%m-%d") - now < datetime.timedelta(days=7):
                    grace_users.append(a)
            except AttributeError as err:
                logger.debug("Account %s does not have expiry date", a.vsc_id)

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)

    filesystem_error = 0
    filesystem_ok = 0
    error = False

    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s" % (key))
            except Exception as err:
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s" % (key))
    except Exception as err:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def test_threshold_default_setting(self, mock_proceed, mock_lock, mock_lockfile):
    """Test if the default value is set"""
    mock_proceed.return_value = True
    mock_lockfile.return_value = mock.MagicMock()

    opts = ExtendedSimpleOption(options={})
    self.assertEqual(opts.options.nagios_check_interval_threshold,
                     DEFAULT_OPTIONS['nagios-check-interval-threshold'][3])
    self.assertEqual(opts.nagios_reporter._threshold,
                     DEFAULT_OPTIONS['nagios-check-interval-threshold'][3])
    self.assertEqual(opts.nagios_reporter._cache_user, 'nagios')
    self.assertEqual(opts.options.nagios_user, 'nagios')
def test_threshold_custom_setting(self, mock_proceed, mock_lock, mock_lockfile):
    """Test if a custom value is passed on correctly"""
    mock_proceed.return_value = True
    mock_lockfile.return_value = mock.MagicMock()

    threshold = random.uniform(1, 1000)

    opts = ExtendedSimpleOption({
        'nagios-check-interval-threshold': threshold,
        'nagios-user': 'nrpe',  # value restored from the assertions below
    })
    self.assertEqual(opts.options.nagios_check_interval_threshold, threshold)
    self.assertEqual(opts.options.nagios_user, 'nrpe')
    self.assertEqual(opts.nagios_reporter._cache_user, 'nrpe')
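Both tests above exercise the same pattern every script in this collection uses: an options dict whose keys map to tuples of (help text, type, action, default, optional short flag), handed to ExtendedSimpleOption, which exposes the parsed values with dashes turned into underscores (hence opts.options.nagios_check_interval_threshold, and DEFAULT_OPTIONS['nagios-check-interval-threshold'][3] for the built-in default). A minimal sketch of that pattern follows; the import path and the literal values are assumptions for illustration, not taken from the snippets above.

# Sketch only: the import path below is assumed (none of the snippets above
# show where ExtendedSimpleOption comes from); option values are illustrative.
from vsc.utils.script_tools import ExtendedSimpleOption

options = {
    # key: (help text, type, action, default[, short option letter])
    'detailed': ('Report detailed information', None, 'store_true', False, 'D'),
    'location': ('path to store the gzipped files', None, 'store', '/tmp/inode-logs'),
}

opts = ExtendedSimpleOption(options)

# dashes in option names become underscores on the parsed options object
if opts.options.detailed:
    print(opts.options.location)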
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
    }

    opts = ExtendedSimpleOption(options)

    stats = {}

    backend = opts.options.backend

    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            logger.error("Backend %s not supported", backend)
            raise ValueError("unsupported backend %s" % backend)

        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "%s_quota_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]).encode())
                zipfile.close()
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining %s quota", backend)
        opts.critical("Failure to obtain %s quota information" % backend)

    opts.epilogue("Logged %s quota" % backend, stats)
def main():
    """
    Main script.
    - build the filter
    - fetches the users
    - process the users
    - write the new timestamp if everything went OK
    - write the nagios check file
    """

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
        'user': ('process users', None, 'store_true', False),
        'vo': ('process vos', None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using start time %s", start_time)

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        institute = opts.options.host_institute

        (users_ok, users_fail) = ([], [])
        (quota_ok, quota_fail) = ([], [])
        if opts.options.user:
            changed_accounts = client.account.institute[institute].modified[last_timestamp].get()[1]

            logging.info("Found %d %s accounts that have changed in the accountpage since %s" %
                         (len(changed_accounts), institute, last_timestamp))

            accounts = nub([u['vsc_id'] for u in changed_accounts])

            for storage_name in opts.options.storage:
                (users_ok, users_fail) = process_users(opts.options, accounts, storage_name, client, institute)
                stats["%s_users_sync" % (storage_name,)] = len(users_ok)
                stats["%s_users_sync_fail" % (storage_name,)] = len(users_fail)
                stats["%s_users_sync_fail_warning" % (storage_name,)] = STORAGE_USERS_LIMIT_WARNING
                stats["%s_users_sync_fail_critical" % (storage_name,)] = STORAGE_USERS_LIMIT_CRITICAL

            for storage_name in opts.options.storage:
                storage_changed_quota = [
                    mkVscUserSizeQuota(q)
                    for q in client.quota.user.storage[storage_name].modified[last_timestamp].get()[1]
                ]
                storage_changed_quota = [q for q in storage_changed_quota if q.fileset.startswith('vsc')]
                logging.info("Found %d accounts that have changed quota on storage %s in the accountpage since %s",
                             len(storage_changed_quota), storage_name, last_timestamp)
                (quota_ok, quota_fail) = process_users_quota(
                    opts.options, storage_changed_quota, storage_name, client, institute)
                stats["%s_quota_sync" % (storage_name,)] = len(quota_ok)
                stats["%s_quota_sync_fail" % (storage_name,)] = len(quota_fail)
                stats["%s_quota_sync_fail_warning" % (storage_name,)] = STORAGE_QUOTA_LIMIT_WARNING
                stats["%s_quota_sync_fail_critical" % (storage_name,)] = STORAGE_QUOTA_LIMIT_CRITICAL

        (vos_ok, vos_fail) = ([], [])
        if opts.options.vo:
            changed_vos = client.vo.institute[institute].modified[last_timestamp].get()[1]
            changed_vo_quota = client.quota.vo.modified[last_timestamp].get()[1]

            vos = sorted(set([v['vsc_id'] for v in changed_vos] +
                             [v['virtual_organisation'] for v in changed_vo_quota]))

            logging.info("Found %d %s VOs that have changed in the accountpage since %s" %
                         (len(changed_vos), institute, last_timestamp))
            logging.info("Found %d %s VOs that have changed quota in the accountpage since %s" %
                         (len(changed_vo_quota), institute, last_timestamp))
            logging.debug("Found the following {institute} VOs: {vos}".format(institute=institute, vos=vos))

            for storage_name in opts.options.storage:
                (vos_ok, vos_fail) = process_vos(opts.options, vos, storage_name, client, last_timestamp, institute)
                stats["%s_vos_sync" % (storage_name,)] = len(vos_ok)
                stats["%s_vos_sync_fail" % (storage_name,)] = len(vos_fail)
                stats["%s_vos_sync_fail_warning" % (storage_name,)] = STORAGE_VO_LIMIT_WARNING
                stats["%s_vos_sync_fail_critical" % (storage_name,)] = STORAGE_VO_LIMIT_CRITICAL

        if not (users_fail or quota_fail or vos_fail) and not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("%s users and VOs synchronised" % institute, stats)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    backend = opts.options.backend

    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            logger.error("Backend %s not supported", backend)
            raise ValueError("unsupported backend %s" % backend)

        filesets = storage_backend.list_filesets()
        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "%s_inodes_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s" % (filesystem))

                cfs = process_inodes_information(filesets[filesystem],
                                                 quota[filesystem]['FILESET'],
                                                 threshold=0.9,
                                                 storage=backend)
                logger.info("Processed inodes information for filesystem %s" % (filesystem,))
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit" % (filesystem, len(cfs)))
            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s" % (filesystem))

        logger.info("Critical filesets: %s" % (critical_filesets,))
        if critical_filesets:
            mail_admins(critical_filesets,
                        dry_run=opts.options.dry_run,
                        host_institute=opts.options.host_institute)

    except Exception:
        logger.exception("Failure obtaining %s inodes" % backend)
        opts.critical("Failure to obtain %s inodes information" % backend)

    opts.epilogue("Logged %s inodes" % backend, stats)
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'account_page_url': ('the URL at which the account page resides', None, 'store', None),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {'master': master, 'path': showq_path}

        logger.debug("clusters = %s" % (clusters,))
        showq = SshShowq(opts.options.target_master,
                         opts.options.target_user,
                         clusters,
                         cache_pickle=True,
                         dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, _, _) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information, rest_client)
        (target_users, target_queue_information, user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user, rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq", (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point, ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """
    Main script. The usual.
    """

    options = {
        "nagios-check-interval-threshold": NAGIOS_CHECK_INTERVAL_THRESHOLD,
        "access_token": ("OAuth2 token to access the account page REST API", None, "store", None),
        "account_page_url": (
            "URL of the account page where we can find the REST API",
            str,
            "store",
            "https://apivsc.ugent.be/django",
        ),
        "clusters": (
            "Cluster(s) (comma-separated) to sync for. "
            "Overrides GENT_SLURM_COMPUTE_CLUSTERS that are in production.",
            str,
            "store",
            None,
        ),
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")

        last_timestamp = "201804010000Z"  # the beginning of time

        logging.info("Last recorded timestamp was %s" % (last_timestamp))

        slurm_account_info = get_slurm_acct_info(SyncTypes.accounts)
        slurm_user_info = get_slurm_acct_info(SyncTypes.users)

        logging.debug("%d accounts found", len(slurm_account_info))
        logging.debug("%d users found", len(slurm_user_info))

        if opts.options.clusters is not None:
            clusters = opts.options.clusters.split(",")
        else:
            clusters = [c for c in GENT_SLURM_COMPUTE_CLUSTERS if c in GENT_PRODUCTION_COMPUTE_CLUSTERS]

        sacctmgr_commands = []

        # make sure the institutes and the default accounts (VOs) are there for each cluster
        sacctmgr_commands += slurm_institute_accounts(slurm_account_info, clusters)

        # All users belong to a VO, so fetching the VOs is necessary.
        account_page_vos = [mkVo(v) for v in client.vo.get()[1]]

        # The VOs do not track active state of users, so we need to fetch all accounts as well
        active_accounts = set([a["vsc_id"] for a in client.account.get()[1] if a["isactive"]])

        # dictionary mapping the VO vsc_id on a tuple with the VO members and the VO itself
        account_page_members = dict([(vo.vsc_id, (set(vo.members), vo)) for vo in account_page_vos])

        # process all regular VOs
        sacctmgr_commands += slurm_vo_accounts(account_page_vos, slurm_account_info, clusters)

        # process VO members
        sacctmgr_commands += slurm_user_accounts(account_page_members, active_accounts, slurm_user_info,
                                                 clusters, opts.options.dry_run)

        logging.info("Executing %d commands", len(sacctmgr_commands))

        if opts.options.dry_run:
            print("Commands to be executed:\n")
            print("\n".join([" ".join(c) for c in sacctmgr_commands]))
        else:
            execute_commands(sacctmgr_commands)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    if not opts.options.dry_run:
        opts.epilogue("Accounts synced to slurm", stats)
    else:
        logger.info("Dry run done")
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'account_page_url': ('', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {'master': master, 'path': checkjob_path}

        checkjob = SshCheckjob(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        (job_information, _, _) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information, gpfs,
                              login_mount_point, gpfs_mount_point, ".checkjob.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'account_page_url': ('Base URL of the account page', None, 'store', 'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
    }
    opts = ExtendedSimpleOption(options)

    try:
        opener = urllib2.build_opener(urllib2.HTTPHandler)

        access_token = opts.options.access_token

        user_id_map = map_uids_to_names()  # is this really necessary?
        LdapQuery(VscConfiguration())
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets()
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota()

        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:

            logger.info("Processing quota for storage_name %s" % (storage_name))
            filesystem = storage[storage_name].filesystem

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]" % (storage_name, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(quota[filesystem], storage_name, filesystem, filesets)

            exceeding_filesets[storage_name] = process_fileset_quota(storage, gpfs, storage_name, filesystem,
                                                                     quota_storage_map['FILESET'], opener,
                                                                     opts.options.account_page_url, access_token,
                                                                     opts.options.dry_run)
            exceeding_users[storage_name] = process_user_quota(storage, gpfs, storage_name, filesystem,
                                                               quota_storage_map['USR'], user_id_map, opener,
                                                               opts.options.account_page_url, access_token,
                                                               opts.options.dry_run)

            stats["%s_fileset_critical" % (storage_name,)] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name,)] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota" %
                               (storage_name, len(exceeding_filesets)))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s" % (e_fileset, str(e_quota)))
            else:
                stats["%s_fileset" % (storage_name,)] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota" % storage_name)

            notify_exceeding_filesets(gpfs=gpfs, storage=storage_name, filesystem=filesystem,
                                      exceeding_items=exceeding_filesets[storage_name],
                                      dry_run=opts.options.dry_run)

            stats["%s_users_warning" % (storage_name,)] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name,)] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name,)] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota" %
                               (storage_name, len(exceeding_users[storage_name])))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s" % (e_user_id, str(e_quota)))
            else:
                stats["%s_users" % (storage_name,)] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota" % storage_name)

            notify_exceeding_users(gpfs=gpfs, storage=storage_name, filesystem=filesystem,
                                   exceeding_items=exceeding_users[storage_name],
                                   dry_run=opts.options.dry_run)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }

    opts = ExtendedSimpleOption(options)
    logger = opts.log

    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(filesets[filesystem]))
                zipfile.close()
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s" % (filesystem))

                cfs = process_inodes_information(filesets[filesystem], quota[filesystem]['FILESET'], threshold=0.9)
                logger.info("Processed inodes information for filesystem %s" % (filesystem,))
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit" % (filesystem, len(cfs)))
            except Exception:
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s" % (filesystem))

        logger.info("Critical filesets: %s" % (critical_filesets,))
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'account_page_url': ('the URL at which the account page resides', None, 'store', None),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {
                'master': master,
                'path': showq_path,
            }

        logger.debug("clusters = %s" % (clusters,))
        showq = MasterSshShowq(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information, rest_client)
        (target_users, target_queue_information, user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user, rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq", (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point, ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None, 'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store', 'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota(devices=target_filesystems)

        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:

            logger.info("Processing quota for storage_name %s" % (storage_name))
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]" % (storage_name, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem], storage_name, filesystem, filesets, replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage, gpfs, storage_name, filesystem, quota_storage_map['FILESET'], client,
                dry_run=opts.options.dry_run, institute=opts.options.host_institute)

            exceeding_users[storage_name] = process_user_quota(
                storage, gpfs, storage_name, None, quota_storage_map['USR'], user_id_map, client,
                dry_run=opts.options.dry_run, institute=opts.options.host_institute)

            stats["%s_fileset_critical" % (storage_name,)] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name,)] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota",
                               storage_name, len(exceeding_filesets))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s" % (e_fileset, str(e_quota)))
            else:
                stats["%s_fileset" % (storage_name,)] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota" % storage_name)

            stats["%s_users_warning" % (storage_name,)] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name,)] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name,)] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota" %
                               (storage_name, len(exceeding_users[storage_name])))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s" % (e_user_id, str(e_quota)))
            else:
                stats["%s_users" % (storage_name,)] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota" % storage_name)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")

    opts.epilogue("quota check completed", stats)
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'account_page_url': ('', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {
                'master': master,
                'path': checkjob_path,
            }

        checkjob = MasterSshCheckjob(
            opts.options.target_master,
            opts.options.target_user,
            clusters,
            cache_pickle=True,
            dry_run=opts.options.dry_run)

        (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information, gpfs,
                              login_mount_point, gpfs_mount_point, ".checkjob.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """The script"""

    options = {
        "detailed": ("Report detailed information", None, "store_true", False, "D"),
        "moabxml": ("Use xml moab data from file (for testing)", None, "store", None),
        "max-retries": ("Maximum number retries prior to going critical", "int", "store", 2),
        "retry-interval": ("Seconds in between retries", "int", "store", 60),
    }
    opts = ExtendedSimpleOption(options)

    msg = "show_stats completed (%d tries)"

    try:
        if opts.options.moabxml:
            try:
                moabxml = open(opts.options.moabxml).read()
            except (IOError, OSError):
                logger.raiseException("Failed to read moab xml from %s" % opts.options.moabxml)
        else:
            moabxml = None

        for retry in range(0, opts.options.max_retries):
            moab_stats = showstats(xml=moabxml)
            if moab_stats:
                break
            else:
                logger.info("Sleeping after retry %d" % (retry + 1,))
                time.sleep(opts.options.retry_interval)

        if not moab_stats:
            logger.error("Moab showstats did not provide useful output after %d retries, likely timed out." %
                         (retry + 1,))
            opts.critical("Moab showstats failed running correctly (%d retries)" % (retry + 1,))
            sys.exit(NAGIOS_EXIT_CRITICAL)
        else:
            stats = moab_stats["summary"]

            if opts.options.detailed:
                detailed_info_string = """Shortterm/Longterm efficiency %.3f/%.3f
Dedicate/total prochours %s/%s
Active/Total procs %s/%s""" % (
                    stats["STE"], stats["LTE"],
                    stats["DPH"], stats["TPH"],
                    stats["CAP"], stats["CTP"],
                )
                logger.info(
                    "detailed result STE = %s LTE = %s DPH = %s TPH = %s CAP = %s CTP = %s"
                    % (stats["STE"], stats["LTE"], stats["DPH"], stats["TPH"], stats["CAP"], stats["CTP"])
                )
                print(detailed_info_string)

            info_string = "short %.3f long %.3f" % (stats["STE"], stats["LTE"])
            logger.info("result: %s" % (info_string,))

            msg = msg % (retry + 1,)
            msg += " %s" % (info_string,)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)