def get_mpi_and_sched_and_options():
    """
    Selects mpi flavor and scheduler based on environment and arguments.

    @return: A triplet containing the chosen mpi flavor, chosen scheduler and the MympirunOption class,
             or None when only informational output was requested (--showmpi/--showsched) or no
             mpi script was provided.
    """
    scriptname = os.path.basename(os.path.abspath(sys.argv[0]))

    # if the scriptname is 'mpirun', it means that mympirun was called through the faked mpirun path
    isfake = scriptname == 'mpirun'

    # init generaloption with the various mpirun cli options
    optionparser = MympirunOption(ismpirun=isfake)

    # see if an mpi flavor was explicitly chosen as a command line argument. If not, just use the mpirun that was called
    # We are using sys.argv because generaloption depends on the returned scriptname
    if optionparser.options.setmpi:
        setmpi = optionparser.options.setmpi
        optionparser.log.debug("mympirun has been forced to use %s as MPI flavor", setmpi)
    else:
        setmpi = sys.argv[0]
        optionparser.log.debug("mympirun will be executed by %s", setmpi)

    scriptname, mpi, found_mpi = mpim.what_mpi(setmpi)
    found_mpi_names = [x.__name__ for x in found_mpi]

    if optionparser.options.showmpi:
        # informational mode: list the detected MPI classes and stop
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found MPI classes %s", ", ".join(found_mpi_names))
        return

    # Select a Scheduler from the available schedulers
    sched, found_sched = schedm.what_sched(getattr(optionparser.options, 'setsched', None))
    found_sched_names = [x.__name__ for x in found_sched]

    if optionparser.options.showsched:
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found Sched classes %s", ", ".join(found_sched_names))
        return

    if mpi is None:
        optionparser.log.raiseException(("No MPI class found that supports scriptname %s; isfake %s). Please use "
                                         "mympirun through one of the direct calls or make sure the mpirun command can"
                                         " be found. Found MPI %s") %
                                        (scriptname, isfake, ", ".join(found_mpi_names)))
    else:
        optionparser.log.debug("Found MPI class %s (scriptname %s; isfake %s)", mpi.__name__, scriptname, isfake)

    if sched is None:
        # BUG FIX: raiseException() does not take lazy %-style arguments the way debug()/info()
        # do -- its second positional parameter is the exception class -- so passing the values
        # as extra arguments raised a TypeError instead of the intended error message.
        # Format the message up front, consistent with the mpi-is-None branch above.
        optionparser.log.raiseException("No sched class found (options.setsched %s ; found Sched classes %s)" %
                                        (optionparser.options.setsched, ", ".join(found_sched_names)))
    else:
        optionparser.log.debug("Found sched class %s from options.setsched %s (all Sched found %s)",
                               sched.__name__, optionparser.options.setsched, ", ".join(found_sched_names))

    if not optionparser.args:
        optionparser.log.warn("no mpi script provided")
        return

    return mpi, sched, optionparser
def get_mpi_and_sched_and_options():
    """Parses the mpi and scheduler based on current environment and guesses the best one to use"""
    scriptname, mpi, found_mpi = whatMPI(sys.argv[0])

    # a scriptname of 'mpirun' means mympirun was invoked through the fake mpirun wrapper
    ismpirun = scriptname == 'mpirun'
    opts = MympirunOption(ismpirun=ismpirun)

    # guard clause: nothing to run, show brief usage and bail out
    if not opts.args:
        opts.parser.print_shorthelp()
        raise ExitException("Exit no args provided")

    sched, found_sched = whatSched(getattr(opts.options, 'schedtype', None))

    mpi_names = [cls.__name__ for cls in found_mpi]
    sched_names = [cls.__name__ for cls in found_sched]

    # informational modes: dump the discovered classes and stop
    if opts.options.showmpi:
        fancylogger.setLogLevelInfo()
        _logger.info("Found MPI classes %s" % (", ".join(mpi_names)))
        raise ExitException("Exit from showmpi")

    if opts.options.showsched:
        fancylogger.setLogLevelInfo()
        _logger.info("Found Sched classes %s" % (", ".join(sched_names)))
        raise ExitException("Exit from showsched")

    if mpi is None:
        opts.parser.print_shorthelp()
        msg = ("No MPI class found (scriptname %s; ismpirun %s). Please use mympirun through one "
               "of the direct calls or make sure the mpirun command can be found. "
               "Found MPI %s") % (scriptname, ismpirun, ", ".join(mpi_names))
        opts.log.raiseException(msg)
    else:
        opts.log.debug("Found MPI class %s (scriptname %s; ismpirun %s)" % (mpi.__name__, scriptname, ismpirun))

    if sched is None:
        opts.log.raiseException("No sched class found (options.schedtype %s ; found Sched classes %s)" %
                                (opts.options.schedtype, ", ".join(sched_names)))
    else:
        opts.log.debug("Found sched class %s from options.schedtype %s (all Sched found %s)" %
                       (sched.__name__, opts.options.schedtype, ", ".join(sched_names)))

    # chosen MPI class, chosen scheduler class, and the parsed option object
    return mpi, sched, opts
def getInstance():
    """Make an instance of the relevant MPI class. Also set the RM instance"""
    # a scriptname of 'mpirun' means mympirun was invoked through the fake mpirun wrapper
    scriptname, mpi, found_mpi = whatMPI(sys.argv[0])
    ismpirun = scriptname == 'mpirun'

    mo = MympirunOption(ismpirun=ismpirun)
    if mo.args is None or len(mo.args) == 0:
        # nothing to run: show the short usage and abort
        mo.parser.print_shorthelp()
        raise ExitException("Exit no args provided")

    # schedtype may be absent from the options object, hence getattr with a default
    sched, found_sched = whatSched(getattr(mo.options, 'schedtype', None))

    found_mpi_names = [x.__name__ for x in found_mpi]
    found_sched_names = [x.__name__ for x in found_sched]

    if mo.options.showmpi:
        # informational mode: list the detected MPI classes and stop
        setLogLevelInfo()
        _logger.info("Found MPI classes %s" % (", ".join(found_mpi_names)))
        raise ExitException("Exit from showmpi")

    if mo.options.showsched:
        # informational mode: list the detected scheduler classes and stop
        setLogLevelInfo()
        _logger.info("Found Sched classes %s" % (", ".join(found_sched_names)))
        raise ExitException("Exit from showsched")

    if mpi is None:
        mo.parser.print_shorthelp()
        mo.log.raiseException(("No MPI class found (scriptname %s; ismpirun %s). Please use mympirun through one "
                               "of the direct calls or make sure the mpirun command can be found. "
                               "Found MPI %s") % (scriptname, ismpirun, ", ".join(found_mpi_names)))
    else:
        mo.log.debug("Found MPI class %s (scriptname %s; ismpirun %s)" % (mpi.__name__, scriptname, ismpirun))

    if sched is None:
        mo.log.raiseException("No sched class found (options.schedtype %s ; found Sched classes %s)" %
                              (mo.options.schedtype, ", ".join(found_sched_names)))
    else:
        mo.log.debug("Found sched class %s from options.schedtype %s (all Sched found %s)" %
                     (sched.__name__, mo.options.schedtype, ", ".join(found_sched_names)))

    class M(mpi, sched):
        """Temporary class to couple MPI and local sched"""
        # NOTE: MRO order matters here -- mpi before sched; super() walks both bases
        def __init__(self, **kwargs):
            self.log = getLogger("%s_%s" % (mpi.__name__, sched.__name__))
            super(M, self).__init__(**kwargs)

    # instantiate the coupled class with the parsed options and the remaining cli args
    return M(options=mo.options, cmdargs=mo.args)
def get_mpi_and_sched_and_options():
    """Parses the mpi and scheduler based on current environment and guesses the best one to use"""
    # a scriptname of 'mpirun' means mympirun was invoked through the fake mpirun wrapper
    scriptname, mpi, found_mpi = whatMPI(sys.argv[0])
    ismpirun = scriptname == 'mpirun'

    mo = MympirunOption(ismpirun=ismpirun)
    if mo.args is None or len(mo.args) == 0:
        # nothing to run: show the short usage and abort
        mo.parser.print_shorthelp()
        raise ExitException("Exit no args provided")

    # schedtype may be absent from the options object, hence getattr with a default
    sched, found_sched = whatSched(getattr(mo.options, 'schedtype', None))

    found_mpi_names = [x.__name__ for x in found_mpi]
    found_sched_names = [x.__name__ for x in found_sched]

    if mo.options.showmpi:
        # informational mode: list the detected MPI classes and stop
        fancylogger.setLogLevelInfo()
        _logger.info("Found MPI classes %s" % (", ".join(found_mpi_names)))
        raise ExitException("Exit from showmpi")

    if mo.options.showsched:
        # informational mode: list the detected scheduler classes and stop
        fancylogger.setLogLevelInfo()
        _logger.info("Found Sched classes %s" % (", ".join(found_sched_names)))
        raise ExitException("Exit from showsched")

    if mpi is None:
        mo.parser.print_shorthelp()
        mo.log.raiseException(("No MPI class found (scriptname %s; ismpirun %s). Please use mympirun through one "
                               "of the direct calls or make sure the mpirun command can be found. "
                               "Found MPI %s") % (scriptname, ismpirun, ", ".join(found_mpi_names)))
    else:
        mo.log.debug("Found MPI class %s (scriptname %s; ismpirun %s)" % (mpi.__name__, scriptname, ismpirun))

    if sched is None:
        mo.log.raiseException("No sched class found (options.schedtype %s ; found Sched classes %s)" %
                              (mo.options.schedtype, ", ".join(found_sched_names)))
    else:
        mo.log.debug("Found sched class %s from options.schedtype %s (all Sched found %s)" %
                     (sched.__name__, mo.options.schedtype, ", ".join(found_sched_names)))

    # triplet: chosen MPI class, chosen scheduler class, and the parsed option object
    return mpi, sched, mo
def _stream_stdouterr(self, isstdout=True, expect_match=True): """Log to stdout or stderror, check stdout or stderror""" fd, logfn = tempfile.mkstemp() # fh will be checked fh = os.fdopen(fd, 'w') _stdout = sys.stdout _stderr = sys.stderr if isstdout == expect_match: sys.stdout = fh sys.stderr = open(os.devnull, 'w') else: sys.stdout = open(os.devnull, 'w') sys.stderr = fh fancylogger.setLogLevelInfo() name = 'test_stream_stdout' lh = fancylogger.logToScreen(stdout=isstdout) logger = fancylogger.getLogger(name, fname=True, clsname=False) # logfn makes it unique msg = 'TEST isstdout %s expect_match %s logfn %s' % ( isstdout, expect_match, logfn) logger.info(msg) # restore fancylogger.logToScreen(enable=False, handler=lh) sys.stdout = _stdout sys.stderr = _stderr fh2 = open(logfn) txt = fh2.read().strip() fh2.close() reg_exp = re.compile(r"INFO\s+\S+.%s.%s\s+\S+\s+%s" % (name, '_stream_stdouterr', msg)) match = reg_exp.search(txt) is not None self.assertEqual(match, expect_match) try: os.remove(logfn) except: pass
def _stream_stdouterr(self, isstdout=True, expect_match=True): """Log to stdout or stderror, check stdout or stderror""" fd, logfn = tempfile.mkstemp() # fh will be checked fh = os.fdopen(fd, 'w') _stdout = sys.stdout _stderr = sys.stderr if isstdout == expect_match: sys.stdout = fh sys.stderr = open(os.devnull, 'w') else: sys.stdout = open(os.devnull, 'w') sys.stderr = fh fancylogger.setLogLevelInfo() name = 'test_stream_stdout' lh = fancylogger.logToScreen(stdout=isstdout) logger = fancylogger.getLogger(name, fname=True, clsname=False) # logfn makes it unique msg = 'TEST isstdout %s expect_match %s logfn %s' % (isstdout, expect_match, logfn) logger.info(msg) # restore fancylogger.logToScreen(enable=False, handler=lh) sys.stdout = _stdout sys.stderr = _stderr fh2 = open(logfn) txt = fh2.read().strip() fh2.close() reg_exp = re.compile(r"INFO\s+\S+.%s.%s\s+\S+\s+%s" % (name, '_stream_stdouterr', msg)) match = reg_exp.search(txt) is not None self.assertEqual(match, expect_match) try: os.remove(logfn) except: pass
def main():
    """Main: delete EasyBuild-generated gists selected by the cli options (--all / --closed-pr / --orphans)."""
    fancylogger.logToScreen(enable=True, stdout=True)
    fancylogger.setLogLevelInfo()

    options = {
        'github-user': ('Your github username to use', None, 'store', None, 'g'),
        'closed-pr': ('Delete all gists from closed pull-requests', None, 'store_true', True, 'p'),
        'all': ('Delete all gists from Easybuild ', None, 'store_true', False, 'a'),
        'orphans': ('Delete all gists without a pull-request', None, 'store_true', False, 'o'),
    }

    go = simple_option(options)
    log = go.log

    # at least one selection mode must be given
    if not (go.options.all or go.options.closed_pr or go.options.orphans):
        raise EasyBuildError("Please tell me what to do?")

    if go.options.github_user is None:
        # fall back to the username from the EasyBuild configuration
        eb_go = EasyBuildOptions(envvar_prefix='EASYBUILD', go_args=[])
        username = eb_go.options.github_user
        log.debug("Fetch github username from easybuild, found: %s", username)
    else:
        username = go.options.github_user

    if username is None:
        raise EasyBuildError("Could not find a github username")
    else:
        log.info("Using username = %s", username)

    token = fetch_github_token(username)

    gh = RestClient(GITHUB_API_URL, username=username, token=token)
    # ToDo: add support for pagination
    status, gists = gh.gists.get(per_page=100)

    if status != HTTP_STATUS_OK:
        raise EasyBuildError("Failed to get a lists of gists for user %s: error code %s, message = %s",
                             username, status, gists)
    else:
        log.info("Found %s gists", len(gists))

    # matches gist descriptions written by EasyBuild, optionally capturing a PR number
    regex = re.compile(r"(EasyBuild test report|EasyBuild log for failed build).*?(?:PR #(?P<PR>[0-9]+))?\)?$")

    pr_cache = {}  # PR number -> PR state, to avoid refetching the same pull-request
    num_deleted = 0

    for gist in gists:
        if not gist["description"]:
            continue
        re_pr_num = regex.search(gist["description"])
        delete_gist = False

        if re_pr_num:
            log.debug("Found a Easybuild gist (id=%s)", gist["id"])
            pr_num = re_pr_num.group("PR")
            if go.options.all:
                delete_gist = True
            elif pr_num and go.options.closed_pr:
                log.debug("Found Easybuild test report for PR #%s", pr_num)
                if pr_num not in pr_cache:
                    status, pr = gh.repos[GITHUB_EB_MAIN][GITHUB_EASYCONFIGS_REPO].pulls[pr_num].get()
                    if status != HTTP_STATUS_OK:
                        raise EasyBuildError("Failed to get pull-request #%s: error code %s, message = %s",
                                             pr_num, status, pr)
                    pr_cache[pr_num] = pr["state"]
                if pr_cache[pr_num] == "closed":
                    log.debug("Found report from closed PR #%s (id=%s)", pr_num, gist["id"])
                    delete_gist = True
            elif not pr_num and go.options.orphans:
                log.debug("Found Easybuild test report without PR (id=%s)", gist["id"])
                delete_gist = True

        if delete_gist:
            status, del_gist = gh.gists[gist["id"]].delete()
            if status != HTTP_DELETE_OK:
                raise EasyBuildError("Unable to remove gist (id=%s): error code %s, message = %s",
                                     gist["id"], status, del_gist)
            else:
                log.info("Delete gist with id=%s", gist["id"])
                num_deleted += 1

    log.info("Deleted %s gists", num_deleted)
dest="path", help="Specify a path inside the repo (default easybuild/easyconfigs).") parser.add_option("-l", "--local", action="store_true", dest="local", help="Use a local path, not on github.com (Default false)") options, args = parser.parse_args() # get and configure logger log = fancylogger.getLogger(__name__) if options.verbose == 1: fancylogger.setLogLevelWarning() elif options.verbose == 2: fancylogger.setLogLevelInfo() elif options.verbose >= 3: fancylogger.setLogLevelDebug() if options.quiet: fancylogger.logToScreen(False) else: fancylogger.logToScreen(True) # other options if not options.branch: options.branch = "develop" if not options.username: options.username = "******" if not options.repo: options.repo = "easybuild-easyconfigs"
from vsc.administration.user import cluster_user_pickle_location_map, cluster_user_pickle_store_map
from vsc.accountpage.client import AccountpageClient
from vsc.config.base import VscStorage
from vsc.filesystem.gpfs import GpfsOperations
from vsc.jobs.moab.checkjob import SshCheckjob, CheckjobInfo
from vsc.utils import fancylogger
from vsc.utils.fs_store import store_on_gpfs
from vsc.utils.nagios import NAGIOS_EXIT_CRITICAL
from vsc.utils.script_tools import ExtendedSimpleOption

# Constants
NAGIOS_CHECK_INTERVAL_THRESHOLD = 30 * 60  # 30 minutes

logger = fancylogger.getLogger(__name__)
fancylogger.logToScreen(True)
fancylogger.setLogLevelInfo()

STORE_LIMIT_CRITICAL = 5

# FIXME: common
def get_pickle_path(location, user_id, rest_client):
    """Determine the path (directory) where the pickle file with the queue information should be stored.

    @type location: string
    @type user_id: string

    @param location: indication of the user accessible storage spot to use, e.g., home or scratch
    @param user_id: VSC user ID
    @param rest_client: VscAccountpageClient instance

    @returns: tuple of (string representing the directory where the pickle file should be stored,
next_idx = (idx + 1) % len(recvbuf) if recvbuf[idx]['hostname'] == recvbuf[next_idx]['hostname']: if not recvbuf[idx]['affinity'][-1] == recvbuf[next_idx]['affinity'][0] - 1: log.error("No nn on same node for rank %s (aff %s) and next rank %s (aff %s)" % (idx, recvbuf[idx]['affinity'], next_idx, recvbuf[next_idx]['affinity'])) else: if not recvbuf[next_idx]['affinity'][0] == 0: log.error("No nn on different nodes for rank %s (hn %s aff %s) and next rank %s (hn %s aff %s)" % (idx, recvbuf[idx]['hostname'], recvbuf[idx]['affinity'], next_idx, recvbuf[next_idx]['hostname'], recvbuf[next_idx]['affinity']) ) if __name__ == '__main__': log = getLogger('mympisanity') setLogLevelInfo() if MPI4PY_EXCEPTION: log.error("No mpi4py found: %s", MPI4PY_EXCEPTION) sys.exit(1) log.info("mympisanity started") comm = MPI.COMM_WORLD # gather the info from all processes recvbuf = comm.gather(Report(), 0) log.info("mympisanity gather report finished")
def main():
    """Main: delete EasyBuild-generated gists selected by the cli options (--all / --closed-pr / --orphans)."""
    fancylogger.logToScreen(enable=True, stdout=True)
    fancylogger.setLogLevelInfo()

    options = {
        'github-user': ('Your github username to use', None, 'store', None, 'g'),
        'closed-pr': ('Delete all gists from closed pull-requests', None, 'store_true', True, 'p'),
        'all': ('Delete all gists from Easybuild ', None, 'store_true', False, 'a'),
        'orphans': ('Delete all gists without a pull-request', None, 'store_true', False, 'o'),
    }

    go = simple_option(options)
    log = go.log

    if not (go.options.all or go.options.closed_pr or go.options.orphans):
        # NOTE(review): log.error does not abort execution with a plain logger, so the
        # script continues below even when no mode was selected -- confirm fancylogger
        # is configured to raise on error, otherwise this should raise/exit explicitly
        log.error("Please tell me what to do?")

    if go.options.github_user is None:
        # fall back to the username from the EasyBuild configuration
        eb_go = EasyBuildOptions(envvar_prefix='EASYBUILD', go_args=[])
        username = eb_go.options.github_user
        log.debug("Fetch github username from easybuild, found: %s", username)
    else:
        username = go.options.github_user

    if username is None:
        # NOTE(review): same caveat -- execution would proceed with username=None
        # into fetch_github_token() unless log.error raises
        log.error("Could not find a github username")
    else:
        log.info("Using username = %s", username)

    token = fetch_github_token(username)

    gh = RestClient(GITHUB_API_URL, username=username, token=token)
    # ToDo: add support for pagination
    status, gists = gh.gists.get(per_page=100)

    if status != HTTP_STATUS_OK:
        log.error("Failed to get a lists of gists for user %s: error code %s, message = %s",
                  username, status, gists)
    else:
        log.info("Found %s gists", len(gists))

    # matches gist descriptions written by EasyBuild, optionally capturing a PR number
    regex = re.compile(r"(EasyBuild test report|EasyBuild log for failed build).*?(?:PR #(?P<PR>[0-9]+))?\)?$")

    pr_cache = {}  # PR number -> PR state, to avoid refetching the same pull-request
    num_deleted = 0

    for gist in gists:
        if not gist["description"]:
            continue
        re_pr_num = regex.search(gist["description"])
        delete_gist = False

        if re_pr_num:
            log.debug("Found a Easybuild gist (id=%s)", gist["id"])
            pr_num = re_pr_num.group("PR")
            if go.options.all:
                delete_gist = True
            elif pr_num and go.options.closed_pr:
                log.debug("Found Easybuild test report for PR #%s", pr_num)
                if pr_num not in pr_cache:
                    status, pr = gh.repos[GITHUB_EB_MAIN][GITHUB_EASYCONFIGS_REPO].pulls[pr_num].get()
                    if status != HTTP_STATUS_OK:
                        log.error("Failed to get pull-request #%s: error code %s, message = %s", pr_num, status, pr)
                    pr_cache[pr_num] = pr["state"]
                if pr_cache[pr_num] == "closed":
                    log.debug("Found report from closed PR #%s (id=%s)", pr_num, gist["id"])
                    delete_gist = True
            elif not pr_num and go.options.orphans:
                log.debug("Found Easybuild test report without PR (id=%s)", gist["id"])
                delete_gist = True

        if delete_gist:
            status, del_gist = gh.gists[gist["id"]].delete()
            if status != HTTP_DELETE_OK:
                log.error("Unable to remove gist (id=%s): error code %s, message = %s",
                          gist["id"], status, del_gist)
            else:
                log.info("Delete gist with id=%s", gist["id"])
                num_deleted += 1

    log.info("Deleted %s gists", num_deleted)
def main(): """Main""" options = { 'nagios': ('Report in nagios format', None, 'store_true', False, 'n'), 'regex': ('Filter on regexp, data for first match', None, 'regex', None, 'r'), 'allregex': ('Combined with --regex/-r, return all data', None, 'store_true', False, 'A'), 'anystate': ('Matches any state (eg down_on_error node will also list as error)', None, 'store_true', False, 'a'), 'down': ('Down nodes', None, 'store_true', False, 'D'), 'downonerror': ('Down on error nodes', None, 'store_true', False, 'E'), 'offline': ('Offline nodes', None, 'store_true', False, 'o'), 'partial': ('Partial nodes (one or more running job(s), jobslot(s) available)', None, 'store_true', False, 'p'), 'job-exclusive': ('Job-exclusive nodes (no jobslots available)', None, 'store_true', False, 'x'), 'free': ('Free nodes (0 or more running jobs, jobslot(s) available)', None, 'store_true', False, 'f'), 'unknown': ('State unknown nodes', None, 'store_true', False, 'u'), 'bad': ('Bad nodes (broken jobregex)', None, 'store_true', False, 'b'), 'error': ('Error nodes', None, 'store_true', False, 'e'), 'idle': ('Idle nodes (No running jobs, jobslot(s) available)', None, 'store_true', False, 'i'), 'singlenodeinfo': (('Single (most-frequent) node information in key=value format' '(no combination with other options)'), None, 'store_true', False, 'I'), 'reportnodeinfo': ('Report node information (no combination with other options)', None, 'store_true', False, 'R'), 'moab': ('Use moab information (mdiag -n)', None, 'store_true', False, 'm'), 'moabxml': ('Use xml moab data from file (for testing)', None, 'store', None), 'shorthost': ('Return (short) hostname', None, 'store_true', False, 's'), 'invert': ('Return inverted selection', None, 'store_true', False, 'v'), } go = simple_option(options) if go.options.nagios and not go.options.debug: fancylogger.logToDevLog(enable=True) fancylogger.logToScreen(enable=False) fancylogger.setLogLevelInfo() all_states = ND_NAGIOS_CRITICAL + ND_NAGIOS_WARNING + 
ND_NAGIOS_OK report_states = [] if go.options.down: report_states.append(ND_down) if go.options.downonerror: report_states.append(ND_down_on_error) if go.options.offline: report_states.append(ND_offline) if go.options.free: report_states.append(ND_free) if go.options.partial: report_states.append(ND_free_and_job) if go.options.job_exclusive: report_states.append(ND_job_exclusive) if go.options.unknown: report_states.append(ND_state_unknown) if go.options.bad: report_states.append(ND_bad) if go.options.error: report_states.append(ND_error) if go.options.idle: report_states.append(ND_idle) if len(report_states) == 0: report_states = all_states if go.options.singlenodeinfo or go.options.reportnodeinfo: nodeinfo = collect_nodeinfo()[2] if len(nodeinfo) == 0: _log.error('No nodeinfo found') sys.exit(1) ordered = sorted(nodeinfo.items(), key=lambda x: len(x[1]), reverse=True) if go.options.singlenodeinfo: if len(nodeinfo) > 1: msg = "Not all nodes have same parameters. Using most frequent ones." 
if go.options.reportnodeinfo: _log.warning(msg) else: _log.error(msg) # usage: export `./show_nodes -I` ; env |grep SHOWNODES_ most_freq = ordered[0][0] msg = [] msg.append("SHOWNODES_PPN=%d" % most_freq[0]) msg.append("SHOWNODES_PHYSMEMMB=%d" % (most_freq[1] * 1024)) else: msg = [] for info, nodes in ordered: txt = "%d nodes with %d cores, %s MB physmem, %s GB swap and %s GB local disk" % ( len(nodes), info[0], info[1] * 1024, info[2], info[3]) msg.append(txt) # print and _log are dumped to stdout at different moment, repeat the txt in the debug log _log.debug("Found %s with matching nodes: %s" % (txt, nodes)) print "\n".join(msg) sys.exit(0) if go.options.moab: if go.options.moabxml: try: moabxml = open(go.options.moabxml).read() except (OSError, IOError): _log.error('Failed to read moab xml from %s' % go.options.moabxml) else: moabxml = None nodes_dict = moab_get_nodes_dict(xml=moabxml) nodes = get_nodes(nodes_dict) else: nodes = get_nodes() nagiosexit = { NDNAG_WARNING: warning_exit, NDNAG_CRITICAL: critical_exit, NDNAG_OK: ok_exit, } nagios_res = {} detailed_res = {} nodes_found = [] all_nodes = [] for name, full_state in nodes: all_nodes.append(name) if go.options.regex and not go.options.regex.search(name): continue nagios_state = full_state['derived']['nagiosstate'] if nagios_state not in nagios_res: nagios_res[nagios_state] = [] state = full_state['derived']['state'] states = full_state['derived']['states'] if state == ND_free and ND_idle in states: state = ND_idle # special case for idle if state not in detailed_res: detailed_res[state] = [] if go.options.anystate: states_to_check = states else: states_to_check = [state] # filter the allowed states if any(x for x in states_to_check if x in report_states): nagios_res[nagios_state].append(states) detailed_res[state].append(states) nodes_found.append(name) if go.options.regex and not go.options.allregex: break if go.options.invert: nodes_found = [x for x in all_nodes if x not in nodes_found] if 
go.options.regex and not go.options.allregex: # there should only be one node nagios_state, all_states = nagios_res.items()[0] states = all_states[0] if go.options.nagios: msg = "show_nodes - %s" % ",".join(states) nagiosexit[nagios_state](msg) else: txt = "%s %s" % (nagios_state, ",".join(states)) print txt else: if go.options.nagios: msg = NagiosResult('show_nodes') txt = [] total = 0 for state in all_states: if state in detailed_res: nr = len(detailed_res[state]) else: nr = 0 total += nr setattr(msg, state, nr) msg.total = total reported_state = [str(NDNAG_OK), ''] if ND_bad in detailed_res: reported_state[0] = NDNAG_CRITICAL msg.message += ' - %s bad nodes' % (len(detailed_res[ND_bad])) nagiosexit[reported_state[0]](msg) else: # just print the nodes if go.options.shorthost: nodes_found = [x.split('.')[0] for x in nodes_found] print ' '.join(nodes_found)
def main(): """Main""" options = { 'nagios': ('Report in nagios format', None, 'store_true', False, 'n'), 'regex': ('Filter on regexp, data for first match', None, 'regex', None, 'r'), 'allregex': ('Combined with --regex/-r, return all data', None, 'store_true', False, 'A'), 'anystate': ('Matches any state (eg down_on_error node will also list as error)', None, 'store_true', False, 'a'), 'down': ('Down nodes', None, 'store_true', False, 'D'), 'downonerror': ('Down on error nodes', None, 'store_true', False, 'E'), 'offline': ('Offline nodes', None, 'store_true', False, 'o'), 'offline_idle': ('Offline idle nodes', None, 'store_true', False, 'O'), 'partial': ('Partial nodes (one or more running job(s), jobslot(s) available)', None, 'store_true', False, 'p'), 'job-exclusive': ('Job-exclusive nodes (no jobslots available)', None, 'store_true', False, 'x'), 'free': ('Free nodes (0 or more running jobs, jobslot(s) available)', None, 'store_true', False, 'f'), 'unknown': ('State unknown nodes', None, 'store_true', False, 'u'), 'bad': ('Bad nodes (broken jobregex)', None, 'store_true', False, 'b'), 'error': ('Error nodes', None, 'store_true', False, 'e'), 'idle': ('Idle nodes (No running jobs, jobslot(s) available)', None, 'store_true', False, 'i'), 'singlenodeinfo': (('Single (most-frequent) node information in key=value format' '(no combination with other options)'), None, 'store_true', False, 'I'), 'reportnodeinfo': ('Report node information (no combination with other options)', None, 'store_true', False, 'R'), 'moab': ('Use moab information (mdiag -n)', None, 'store_true', False, 'm'), 'moabxml': ('Use xml moab data from file (for testing)', None, 'store', None), 'shorthost': ('Return (short) hostname', None, 'store_true', False, 's'), 'invert': ('Return inverted selection', None, 'store_true', False, 'v'), } go = simple_option(options) if go.options.nagios and not go.options.debug: fancylogger.logToDevLog(enable=True) fancylogger.logToScreen(enable=False) 
fancylogger.setLogLevelInfo() all_states = ND_NAGIOS_CRITICAL + ND_NAGIOS_WARNING + ND_NAGIOS_OK report_states = [] if go.options.down: report_states.append(ND_down) if go.options.downonerror: report_states.append(ND_down_on_error) if go.options.offline: report_states.append(ND_offline) if go.options.free: report_states.append(ND_free) if go.options.partial: report_states.append(ND_free_and_job) if go.options.job_exclusive: report_states.append(ND_job_exclusive) if go.options.unknown: report_states.append(ND_state_unknown) if go.options.bad: report_states.append(ND_bad) if go.options.error: report_states.append(ND_error) if go.options.idle: report_states.append(ND_idle) if go.options.offline_idle: report_states.append(ND_offline_idle) if len(report_states) == 0: report_states = all_states if go.options.singlenodeinfo or go.options.reportnodeinfo: nodeinfo = collect_nodeinfo()[2] if len(nodeinfo) == 0: _log.error('No nodeinfo found') sys.exit(1) ordered = sorted(nodeinfo.items(), key=lambda x: len(x[1]), reverse=True) if go.options.singlenodeinfo: if len(nodeinfo) > 1: msg = "Not all nodes have same parameters. Using most frequent ones." 
if go.options.reportnodeinfo: _log.warning(msg) else: _log.error(msg) # usage: export `./show_nodes -I` ; env |grep SHOWNODES_ most_freq = ordered[0][0] msg = [] msg.append("SHOWNODES_PPN=%d" % most_freq[0]) msg.append("SHOWNODES_PHYSMEMMB=%d" % (most_freq[1] * 1024)) else: msg = [] for info, nodes in ordered: txt = "%d nodes with %d cores, %s MB physmem, %s GB swap and %s GB local disk" % ( len(nodes), info[0], info[1] * 1024, info[2], info[3]) msg.append(txt) # print and _log are dumped to stdout at different moment, repeat the txt in the debug log _log.debug("Found %s with matching nodes: %s" % (txt, nodes)) print "\n".join(msg) sys.exit(0) if go.options.moab: if go.options.moabxml: try: moabxml = open(go.options.moabxml).read() except (OSError, IOError): _log.error('Failed to read moab xml from %s' % go.options.moabxml) else: moabxml = None nodes_dict = moab_get_nodes_dict(xml=moabxml) nodes = get_nodes(nodes_dict) else: nodes = get_nodes() nagiosexit = { NDNAG_WARNING: warning_exit, NDNAG_CRITICAL: critical_exit, NDNAG_OK: ok_exit, } nagios_res = {} detailed_res = {} nodes_found = [] all_nodes = [] for name, full_state in nodes: all_nodes.append(name) if go.options.regex and not go.options.regex.search(name): continue nagios_state = full_state['derived']['nagiosstate'] if nagios_state not in nagios_res: nagios_res[nagios_state] = [] state = full_state['derived']['state'] states = full_state['derived']['states'] if state == ND_free and ND_idle in states: state = ND_idle # special case for idle if state == ND_offline and ND_idle in states: state = ND_offline_idle if state not in detailed_res: detailed_res[state] = [] if go.options.anystate: states_to_check = states else: states_to_check = [state] # filter the allowed states if any(x for x in states_to_check if x in report_states): nagios_res[nagios_state].append(states) detailed_res[state].append(states) nodes_found.append(name) if go.options.regex and not go.options.allregex: break if go.options.invert: 
nodes_found = [x for x in all_nodes if x not in nodes_found] if go.options.regex and not go.options.allregex: # there should only be one node nagios_state, all_states = nagios_res.items()[0] states = all_states[0] if go.options.nagios: msg = "show_nodes - %s" % ",".join(states) nagiosexit[nagios_state](msg) else: txt = "%s %s" % (nagios_state, ",".join(states)) print txt else: if go.options.nagios: msg = NagiosResult('show_nodes') txt = [] total = 0 for state in all_states: if state in detailed_res: nr = len(detailed_res[state]) else: nr = 0 total += nr setattr(msg, state, nr) msg.total = total reported_state = [str(NDNAG_OK), ''] if ND_bad in detailed_res: reported_state[0] = NDNAG_CRITICAL msg.message += ' - %s bad nodes' % (len(detailed_res[ND_bad])) nagiosexit[reported_state[0]](msg) else: # just print the nodes if go.options.shorthost: nodes_found = [x.split('.')[0] for x in nodes_found] print ' '.join(nodes_found)
next_idx = (idx + 1) % len(recvbuf) if recvbuf[idx]['hostname'] == recvbuf[next_idx]['hostname']: if not recvbuf[idx]['affinity'][ -1] == recvbuf[next_idx]['affinity'][0] - 1: log.error( "No nn on same node for rank %s (aff %s) and next rank %s (aff %s)" % (idx, recvbuf[idx]['affinity'], next_idx, recvbuf[next_idx]['affinity'])) else: if not recvbuf[next_idx]['affinity'][0] == 0: log.error( "No nn on different nodes for rank %s (hn %s aff %s) and next rank %s (hn %s aff %s)" % (idx, recvbuf[idx]['hostname'], recvbuf[idx]['affinity'], next_idx, recvbuf[next_idx]['hostname'], recvbuf[next_idx]['affinity'])) if __name__ == '__main__': log = getLogger('mympisanity') setLogLevelInfo() log.info("mympisanity started") comm = MPI.COMM_WORLD ## gather the info from all processes recvbuf = comm.gather(Report(), 0) log.info("mympisanity gather report finished") if comm.rank == 0: check()
def get_mpi_and_sched_and_options():
    """
    Selects mpi flavor and scheduler based on environment and arguments.

    @return: A triplet containing the chosen mpi flavor, chosen scheduler and the MympirunOption class,
             or None when only informational output was requested (--showmpi/--showsched) or no
             mpi script was provided.
    """
    scriptname = os.path.basename(os.path.abspath(sys.argv[0]))

    # if the scriptname is 'mpirun', it means that mympirun was called through the faked mpirun path
    isfake = scriptname == 'mpirun'

    # init generaloption with the various mpirun cli options
    optionparser = MympirunOption(ismpirun=isfake)

    # see if an mpi flavor was explicitly chosen as a command line argument. If not, just use the mpirun that was called
    # We are using sys.argv because generaloption depends on the returned scriptname
    if optionparser.options.setmpi:
        setmpi = optionparser.options.setmpi
        optionparser.log.debug("mympirun has been forced to use %s as MPI flavor", setmpi)
    else:
        setmpi = sys.argv[0]
        optionparser.log.debug("mympirun will be executed by %s", setmpi)

    scriptname, mpi, found_mpi = mpim.what_mpi(setmpi)
    found_mpi_names = [x.__name__ for x in found_mpi]

    if optionparser.options.showmpi:
        # informational mode: list the detected MPI classes and stop
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found MPI classes %s", ", ".join(found_mpi_names))
        return

    # Select a Scheduler from the available schedulers
    sched, found_sched = schedm.what_sched(
        getattr(optionparser.options, 'setsched', None))
    found_sched_names = [x.__name__ for x in found_sched]

    if optionparser.options.showsched:
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found Sched classes %s", ", ".join(found_sched_names))
        return

    if mpi is None:
        optionparser.log.raiseException((
            "No MPI class found that supports scriptname %s; isfake %s). Please use "
            "mympirun through one of the direct calls or make sure the mpirun command can"
            " be found. Found MPI %s") % (scriptname, isfake, ", ".join(found_mpi_names)))
    else:
        optionparser.log.debug("Found MPI class %s (scriptname %s; isfake %s)",
                               mpi.__name__, scriptname, isfake)

    if sched is None:
        # BUG FIX: raiseException() does not take lazy %-style arguments the way debug()/info()
        # do -- its second positional parameter is the exception class -- so passing the values
        # as extra arguments raised a TypeError instead of the intended error message.
        # Format the message up front, consistent with the mpi-is-None branch above.
        optionparser.log.raiseException(
            "No sched class found (options.setsched %s ; found Sched classes %s)" %
            (optionparser.options.setsched, ", ".join(found_sched_names)))
    else:
        optionparser.log.debug(
            "Found sched class %s from options.setsched %s (all Sched found %s)",
            sched.__name__, optionparser.options.setsched, ", ".join(found_sched_names))

    if not optionparser.args:
        optionparser.log.warn("no mpi script provided")
        return

    return mpi, sched, optionparser