Example #1
def get_mpi_and_sched_and_options():
    """
    Selects mpi flavor and scheduler based on environment and arguments.

    @return: A triplet containing the chosen mpi flavor, chosen scheduler and the MympirunOption class.
    """

    scriptname = os.path.basename(os.path.abspath(sys.argv[0]))
    # if the scriptname is 'mpirun', it means that mympirun was called through the faked mpirun path
    isfake = scriptname == 'mpirun'

    # init generaloption with the various mpirun cli options
    optionparser = MympirunOption(ismpirun=isfake)

    # see if an mpi flavor was explicitly chosen as a command line argument. If not, just use the mpirun that was called
    # We are using sys.argv because generaloption depends on the returned scriptname
    if optionparser.options.setmpi:
        setmpi = optionparser.options.setmpi
        optionparser.log.debug("mympirun has been forced to use %s as MPI flavor", setmpi)
    else:
        setmpi = sys.argv[0]
        optionparser.log.debug("mympirun will be executed by %s", setmpi)

    scriptname, mpi, found_mpi = mpim.what_mpi(setmpi)
    found_mpi_names = [x.__name__ for x in found_mpi]

    if optionparser.options.showmpi:
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found MPI classes %s", ", ".join(found_mpi_names))
        return

    # Select a Scheduler from the available schedulers
    sched, found_sched = schedm.what_sched(getattr(optionparser.options, 'setsched', None))
    found_sched_names = [x.__name__ for x in found_sched]

    if optionparser.options.showsched:
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found Sched classes %s", ", ".join(found_sched_names))
        return

    if mpi is None:
        optionparser.log.raiseException(("No MPI class found that supports scriptname %s; isfake %s). Please use "
                                         "mympirun through one of the direct calls or make sure the mpirun command can"
                                         " be found. Found MPI %s") %
                                        (scriptname, isfake, ", ".join(found_mpi_names)))
    else:
        optionparser.log.debug("Found MPI class %s (scriptname %s; isfake %s)", mpi.__name__, scriptname, isfake)

    if sched is None:
        optionparser.log.raiseException("No sched class found (options.setsched %s ; found Sched classes %s)",
                                        optionparser.options.setsched, ", ".join(found_sched_names))
    else:
        optionparser.log.debug("Found sched class %s from options.setsched %s (all Sched found %s)",
                               sched.__name__, optionparser.options.setsched, ", ".join(found_sched_names))

    if not optionparser.args:
        optionparser.log.warn("no mpi script provided")
        return

    return mpi, sched, optionparser
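
A minimal sketch (not the actual vsc-mympirun entry point) of how the triplet returned above could be consumed; the main() name, the Coupler class and the instance.main() call are assumptions, modelled on the mixin pattern shown in Example #3 below.

import sys

def main():
    result = get_mpi_and_sched_and_options()
    if result is None:
        # showmpi/showsched was requested, or no MPI script was provided
        sys.exit(0)
    mpi, sched, optionparser = result

    class Coupler(mpi, sched):
        """Temporary class coupling the chosen MPI flavor and scheduler (cf. Example #3)."""

    instance = Coupler(options=optionparser.options, cmdargs=optionparser.args)
    instance.main()  # assumed entry point on the coupled instance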
Example #2
def get_mpi_and_sched_and_options():
    """Parses the mpi and scheduler based on current environment and guesses the best one to use"""
    scriptname, mpi, found_mpi = whatMPI(sys.argv[0])

    ismpirun = scriptname == 'mpirun'

    mo = MympirunOption(ismpirun=ismpirun)

    if mo.args is None or len(mo.args) == 0:
        mo.parser.print_shorthelp()
        raise ExitException("Exit no args provided")

    sched, found_sched = whatSched(getattr(mo.options, 'schedtype', None))

    found_mpi_names = [x.__name__ for x in found_mpi]
    found_sched_names = [x.__name__ for x in found_sched]

    if mo.options.showmpi:
        fancylogger.setLogLevelInfo()
        _logger.info("Found MPI classes %s" % (", ".join(found_mpi_names)))
        raise ExitException("Exit from showmpi")

    if mo.options.showsched:
        fancylogger.setLogLevelInfo()
        _logger.info("Found Sched classes %s" % (", ".join(found_sched_names)))
        raise ExitException("Exit from showsched")

    if mpi is None:
        mo.parser.print_shorthelp()
        mo.log.raiseException((
            "No MPI class found (scriptname %s; ismpirun %s). Please use mympirun through one "
            "of the direct calls or make sure the mpirun command can be found. "
            "Found MPI %s") % (scriptname, ismpirun,
                               ", ".join(found_mpi_names)))
    else:
        mo.log.debug("Found MPI class %s (scriptname %s; ismpirun %s)" %
                     (mpi.__name__, scriptname, ismpirun))

    if sched is None:
        mo.log.raiseException(
            "No sched class found (options.schedtype %s ; found Sched classes %s)"
            % (mo.options.schedtype, ", ".join(found_sched_names)))
    else:
        mo.log.debug(
            "Found sched class %s from options.schedtype %s (all Sched found %s)"
            % (sched.__name__, mo.options.schedtype,
               ", ".join(found_sched_names)))

    return mpi, sched, mo
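
ExitException is raised in Examples #2-#4 for clean, expected exits but is not defined in these snippets; a minimal stand-in, assuming the real class adds no behaviour beyond a plain exception:

class ExitException(Exception):
    """Raised to signal a normal, early exit (showmpi, showsched, no args)."""
    pass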
Example #3
def getInstance():
    """Make an instance of the relevant MPI class. Also set the RM instance"""
    scriptname, mpi, found_mpi = whatMPI(sys.argv[0])

    ismpirun = scriptname == 'mpirun'

    mo = MympirunOption(ismpirun=ismpirun)

    if mo.args is None or len(mo.args) == 0:
        mo.parser.print_shorthelp()
        raise ExitException("Exit no args provided")

    sched, found_sched = whatSched(getattr(mo.options, 'schedtype', None))

    found_mpi_names = [x.__name__ for x in found_mpi]
    found_sched_names = [x.__name__ for x in found_sched]

    if mo.options.showmpi:
        setLogLevelInfo()
        _logger.info("Found MPI classes %s" % (", ".join(found_mpi_names)))
        raise ExitException("Exit from showmpi")

    if mo.options.showsched:
        setLogLevelInfo()
        _logger.info("Found Sched classes %s" % (", ".join(found_sched_names)))
        raise ExitException("Exit from showsched")

    if mpi is None:
        mo.parser.print_shorthelp()
        mo.log.raiseException(("No MPI class found (scriptname %s; ismpirun %s). Please use mympirun through one "
                               "of the direct calls or make sure the mpirun command can be found. "
                               "Found MPI %s") % (scriptname, ismpirun, ", ".join(found_mpi_names)))
    else:
        mo.log.debug("Found MPI class %s (scriptname %s; ismpirun %s)" % (mpi.__name__, scriptname, ismpirun))

    if sched is None:
        mo.log.raiseException("No sched class found (options.schedtype %s ; found Sched classes %s)" %
                              (mo.options.schedtype, ", ".join(found_sched_names)))
    else:
        mo.log.debug("Found sched class %s from options.schedtype %s (all Sched found %s)" %
                     (sched.__name__, mo.options.schedtype, ", ".join(found_sched_names)))

    class M(mpi, sched):
        """Temporary class to couple MPI and local sched"""
        def __init__(self, **kwargs):
            self.log = getLogger("%s_%s" % (mpi.__name__, sched.__name__))
            super(M, self).__init__(**kwargs)

    return M(options=mo.options, cmdargs=mo.args)
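
The class M above is built at runtime from whichever MPI and scheduler classes were detected. A standalone illustration of that dynamic multiple-inheritance pattern, with made-up Alpha/Beta classes standing in for the real MPI and Sched classes:

class Alpha(object):
    def __init__(self, **kwargs):
        self.alpha = True
        super(Alpha, self).__init__(**kwargs)

class Beta(object):
    def __init__(self, **kwargs):
        self.beta = True
        super(Beta, self).__init__(**kwargs)

def couple(left, right):
    """Return a temporary class coupling two cooperating base classes."""
    class Coupled(left, right):
        pass
    return Coupled

inst = couple(Alpha, Beta)()
assert inst.alpha and inst.beta  # both __init__ methods ran via cooperative super()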
Example #4
def get_mpi_and_sched_and_options():
    """Parses the mpi and scheduler based on current environment and guesses the best one to use"""
    scriptname, mpi, found_mpi = whatMPI(sys.argv[0])

    ismpirun = scriptname == 'mpirun'

    mo = MympirunOption(ismpirun=ismpirun)

    if mo.args is None or len(mo.args) == 0:
        mo.parser.print_shorthelp()
        raise ExitException("Exit no args provided")

    sched, found_sched = whatSched(getattr(mo.options, 'schedtype', None))

    found_mpi_names = [x.__name__ for x in found_mpi]
    found_sched_names = [x.__name__ for x in found_sched]

    if mo.options.showmpi:
        fancylogger.setLogLevelInfo()
        _logger.info("Found MPI classes %s" % (", ".join(found_mpi_names)))
        raise ExitException("Exit from showmpi")

    if mo.options.showsched:
        fancylogger.setLogLevelInfo()
        _logger.info("Found Sched classes %s" % (", ".join(found_sched_names)))
        raise ExitException("Exit from showsched")

    if mpi is None:
        mo.parser.print_shorthelp()
        mo.log.raiseException(("No MPI class found (scriptname %s; ismpirun %s). Please use mympirun through one "
                               "of the direct calls or make sure the mpirun command can be found. "
                               "Found MPI %s") % (scriptname, ismpirun, ", ".join(found_mpi_names)))
    else:
        mo.log.debug("Found MPI class %s (scriptname %s; ismpirun %s)" % (mpi.__name__, scriptname, ismpirun))

    if sched is None:
        mo.log.raiseException("No sched class found (options.schedtype %s ; found Sched classes %s)" %
                              (mo.options.schedtype, ", ".join(found_sched_names)))
    else:
        mo.log.debug("Found sched class %s from options.schedtype %s (all Sched found %s)" %
                     (sched.__name__, mo.options.schedtype, ", ".join(found_sched_names)))

    return mpi, sched, mo
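
A hypothetical caller for the variant above, showing how the ExitException-based flow is meant to be consumed (the main() name is an assumption):

import sys

def main():
    try:
        mpi, sched, mo = get_mpi_and_sched_and_options()
    except ExitException as err:
        # expected early exit: showmpi, showsched or no arguments
        print err
        sys.exit(0)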
Example #5
    def _stream_stdouterr(self, isstdout=True, expect_match=True):
        """Log to stdout or stderror, check stdout or stderror"""
        fd, logfn = tempfile.mkstemp()
        # fh will be checked
        fh = os.fdopen(fd, 'w')

        _stdout = sys.stdout
        _stderr = sys.stderr

        if isstdout == expect_match:
            sys.stdout = fh
            sys.stderr = open(os.devnull, 'w')
        else:
            sys.stdout = open(os.devnull, 'w')
            sys.stderr = fh

        fancylogger.setLogLevelInfo()
        name = 'test_stream_stdout'
        lh = fancylogger.logToScreen(stdout=isstdout)
        logger = fancylogger.getLogger(name, fname=True, clsname=False)
        # logfn makes it unique
        msg = 'TEST isstdout %s expect_match %s logfn %s' % (
            isstdout, expect_match, logfn)
        logger.info(msg)

        # restore
        fancylogger.logToScreen(enable=False, handler=lh)
        sys.stdout = _stdout
        sys.stderr = _stderr

        fh2 = open(logfn)
        txt = fh2.read().strip()
        fh2.close()
        reg_exp = re.compile(r"INFO\s+\S+.%s.%s\s+\S+\s+%s" %
                             (name, '_stream_stdouterr', msg))
        match = reg_exp.search(txt) is not None
        self.assertEqual(match, expect_match)

        try:
            os.remove(logfn)
        except:
            pass
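
Hypothetical test methods driving the helper above (the method names are made up; the real test suite may wire this up differently):

    def test_stream_stdout(self):
        self._stream_stdouterr(isstdout=True, expect_match=True)
        self._stream_stdouterr(isstdout=True, expect_match=False)

    def test_stream_stderr(self):
        self._stream_stdouterr(isstdout=False, expect_match=True)
        self._stream_stdouterr(isstdout=False, expect_match=False)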
Example #6
    def _stream_stdouterr(self, isstdout=True, expect_match=True):
        """Log to stdout or stderror, check stdout or stderror"""
        fd, logfn = tempfile.mkstemp()
        # fh will be checked
        fh = os.fdopen(fd, 'w')

        _stdout = sys.stdout
        _stderr = sys.stderr

        if isstdout == expect_match:
            sys.stdout = fh
            sys.stderr = open(os.devnull, 'w')
        else:
            sys.stdout = open(os.devnull, 'w')
            sys.stderr = fh

        fancylogger.setLogLevelInfo()
        name = 'test_stream_stdout'
        lh = fancylogger.logToScreen(stdout=isstdout)
        logger = fancylogger.getLogger(name, fname=True, clsname=False)
        # logfn makes it unique
        msg = 'TEST isstdout %s expect_match %s logfn %s' % (isstdout, expect_match, logfn)
        logger.info(msg)

        # restore
        fancylogger.logToScreen(enable=False, handler=lh)
        sys.stdout = _stdout
        sys.stderr = _stderr

        fh2 = open(logfn)
        txt = fh2.read().strip()
        fh2.close()
        reg_exp = re.compile(r"INFO\s+\S+.%s.%s\s+\S+\s+%s" % (name, '_stream_stdouterr', msg))
        match = reg_exp.search(txt) is not None
        self.assertEqual(match, expect_match)

        try:
            os.remove(logfn)
        except:
            pass
def main():
    """the main function"""
    fancylogger.logToScreen(enable=True, stdout=True)
    fancylogger.setLogLevelInfo()

    options = {
        'github-user': ('Your github username to use', None, 'store', None, 'g'),
        'closed-pr': ('Delete all gists from closed pull-requests', None, 'store_true', True, 'p'),
        'all': ('Delete all gists from Easybuild ', None, 'store_true', False, 'a'),
        'orphans': ('Delete all gists without a pull-request', None, 'store_true', False, 'o'),
    }

    go = simple_option(options)
    log = go.log

    if not (go.options.all or go.options.closed_pr or go.options.orphans):
        raise EasyBuildError("Please tell me what to do?")

    if go.options.github_user is None:
        eb_go = EasyBuildOptions(envvar_prefix='EASYBUILD', go_args=[])
        username = eb_go.options.github_user
        log.debug("Fetch github username from easybuild, found: %s", username)
    else:
        username = go.options.github_user

    if username is None:
        raise EasyBuildError("Could not find a github username")
    else:
        log.info("Using username = %s", username)

    token = fetch_github_token(username)

    gh = RestClient(GITHUB_API_URL, username=username, token=token)
    # ToDo: add support for pagination
    status, gists = gh.gists.get(per_page=100)

    if status != HTTP_STATUS_OK:
        raise EasyBuildError("Failed to get a lists of gists for user %s: error code %s, message = %s",
                             username, status, gists)
    else:
        log.info("Found %s gists", len(gists))

    regex = re.compile(r"(EasyBuild test report|EasyBuild log for failed build).*?(?:PR #(?P<PR>[0-9]+))?\)?$")

    pr_cache = {}
    num_deleted = 0

    for gist in gists:
        if not gist["description"]:
            continue
        re_pr_num = regex.search(gist["description"])
        delete_gist = False

        if re_pr_num:
            log.debug("Found a Easybuild gist (id=%s)", gist["id"])
            pr_num = re_pr_num.group("PR")
            if go.options.all:
                delete_gist = True
            elif pr_num and go.options.closed_pr:
                log.debug("Found Easybuild test report for PR #%s", pr_num)

                if pr_num not in pr_cache:
                    status, pr = gh.repos[GITHUB_EB_MAIN][GITHUB_EASYCONFIGS_REPO].pulls[pr_num].get()
                    if status != HTTP_STATUS_OK:
                        raise EasyBuildError("Failed to get pull-request #%s: error code %s, message = %s",
                                             pr_num, status, pr)
                    pr_cache[pr_num] = pr["state"]

                if pr_cache[pr_num] == "closed":
                    log.debug("Found report from closed PR #%s (id=%s)", pr_num, gist["id"])
                    delete_gist = True

            elif not pr_num and go.options.orphans:
                log.debug("Found Easybuild test report without PR (id=%s)", gist["id"])
                delete_gist = True

        if delete_gist:
            status, del_gist = gh.gists[gist["id"]].delete()

            if status != HTTP_DELETE_OK:
                raise EasyBuildError("Unable to remove gist (id=%s): error code %s, message = %s",
                                     gist["id"], status, del_gist)
            else:
                log.info("Delete gist with id=%s", gist["id"])
                num_deleted += 1

    log.info("Deleted %s gists", num_deleted)
    dest="path",
    help="Specify a path inside the repo (default easybuild/easyconfigs).")
parser.add_option("-l",
                  "--local",
                  action="store_true",
                  dest="local",
                  help="Use a local path, not on github.com (Default false)")

options, args = parser.parse_args()

# get and configure logger
log = fancylogger.getLogger(__name__)
if options.verbose == 1:
    fancylogger.setLogLevelWarning()
elif options.verbose == 2:
    fancylogger.setLogLevelInfo()
elif options.verbose >= 3:
    fancylogger.setLogLevelDebug()

if options.quiet:
    fancylogger.logToScreen(False)
else:
    fancylogger.logToScreen(True)

# other options
if not options.branch:
    options.branch = "develop"
if not options.username:
    options.username = "******"
if not options.repo:
    options.repo = "easybuild-easyconfigs"
Example #9
from vsc.administration.user import cluster_user_pickle_location_map, cluster_user_pickle_store_map
from vsc.accountpage.client import AccountpageClient
from vsc.config.base import VscStorage
from vsc.filesystem.gpfs import GpfsOperations
from vsc.jobs.moab.checkjob import SshCheckjob, CheckjobInfo
from vsc.utils import fancylogger
from vsc.utils.fs_store import store_on_gpfs
from vsc.utils.nagios import NAGIOS_EXIT_CRITICAL
from vsc.utils.script_tools import ExtendedSimpleOption

# Constants
NAGIOS_CHECK_INTERVAL_THRESHOLD = 30 * 60  # 30 minutes

logger = fancylogger.getLogger(__name__)
fancylogger.logToScreen(True)
fancylogger.setLogLevelInfo()

STORE_LIMIT_CRITICAL = 5

# FIXME: common
def get_pickle_path(location, user_id, rest_client):
    """Determine the path (directory) where the pickle file qith the queue information should be stored.

    @type location: string
    @type user_id: string

    @param location: indication of the user accessible storage spot to use, e.g., home or scratch
    @param user_id: VSC user ID
    @param rest_client: VscAccountpageClient instance

    @returns: tuple of (string representing the directory where the pickle file should be stored,
Example #10
        next_idx = (idx + 1) % len(recvbuf)
        if recvbuf[idx]['hostname'] == recvbuf[next_idx]['hostname']:
            if not recvbuf[idx]['affinity'][-1] == recvbuf[next_idx]['affinity'][0] - 1:
                log.error("No nn on same node for rank %s (aff %s) and next rank %s (aff %s)" %
                          (idx, recvbuf[idx]['affinity'], next_idx, recvbuf[next_idx]['affinity']))
        else:
            if not recvbuf[next_idx]['affinity'][0] == 0:
                log.error("No nn on different nodes for rank %s (hn %s aff %s) and next rank %s (hn %s aff %s)" %
                           (idx, recvbuf[idx]['hostname'], recvbuf[idx]['affinity'],
                            next_idx, recvbuf[next_idx]['hostname'], recvbuf[next_idx]['affinity'])
                          )


if __name__ == '__main__':
    log = getLogger('mympisanity')
    setLogLevelInfo()

    if MPI4PY_EXCEPTION:
        log.error("No mpi4py found: %s", MPI4PY_EXCEPTION)
        sys.exit(1)


    log.info("mympisanity started")

    comm = MPI.COMM_WORLD

    # gather the info from all processes
    recvbuf = comm.gather(Report(), 0)
    log.info("mympisanity gather report finished")

def main():
    """the main function"""
    fancylogger.logToScreen(enable=True, stdout=True)
    fancylogger.setLogLevelInfo()

    options = {
        'github-user': ('Your github username to use', None, 'store', None, 'g'),
        'closed-pr': ('Delete all gists from closed pull-requests', None, 'store_true', True, 'p'),
        'all': ('Delete all gists from Easybuild ', None, 'store_true', False, 'a'),
        'orphans': ('Delete all gists without a pull-request', None, 'store_true', False, 'o'),
    }

    go = simple_option(options)
    log = go.log

    if not (go.options.all or go.options.closed_pr or go.options.orphans):
        log.error("Please tell me what to do?")

    if go.options.github_user is None:
        eb_go = EasyBuildOptions(envvar_prefix='EASYBUILD', go_args=[])
        username = eb_go.options.github_user
        log.debug("Fetch github username from easybuild, found: %s", username)
    else:
        username = go.options.github_user

    if username is None:
        log.error("Could not find a github username")
    else:
        log.info("Using username = %s", username)

    token = fetch_github_token(username)

    gh = RestClient(GITHUB_API_URL, username=username, token=token)
    # ToDo: add support for pagination
    status, gists = gh.gists.get(per_page=100)

    if status != HTTP_STATUS_OK:
        log.error("Failed to get a lists of gists for user %s: error code %s, message = %s",
                  username, status, gists)
    else:
        log.info("Found %s gists", len(gists))

    regex = re.compile(r"(EasyBuild test report|EasyBuild log for failed build).*?(?:PR #(?P<PR>[0-9]+))?\)?$")

    pr_cache = {}
    num_deleted = 0

    for gist in gists:
        if not gist["description"]:
            continue
        re_pr_num = regex.search(gist["description"])
        delete_gist = False

        if re_pr_num:
            log.debug("Found a Easybuild gist (id=%s)", gist["id"])
            pr_num = re_pr_num.group("PR")
            if go.options.all:
                delete_gist = True
            elif pr_num and go.options.closed_pr:
                log.debug("Found Easybuild test report for PR #%s", pr_num)

                if pr_num not in pr_cache:
                    status, pr = gh.repos[GITHUB_EB_MAIN][GITHUB_EASYCONFIGS_REPO].pulls[pr_num].get()
                    if status != HTTP_STATUS_OK:
                        log.error("Failed to get pull-request #%s: error code %s, message = %s",
                                  pr_num, status, pr)
                    pr_cache[pr_num] = pr["state"]

                if pr_cache[pr_num] == "closed":
                    log.debug("Found report from closed PR #%s (id=%s)", pr_num, gist["id"])
                    delete_gist = True

            elif not pr_num and go.options.orphans:
                log.debug("Found Easybuild test report without PR (id=%s)", gist["id"])
                delete_gist = True

        if delete_gist:
            status, del_gist = gh.gists[gist["id"]].delete()

            if status != HTTP_DELETE_OK:
                log.error("Unable to remove gist (id=%s): error code %s, message = %s",
                          gist["id"], status, del_gist)
            else:
                log.info("Delete gist with id=%s", gist["id"])
                num_deleted += 1

    log.info("Deleted %s gists", num_deleted)
Example #12
def main():
    """Main"""

    options = {
        'nagios': ('Report in nagios format', None, 'store_true', False, 'n'),
        'regex': ('Filter on regexp, data for first match', None, 'regex', None, 'r'),
        'allregex': ('Combined with --regex/-r, return all data', None, 'store_true', False, 'A'),
        'anystate': ('Matches any state (eg down_on_error node will also list as error)',
                     None, 'store_true', False, 'a'),
        'down': ('Down nodes', None, 'store_true', False, 'D'),
        'downonerror': ('Down on error nodes', None, 'store_true', False, 'E'),
        'offline': ('Offline nodes', None, 'store_true', False, 'o'),
        'partial': ('Partial nodes (one or more running job(s), jobslot(s) available)', None, 'store_true', False, 'p'),
        'job-exclusive': ('Job-exclusive nodes (no jobslots available)', None, 'store_true', False, 'x'),
        'free': ('Free nodes (0 or more running jobs, jobslot(s) available)', None, 'store_true', False, 'f'),
        'unknown': ('State unknown nodes', None, 'store_true', False, 'u'),
        'bad': ('Bad nodes (broken jobregex)', None, 'store_true', False, 'b'),
        'error': ('Error nodes', None, 'store_true', False, 'e'),
        'idle': ('Idle nodes (No running jobs, jobslot(s) available)', None, 'store_true', False, 'i'),
        'singlenodeinfo': (('Single (most-frequent) node information in key=value format '
                            '(no combination with other options)'), None, 'store_true', False, 'I'),
        'reportnodeinfo': ('Report node information (no combination with other options)',
                           None, 'store_true', False, 'R'),
        'moab': ('Use moab information (mdiag -n)', None, 'store_true', False, 'm'),
        'moabxml': ('Use xml moab data from file (for testing)', None, 'store', None),
        'shorthost': ('Return (short) hostname', None, 'store_true', False, 's'),
        'invert': ('Return inverted selection', None, 'store_true', False, 'v'),
        }

    go = simple_option(options)

    if go.options.nagios and not go.options.debug:
        fancylogger.logToDevLog(enable=True)
        fancylogger.logToScreen(enable=False)
        fancylogger.setLogLevelInfo()

    all_states = ND_NAGIOS_CRITICAL + ND_NAGIOS_WARNING + ND_NAGIOS_OK
    report_states = []
    if go.options.down:
        report_states.append(ND_down)
    if go.options.downonerror:
        report_states.append(ND_down_on_error)
    if go.options.offline:
        report_states.append(ND_offline)
    if go.options.free:
        report_states.append(ND_free)
    if go.options.partial:
        report_states.append(ND_free_and_job)
    if go.options.job_exclusive:
        report_states.append(ND_job_exclusive)
    if go.options.unknown:
        report_states.append(ND_state_unknown)
    if go.options.bad:
        report_states.append(ND_bad)
    if go.options.error:
        report_states.append(ND_error)
    if go.options.idle:
        report_states.append(ND_idle)

    if len(report_states) == 0:
        report_states = all_states

    if go.options.singlenodeinfo or go.options.reportnodeinfo:
        nodeinfo = collect_nodeinfo()[2]
        if len(nodeinfo) == 0:
            _log.error('No nodeinfo found')
            sys.exit(1)

        ordered = sorted(nodeinfo.items(), key=lambda x: len(x[1]), reverse=True)

        if go.options.singlenodeinfo:
            if len(nodeinfo) > 1:
                msg = "Not all nodes have same parameters. Using most frequent ones."
                if go.options.reportnodeinfo:
                    _log.warning(msg)
                else:
                    _log.error(msg)

            # usage: export `./show_nodes -I` ; env |grep SHOWNODES_
            most_freq = ordered[0][0]
            msg = []
            msg.append("SHOWNODES_PPN=%d" % most_freq[0])
            msg.append("SHOWNODES_PHYSMEMMB=%d" % (most_freq[1] * 1024))
        else:
            msg = []
            for info, nodes in ordered:
                txt = "%d nodes with %d cores, %s MB physmem, %s GB swap and %s GB local disk" % (
                    len(nodes), info[0], info[1] * 1024, info[2], info[3])
                msg.append(txt)
                # print and _log are dumped to stdout at different moments; repeat the txt in the debug log
                _log.debug("Found %s with matching nodes: %s" % (txt, nodes))

        print "\n".join(msg)
        sys.exit(0)

    if go.options.moab:

        if go.options.moabxml:
            try:
                moabxml = open(go.options.moabxml).read()
            except (OSError, IOError):
                _log.error('Failed to read moab xml from %s' % go.options.moabxml)
        else:
            moabxml = None
        nodes_dict = moab_get_nodes_dict(xml=moabxml)

        nodes = get_nodes(nodes_dict)
    else:
        nodes = get_nodes()

    nagiosexit = {
        NDNAG_WARNING: warning_exit,
        NDNAG_CRITICAL: critical_exit,
        NDNAG_OK: ok_exit,
    }

    nagios_res = {}
    detailed_res = {}
    nodes_found = []

    all_nodes = []

    for name, full_state in nodes:
        all_nodes.append(name)

        if go.options.regex and not go.options.regex.search(name):
            continue

        nagios_state = full_state['derived']['nagiosstate']
        if nagios_state not in nagios_res:
            nagios_res[nagios_state] = []

        state = full_state['derived']['state']
        states = full_state['derived']['states']

        if state == ND_free and ND_idle in states:
            state = ND_idle  # special case for idle
        if state not in detailed_res:
            detailed_res[state] = []

        if go.options.anystate:
            states_to_check = states
        else:
            states_to_check = [state]

        # filter the allowed states
        if any(x for x in states_to_check if x in report_states):
            nagios_res[nagios_state].append(states)
            detailed_res[state].append(states)
            nodes_found.append(name)

            if go.options.regex and not go.options.allregex:
                break

    if go.options.invert:
        nodes_found = [x for x in all_nodes if x not in nodes_found]

    if go.options.regex and not go.options.allregex:
        # there should only be one node
        nagios_state, all_states = nagios_res.items()[0]
        states = all_states[0]
        if go.options.nagios:
            msg = "show_nodes - %s" % ",".join(states)
            nagiosexit[nagios_state](msg)
        else:
            txt = "%s %s" % (nagios_state, ",".join(states))
            print txt
    else:
        if go.options.nagios:
            msg = NagiosResult('show_nodes')
            txt = []
            total = 0
            for state in all_states:
                if state in detailed_res:
                    nr = len(detailed_res[state])
                else:
                    nr = 0
                total += nr
                setattr(msg, state, nr)
            msg.total = total

            reported_state = [str(NDNAG_OK), '']
            if ND_bad in detailed_res:
                reported_state[0] = NDNAG_CRITICAL
                msg.message += ' - %s bad nodes' % (len(detailed_res[ND_bad]))
            nagiosexit[reported_state[0]](msg)
        else:
            # just print the nodes
            if go.options.shorthost:
                nodes_found = [x.split('.')[0] for x in nodes_found]
            print ' '.join(nodes_found)
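
For reference, each value in the options dict above follows the 5-tuple format expected by simple_option: (help text, type, action, default, short option). A minimal, hypothetical script using the same mechanism (assuming the usual vsc.utils.generaloption import):

from vsc.utils.generaloption import simple_option

go = simple_option({
    'nagios': ('Report in nagios format', None, 'store_true', False, 'n'),
})
if go.options.nagios:
    go.log.info("nagios mode requested")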
Example #13
def main():
    """Main"""

    options = {
        'nagios': ('Report in nagios format', None, 'store_true', False, 'n'),
        'regex': ('Filter on regexp, data for first match', None, 'regex',
                  None, 'r'),
        'allregex': ('Combined with --regex/-r, return all data', None,
                     'store_true', False, 'A'),
        'anystate':
        ('Matches any state (eg down_on_error node will also list as error)',
         None, 'store_true', False, 'a'),
        'down': ('Down nodes', None, 'store_true', False, 'D'),
        'downonerror': ('Down on error nodes', None, 'store_true', False, 'E'),
        'offline': ('Offline nodes', None, 'store_true', False, 'o'),
        'offline_idle': ('Offline idle nodes', None, 'store_true', False, 'O'),
        'partial':
        ('Partial nodes (one or more running job(s), jobslot(s) available)',
         None, 'store_true', False, 'p'),
        'job-exclusive': ('Job-exclusive nodes (no jobslots available)', None,
                          'store_true', False, 'x'),
        'free': ('Free nodes (0 or more running jobs, jobslot(s) available)',
                 None, 'store_true', False, 'f'),
        'unknown': ('State unknown nodes', None, 'store_true', False, 'u'),
        'bad': ('Bad nodes (broken jobregex)', None, 'store_true', False, 'b'),
        'error': ('Error nodes', None, 'store_true', False, 'e'),
        'idle': ('Idle nodes (No running jobs, jobslot(s) available)', None,
                 'store_true', False, 'i'),
        'singlenodeinfo':
        (('Single (most-frequent) node information in key=value format '
          '(no combination with other options)'), None, 'store_true', False,
         'I'),
        'reportnodeinfo':
        ('Report node information (no combination with other options)', None,
         'store_true', False, 'R'),
        'moab': ('Use moab information (mdiag -n)', None, 'store_true', False,
                 'm'),
        'moabxml': ('Use xml moab data from file (for testing)', None, 'store',
                    None),
        'shorthost': ('Return (short) hostname', None, 'store_true', False,
                      's'),
        'invert': ('Return inverted selection', None, 'store_true', False,
                   'v'),
    }

    go = simple_option(options)

    if go.options.nagios and not go.options.debug:
        fancylogger.logToDevLog(enable=True)
        fancylogger.logToScreen(enable=False)
        fancylogger.setLogLevelInfo()

    all_states = ND_NAGIOS_CRITICAL + ND_NAGIOS_WARNING + ND_NAGIOS_OK
    report_states = []
    if go.options.down:
        report_states.append(ND_down)
    if go.options.downonerror:
        report_states.append(ND_down_on_error)
    if go.options.offline:
        report_states.append(ND_offline)
    if go.options.free:
        report_states.append(ND_free)
    if go.options.partial:
        report_states.append(ND_free_and_job)
    if go.options.job_exclusive:
        report_states.append(ND_job_exclusive)
    if go.options.unknown:
        report_states.append(ND_state_unknown)
    if go.options.bad:
        report_states.append(ND_bad)
    if go.options.error:
        report_states.append(ND_error)
    if go.options.idle:
        report_states.append(ND_idle)
    if go.options.offline_idle:
        report_states.append(ND_offline_idle)

    if len(report_states) == 0:
        report_states = all_states

    if go.options.singlenodeinfo or go.options.reportnodeinfo:
        nodeinfo = collect_nodeinfo()[2]
        if len(nodeinfo) == 0:
            _log.error('No nodeinfo found')
            sys.exit(1)

        ordered = sorted(nodeinfo.items(),
                         key=lambda x: len(x[1]),
                         reverse=True)

        if go.options.singlenodeinfo:
            if len(nodeinfo) > 1:
                msg = "Not all nodes have same parameters. Using most frequent ones."
                if go.options.reportnodeinfo:
                    _log.warning(msg)
                else:
                    _log.error(msg)

            # usage: export `./show_nodes -I` ; env |grep SHOWNODES_
            most_freq = ordered[0][0]
            msg = []
            msg.append("SHOWNODES_PPN=%d" % most_freq[0])
            msg.append("SHOWNODES_PHYSMEMMB=%d" % (most_freq[1] * 1024))
        else:
            msg = []
            for info, nodes in ordered:
                txt = "%d nodes with %d cores, %s MB physmem, %s GB swap and %s GB local disk" % (
                    len(nodes), info[0], info[1] * 1024, info[2], info[3])
                msg.append(txt)
                # print and _log are dumped to stdout at different moments; repeat the txt in the debug log
                _log.debug("Found %s with matching nodes: %s" % (txt, nodes))

        print "\n".join(msg)
        sys.exit(0)

    if go.options.moab:

        if go.options.moabxml:
            try:
                moabxml = open(go.options.moabxml).read()
            except (OSError, IOError):
                _log.error('Failed to read moab xml from %s' %
                           go.options.moabxml)
        else:
            moabxml = None
        nodes_dict = moab_get_nodes_dict(xml=moabxml)

        nodes = get_nodes(nodes_dict)
    else:
        nodes = get_nodes()

    nagiosexit = {
        NDNAG_WARNING: warning_exit,
        NDNAG_CRITICAL: critical_exit,
        NDNAG_OK: ok_exit,
    }

    nagios_res = {}
    detailed_res = {}
    nodes_found = []

    all_nodes = []

    for name, full_state in nodes:
        all_nodes.append(name)

        if go.options.regex and not go.options.regex.search(name):
            continue

        nagios_state = full_state['derived']['nagiosstate']
        if nagios_state not in nagios_res:
            nagios_res[nagios_state] = []

        state = full_state['derived']['state']
        states = full_state['derived']['states']

        if state == ND_free and ND_idle in states:
            state = ND_idle  # special case for idle
        if state == ND_offline and ND_idle in states:
            state = ND_offline_idle
        if state not in detailed_res:
            detailed_res[state] = []

        if go.options.anystate:
            states_to_check = states
        else:
            states_to_check = [state]

        # filter the allowed states
        if any(x for x in states_to_check if x in report_states):
            nagios_res[nagios_state].append(states)
            detailed_res[state].append(states)
            nodes_found.append(name)

            if go.options.regex and not go.options.allregex:
                break

    if go.options.invert:
        nodes_found = [x for x in all_nodes if x not in nodes_found]

    if go.options.regex and not go.options.allregex:
        # there should only be one node
        nagios_state, all_states = nagios_res.items()[0]
        states = all_states[0]
        if go.options.nagios:
            msg = "show_nodes - %s" % ",".join(states)
            nagiosexit[nagios_state](msg)
        else:
            txt = "%s %s" % (nagios_state, ",".join(states))
            print txt
    else:
        if go.options.nagios:
            msg = NagiosResult('show_nodes')
            txt = []
            total = 0
            for state in all_states:
                if state in detailed_res:
                    nr = len(detailed_res[state])
                else:
                    nr = 0
                total += nr
                setattr(msg, state, nr)
            msg.total = total

            reported_state = [str(NDNAG_OK), '']
            if ND_bad in detailed_res:
                reported_state[0] = NDNAG_CRITICAL
                msg.message += ' - %s bad nodes' % (len(detailed_res[ND_bad]))
            nagiosexit[reported_state[0]](msg)
        else:
            # just print the nodes
            if go.options.shorthost:
                nodes_found = [x.split('.')[0] for x in nodes_found]
            print ' '.join(nodes_found)
Example #14
        next_idx = (idx + 1) % len(recvbuf)
        if recvbuf[idx]['hostname'] == recvbuf[next_idx]['hostname']:
            if not recvbuf[idx]['affinity'][
                    -1] == recvbuf[next_idx]['affinity'][0] - 1:
                log.error(
                    "No nn on same node for rank %s (aff %s) and next rank %s (aff %s)"
                    % (idx, recvbuf[idx]['affinity'], next_idx,
                       recvbuf[next_idx]['affinity']))
        else:
            if not recvbuf[next_idx]['affinity'][0] == 0:
                log.error(
                    "No nn on different nodes for rank %s (hn %s aff %s) and next rank %s (hn %s aff %s)"
                    % (idx, recvbuf[idx]['hostname'], recvbuf[idx]['affinity'],
                       next_idx, recvbuf[next_idx]['hostname'],
                       recvbuf[next_idx]['affinity']))


if __name__ == '__main__':
    log = getLogger('mympisanity')
    setLogLevelInfo()
    log.info("mympisanity started")

    comm = MPI.COMM_WORLD

    # gather the info from all processes
    recvbuf = comm.gather(Report(), 0)
    log.info("mympisanity gather report finished")

    if comm.rank == 0:
        check()
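
For clarity, a made-up recvbuf layout (as gathered on rank 0) that passes the neighbour checks in check() above: consecutive ranks on the same node have contiguous affinity, and the first rank on each new node starts again at CPU 0.

recvbuf = [
    {'hostname': 'node1', 'affinity': [0]},
    {'hostname': 'node1', 'affinity': [1]},
    {'hostname': 'node2', 'affinity': [0]},
    {'hostname': 'node2', 'affinity': [1]},
]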
Example #15
def get_mpi_and_sched_and_options():
    """
    Selects mpi flavor and scheduler based on environment and arguments.

    @return: A triplet containing the chosen mpi flavor, chosen scheduler and the MympirunOption class.
    """

    scriptname = os.path.basename(os.path.abspath(sys.argv[0]))
    # if the scriptname is 'mpirun', it means that mympirun was called through the faked mpirun path
    isfake = scriptname == 'mpirun'

    # init generaloption with the various mpirun cli options
    optionparser = MympirunOption(ismpirun=isfake)

    # see if an mpi flavor was explicitly chosen as a command line argument. If not, just use the mpirun that was called
    # We are using sys.argv because generaloption depends on the returned scriptname
    if optionparser.options.setmpi:
        setmpi = optionparser.options.setmpi
        optionparser.log.debug(
            "mympirun has been forced to use %s as MPI flavor", setmpi)
    else:
        setmpi = sys.argv[0]
        optionparser.log.debug("mympirun will be executed by %s", setmpi)

    scriptname, mpi, found_mpi = mpim.what_mpi(setmpi)
    found_mpi_names = [x.__name__ for x in found_mpi]

    if optionparser.options.showmpi:
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found MPI classes %s",
                              ", ".join(found_mpi_names))
        return

    # Select a Scheduler from the available schedulers
    sched, found_sched = schedm.what_sched(
        getattr(optionparser.options, 'setsched', None))
    found_sched_names = [x.__name__ for x in found_sched]

    if optionparser.options.showsched:
        fancylogger.setLogLevelInfo()
        optionparser.log.info("Found Sched classes %s",
                              ", ".join(found_sched_names))
        return

    if mpi is None:
        optionparser.log.raiseException((
            "No MPI class found that supports scriptname %s; isfake %s). Please use "
            "mympirun through one of the direct calls or make sure the mpirun command can"
            " be found. Found MPI %s") % (scriptname, isfake,
                                          ", ".join(found_mpi_names)))
    else:
        optionparser.log.debug("Found MPI class %s (scriptname %s; isfake %s)",
                               mpi.__name__, scriptname, isfake)

    if sched is None:
        optionparser.log.raiseException(
            "No sched class found (options.setsched %s ; found Sched classes %s)" %
            (optionparser.options.setsched, ", ".join(found_sched_names)))
    else:
        optionparser.log.debug(
            "Found sched class %s from options.setsched %s (all Sched found %s)",
            sched.__name__, optionparser.options.setsched,
            ", ".join(found_sched_names))

    if not optionparser.args:
        optionparser.log.warn("no mpi script provided")
        return

    return mpi, sched, optionparser