Example #1
    def test_parser(self):
        """Test the showq parsers"""
        sq = Showq('clusters')
        master = 'master19.golett.gent.vsc'
        showq = sq.parser(master, SHOWQ_JOBS)
        sq.jobctl = True
        jobctl = sq.parser(master, JOBCTL_JOBS)
        self.assertEqual(showq, jobctl, msg='showq and jobctl commands should give the same parsed result')
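A minimal harness for running this method on its own might look like the sketch below; the import path and the SHOWQ_JOBS/JOBCTL_JOBS fixtures are assumptions for illustration, not taken from the original test module.

# Hypothetical test harness; module path and fixture contents are assumptions.
import unittest

# from vsc.jobs.moab.showq import Showq   # assumed import location
# SHOWQ_JOBS / JOBCTL_JOBS would hold captured showq / mjobctl command output used as fixtures.

class ShowqParserTest(unittest.TestCase):
    """Would hold the test_parser method shown above."""
    # def test_parser(self): ...  (as in Example #1)

if __name__ == '__main__':
    unittest.main()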
Example #2
def process_hold(clusters, dry_run=False):
    """Process a filtered queueinfo dict"""
    releasejob_cache = FileCache(RELEASEJOB_CACHE_FILE)

    # get the showq data
    for data in clusters.values():
        data['path'] = data['spath']  # showq path
    showq = Showq(clusters, cache_pickle=True)
    (queue_information, _, _) = showq.get_moab_command_information()

    # release the jobs, prepare the command
    m = MoabCommand(cache_pickle=False, dry_run=dry_run)
    for data in clusters.values():
        data['path'] = data['mpath']  # mjobctl path
    m.clusters = clusters

    # read the previous data
    ts_data = releasejob_cache.load('queue_information')
    if ts_data is None:
        old_queue_information = {}
    else:
        (_, old_queue_information) = ts_data

    stats = {
        'peruser': 0,
        'total': 0,
        'release': 0,
    }

    release_jobids = []

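    # queue_information is nested: user -> cluster -> job type -> list of job dicts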
    for user, clusterdata in queue_information.items():
        oldclusterdata = old_queue_information.setdefault(user, {})
        totaluser = 0
        for cluster, data in clusterdata.items():
            olddata = oldclusterdata.setdefault(cluster, {})
            # DRMJID is supposed to be unique
            # get all oldjobids in one dict
            oldjobs = {j['DRMJID']: j['_release'] for jt in olddata.values() for j in jt}
            for jobtype, jobs in data.items():
                removeids = []
                for idx, job in enumerate(jobs):
                    jid = job['DRMJID']

                    if jobtype in RELEASEJOB_SUPPORTED_HOLDTYPES:
                        totaluser += 1
                        release = max(oldjobs.get(jid, 0), 0) + 1
                        job['_release'] = release
                        stats['release'] = max(stats['release'], release)
                        release_jobids.append(jid)
                        # release the job
                        cmd = [m.clusters[cluster]['path'], '-u', jid]
                        logger.info("Releasing job %s cluster %s for the %s-th time." % (jid, cluster, release))
                        if dry_run:
                            logger.info("Dry run %s" % cmd)
                        else:
                            m._run_moab_command(cmd, cluster, [])
                    else:
                        # keep historical data: a previously released job could be idle now,
                        # but keep its counter in case it gets held again
                        try:
                            release = oldjobs[jid]
                            job['_release'] = release
                        except KeyError:
                            # not previously in hold, remove it
                            removeids.append(idx)

                # remove the jobs (in reverse order)
                for remove_idx in removeids[::-1]:
                    jobs.pop(remove_idx)

                # cleanup
                if len(jobs) == 0:
                    data.pop(jobtype)
            # cleanup
            if len(data) == 0:
                clusterdata.pop(cluster)
        # cleanup
        if len(clusterdata) == 0:
            queue_information.pop(user)

        # update stats
        stats['peruser'] = max(stats['peruser'], totaluser)
        stats['total'] += totaluser

    logger.info("Release statistics: total jobs in hold %(total)s; max in hold per user %(peruser)s; max releases per job %(release)s" % stats)

    # update and close
    releasejob_cache.update('queue_information', queue_information, 0)
    releasejob_cache.close()

    return release_jobids, stats
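A minimal sketch of how process_hold could be invoked follows; the cluster name, master host and binary paths are made-up placeholders. The 'spath' and 'mpath' keys come from this function (showq and mjobctl paths respectively); the 'master' key is what Showq is given in the other examples.

# Hypothetical invocation of process_hold; all values are illustrative.
clusters = {
    'delcatty': {
        'master': 'master15.delcatty.gent.vsc',  # assumed master host name
        'spath': '/opt/moab/bin/showq',          # showq binary (copied into data['path'] first)
        'mpath': '/opt/moab/bin/mjobctl',        # mjobctl binary used to release held jobs
    },
}
release_jobids, stats = process_hold(clusters, dry_run=True)
print("released %d job(s), stats: %s" % (len(release_jobids), stats))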
Example #3
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        "nagios": ("print out nagion information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "information": ("the sort of information to store: user, vo, project", None, "store", "user"),
        "location": ("the location for storing the pickle file: gengar, muk", str, "store", "gengar"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(NAGIOS_HEADER, NAGIOS_CHECK_FILENAME, NAGIOS_CHECK_INTERVAL_THRESHOLD)
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DSHOWQ_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting dshowq run")

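    # build the per-cluster settings (master host and showq path) from the configuration file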
    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        showq_path = opts.configfile_parser.get(host, "showq_path")
        clusters[host] = {"master": master, "path": showq_path}

    showq = Showq(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
    timeinfo = time.time()

    active_users = queue_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Queue information: %s" % (queue_information))

    # We need to determine which users should get an updated pickle. This depends on
    # - the active user set
    # - the information we want to provide on the cluster(set) where this script runs
    # At the same time, we need to determine the job information each user gets to see
    (target_users, target_queue_information, user_map) = determine_target_information(
        opts.options.information, active_users, queue_information
    )

    nagios_user_count = 0
    nagios_no_store = 0

    LdapQuery(VscConfiguration())

    for user in target_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = target_queue_information[user]
                user_queue_information["timeinfo"] = timeinfo
                store(user, path, (user_queue_information, user_map[user]))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError) as err:
                logger.error("Could not store pickle file for user %s: %s" % (user, err))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, target_queue_information[user]))
Example #4
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: gengar, muk', str, 'store', 'gengar'),
    }

    opts = ExtendedSimpleOption(options)

    try:
        LdapQuery(VscConfiguration())

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {
                'master': master,
                'path': showq_path
            }

        logger.debug("clusters = %s" % (clusters,))
        showq = Showq(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        (target_users, target_queue_information, user_map) = determine_target_information(
            opts.options.information, active_users, queue_information
        )

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            if not opts.options.dry_run:
                try:
                    (path, store) = get_pickle_path(opts.options.location, user)
                    user_queue_information = target_queue_information[user]
                    user_queue_information['timeinfo'] = timeinfo
                    store(user, path, (user_queue_information, user_map[user]))
                    nagios_user_count += 1
                except (UserStorageError, FileStoreError, FileMoveError) as err:
                    logger.error("Could not store pickle file for user %s: %s" % (user, err))
                    nagios_no_store += 1
            else:
                logger.info("Dry run, not actually storing data for user %s at path %s" % (user, get_pickle_path(opts.options.location, user)[0]))
                logger.debug("Dry run, queue information for user %s is %s" % (user, target_queue_information[user]))

        stats["store+users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL