Example #1
0
def findLostJobs():
    """ Mark running jobs as lost and re-read job data from live servers.

    Pass 1: every job in state 'R' (running) that belongs to a batch
    server with isactive=False is switched to state 'L' (lost) and saved.

    Pass 2: for every batch server with isactive=True the command
    `qstat -fx @<server name>` is executed, its standard output is parsed
    as XML and the resulting document is handed to feedJobsXML(). If the
    output is not well-formed XML an error is logged and processing
    continues with the next server.
    """
    # Pass 1: nothing can still be running on a server that is not active.
    for inactive in BatchServer.objects.filter(isactive=False):
        stale_jobs = Job.objects.filter(job_state=getJobState('R'), server=inactive)
        for stale in stale_jobs:
            stale.job_state = getJobState('L')
            stale.save()
            lost_msg = "Running job on inactive server marked Lost: %s@%s" % (stale.jobid,stale.server.name)
            log(LOG_DEBUG, lost_msg)

    # Pass 2: ask each active server for its current job list.
    for active in BatchServer.objects.filter(isactive=True):
        qstat_cmd = ["qstat", "-fx", "@%s" % active.name]
        qstat = subprocess.Popen(qstat_cmd, stdout=subprocess.PIPE)
        # Only stdout is piped, so the stderr half of the result is always None.
        qstat_out = qstat.communicate()[0]
        log(LOG_DEBUG, "Qstat output from live server %s obtained" % (active.name))
        try:
            log(LOG_DEBUG, "before parseString()")
            document = parseString(qstat_out)
            log(LOG_DEBUG, "after parseString()")
            # Only the feedJobsXML() call is timed, not the XML parsing.
            began = time.time()
            log(LOG_DEBUG, "before feedJobsXML()")
            # NOTE(review): meaning of the second argument (True) is defined
            # by feedJobsXML() and not visible here -- confirm.
            feedJobsXML(document, True)
            log(LOG_DEBUG, "after feedJobsXML()")
            finished = time.time()
            elapsed = finished - began
            log(LOG_INFO, "feedJobsXML() took %f seconds" % (elapsed))
        except ExpatError:
            log(LOG_ERROR, "Cannot parse line: %s" % (qstat_out))
Example #2
0
def main():
    """Parse the command line and run the selected mode of operation.

    The log level given with -l is stored in Configuration['loglevel'].
    After that exactly one of the following happens:

    * One-shot actions, checked in this order; the first one that is set is
      executed and main() returns: find lost jobs, find deleted jobs, check
      events of running jobs, process grid-jobmap files (-g), refresh the
      running-jobs cache (-u), remove all content, merge nodes, merge users,
      merge groups.
    * Data-file mode: event files (-e), node XML files (-n) and job XML
      files (-j) are processed, in that order. A server file (-s) is not
      supported: an error is logged and the process exits with status -1.
    * Daemon mode (-d DIR): runAsDaemon(DIR) is called.

    Data-file mode and daemon mode are mutually exclusive, and one of them
    must be chosen when no one-shot action is requested.

    Raises:
        SystemExit: through opt_parser.error() for stray positional arguments
            or an invalid/missing mode, and through sys.exit(-1) when a
            server file is given.
    """
    usage_string = "%prog [-h] [-l LOGLEVEL] [-n FILE|-j FILE|-e FILE|-s FILE]|[-d DIR] [-r] [-m FILE] [-g FILE] [-t]"
    version_string = "%%prog %s" % VERSION

    opt_parser = OptionParser(usage=usage_string, version=version_string)
    opt_parser.add_option("-l", "--loglevel", type="int", dest="loglevel", default=LOG_WARNING,
        help="Log level (0-3). Default is 0, which means only errors")
    opt_parser.add_option("-n", "--nodexml", action="append", dest="nodexmlfile", metavar="FILE", 
        help="XML file with node data")
    opt_parser.add_option("-j", "--jobxml", action="append", dest="jobxmlfile", metavar="FILE", 
        help="XML file with job data")
    opt_parser.add_option("-e", "--eventfile", action="append", dest="eventfile", metavar="FILE", 
        help="Text file with event data in accounting log format")
    opt_parser.add_option("-s", "--serverfile", action="append", dest="serverfile", metavar="FILE", 
        help="Text file with server settings (basically output of qmgr `print server` command)")
    opt_parser.add_option("-d", "--daemon", dest="daemondir", metavar="DIR", 
        help="Run in deamon node and read torque accounting logs from DIR")
    opt_parser.add_option("-u", "--updaterj", action="store_true", dest="updateRJ", default=False, 
        help="Update cache table with running jobs from the main jobs table")
    opt_parser.add_option("-g", "--gridjobmap", action="append", dest="gridjobmapfiles", metavar="FILE",
        help="Parse grid-jobmap files so we can find out the grid user for a job")

    oneTimeGroup = OptionGroup(opt_parser, "Maintenance options",
        "Following options are/were handy for one-time fixes in database.")

    # Every destination below is read further down. optparse only creates an
    # attribute on the parsed values for options that were registered, so
    # leaving these out makes main() fail with AttributeError on each run.
    # NOTE(review): the usage string advertises -r, -m and -t, but which
    # action each short flag belongs to is not visible here; only long
    # options are registered -- confirm and add the short flags if wanted.
    oneTimeGroup.add_option("--findlostjobs", action="store_true", dest="findlostjobs", default=False,
        help="Run maintenance.findLostJobs() and exit")
    oneTimeGroup.add_option("--deletedjobs", action="store_true", dest="deletedjobs", default=False,
        help="Run maintenance.findDeletedJobs() and exit")
    oneTimeGroup.add_option("--runevents", action="store_true", dest="runevents", default=False,
        help="Run maintenance.checkEventsRunningJobs() and exit")
    oneTimeGroup.add_option("--removeall", action="store_true", dest="removeall", default=False,
        help="Run maintenance.removeContent() and exit")
    oneTimeGroup.add_option("--mergenodes", action="append", dest="mergenodesfile", metavar="FILE",
        help="Pass FILE to maintenance.mergeNodes() and exit")
    oneTimeGroup.add_option("--mergeusers", action="append", dest="mergeusersfile", metavar="FILE",
        help="Pass FILE to maintenance.mergeUsers() and exit")
    oneTimeGroup.add_option("--mergegroups", action="append", dest="mergegroupsfile", metavar="FILE",
        help="Pass FILE to maintenance.mergeGroups() and exit")

    opt_parser.add_option_group(oneTimeGroup)

    (options, args) = opt_parser.parse_args()

    # Positional arguments are not accepted at all.
    if args:
        opt_parser.error("Too many arguments")

    Configuration['loglevel'] = options.loglevel

    # One-shot actions: the first requested one runs, then main() returns.
    if options.findlostjobs:
        maintenance.findLostJobs()
        return

    if options.deletedjobs:
        maintenance.findDeletedJobs()
        return

    if options.runevents:
        maintenance.checkEventsRunningJobs()
        return

    if options.gridjobmapfiles:
        for i in options.gridjobmapfiles:
            log(LOG_DEBUG, "Grid job map data will be read from file: %s" % i)
            processGridJobMap(openfile(i))
        return

    if options.updateRJ:
        refreshRunningJobs()
        return

    if options.removeall:
        maintenance.removeContent()
        return

    if options.mergenodesfile:
        for i in options.mergenodesfile:
            log(LOG_DEBUG, "Nodes merge data will be read from file: %s" % i)
            maintenance.mergeNodes(openfile(i))
        return

    if options.mergeusersfile:
        for i in options.mergeusersfile:
            log(LOG_DEBUG, "Users merge data will be read from file: %s" % i)
            maintenance.mergeUsers(openfile(i))
        return

    if options.mergegroupsfile:
        for i in options.mergegroupsfile:
            log(LOG_DEBUG, "groups merge data will be read from file: %s" % i)
            maintenance.mergeGroups(openfile(i))
        return

    # invalid combinations
    has_data_files = (options.nodexmlfile or options.jobxmlfile
                      or options.eventfile or options.serverfile)
    if has_data_files and options.daemondir:
        opt_parser.error("You cannot run as daemon and process data files at once. Choose only one mode of running.")
    if not (has_data_files or options.daemondir):
        opt_parser.error("Mode of running is missing. Please specify one of -n, -j -e -s or -d.")

    if options.eventfile:
        for i in options.eventfile:
            log(LOG_DEBUG, "opening file %s" % i)
            feedJobsLog(openfile(i))

    if options.nodexmlfile:
        for i in options.nodexmlfile:
            nodesxml = parse(openfile(i))
            feedNodesXML(nodesxml)

    if options.jobxmlfile:
        for i in options.jobxmlfile:
            jobsxml = parse(openfile(i))
            feedJobsXML(jobsxml)

    # Checked only after the other data files have already been processed.
    if options.serverfile:
        log(LOG_ERROR, "Server file parsing is not supported yet")
        sys.exit(-1)

    if options.daemondir:
        runAsDaemon(options.daemondir)