def findLostJobs():
    """
    Mark orphaned running jobs as lost and re-sync jobs from live servers.

    Two passes are made over the batch servers:

    1. For every inactive server, each job still recorded in the running
       state ('R') is switched to the lost state ('L') and saved.
    2. For every active server, ``qstat -fx @<server>`` is executed, its
       output is parsed as XML and the document is handed to
       ``feedJobsXML(jobsxml, True)``.
       NOTE(review): the meaning of the second argument is defined by
       feedJobsXML, which is not visible here - confirm.

    Output that is not well-formed XML is logged as an error and the loop
    moves on to the next server.
    """
    # Pass 1: running jobs on servers that are no longer active.
    inactive_servers = BatchServer.objects.filter(isactive=False)
    for inactive in inactive_servers:
        stale_jobs = Job.objects.filter(job_state=getJobState('R'), server=inactive)
        for stale in stale_jobs:
            stale.job_state = getJobState('L')
            stale.save()
            log(LOG_DEBUG, "Running job on inactive server marked Lost: %s@%s" % (stale.jobid, stale.server.name))

    # Pass 2: ask every active server for its current job list.
    active_servers = BatchServer.objects.filter(isactive=True)
    for active in active_servers:
        qstat_cmd = ["qstat", "-fx", "@%s" % active.name]
        qstat_proc = subprocess.Popen(qstat_cmd, stdout=subprocess.PIPE)
        # Only stdout is piped, so the stderr element of the pair is None.
        qstat_output = qstat_proc.communicate()[0]
        log(LOG_DEBUG, "Qstat output from live server %s obtained" % active.name)
        try:
            log(LOG_DEBUG, "before parseString()")
            document = parseString(qstat_output)
            log(LOG_DEBUG, "after parseString()")
            # Only the feeding step is timed, not the XML parsing.
            started = time.time()
            log(LOG_DEBUG, "before feedJobsXML()")
            feedJobsXML(document, True)
            log(LOG_DEBUG, "after feedJobsXML()")
            finished = time.time()
            log(LOG_INFO, "feedJobsXML() took %f seconds" % (finished - started))
        except ExpatError:
            log(LOG_ERROR, "Cannot parse line: %s" % qstat_output)
def main():
    """
    Parse the command line and dispatch to the requested mode of running.

    Modes are checked in a fixed order and the first one-shot mode that is
    requested runs and returns:

    * maintenance actions (find lost jobs, find deleted jobs, check events
      of running jobs, remove all content, merge nodes/users/groups),
    * grid job map processing (``-g``),
    * refresh of the running-jobs cache (``-u``).

    Otherwise the function either processes the given data files
    (``-e`` event logs, ``-n`` node XML, ``-j`` job XML; ``-s`` is reported
    as unsupported and exits with status -1) or runs as a daemon (``-d``).
    Mixing data files with daemon mode, or choosing no mode at all, is
    reported through ``opt_parser.error()``, which terminates the program.

    The maintenance option dests are not registered by any ``add_option``
    call in this function, and an optparse ``Values`` object only carries
    attributes for registered dests.  They are therefore read through
    ``getattr`` with a default, so that an unregistered maintenance option
    is treated as "not requested" instead of raising AttributeError before
    any regular mode can run.
    """
    usage_string = "%prog [-h] [-l LOGLEVEL] [-n FILE|-j FILE|-e FILE|-s FILE]|[-d DIR] [-r] [-m FILE] [-g FILE] [-t]"
    version_string = "%%prog %s" % VERSION

    opt_parser = OptionParser(usage=usage_string, version=version_string)
    opt_parser.add_option("-l", "--loglevel", type="int", dest="loglevel", default=LOG_WARNING,
        help="Log level (0-3). Default is 0, which means only errors")
    opt_parser.add_option("-n", "--nodexml", action="append", dest="nodexmlfile", metavar="FILE",
        help="XML file with node data")
    opt_parser.add_option("-j", "--jobxml", action="append", dest="jobxmlfile", metavar="FILE",
        help="XML file with job data")
    opt_parser.add_option("-e", "--eventfile", action="append", dest="eventfile", metavar="FILE",
        help="Text file with event data in accounting log format")
    opt_parser.add_option("-s", "--serverfile", action="append", dest="serverfile", metavar="FILE",
        help="Text file with server settings (basically output of qmgr `print server` command)")
    opt_parser.add_option("-d", "--daemon", dest="daemondir", metavar="DIR",
        help="Run in deamon node and read torque accounting logs from DIR")
    opt_parser.add_option("-u", "--updaterj", action="store_true", dest="updateRJ", default=False,
        help="Update cache table with running jobs from the main jobs table")
    opt_parser.add_option("-g", "--gridjobmap", action="append", dest="gridjobmapfiles", metavar="FILE",
        help="Parse grid-jobmap files so we can find out the grid user for a job")

    # NOTE(review): the group is created without any options; the maintenance
    # dests consulted below are not registered anywhere in this function.
    oneTimeGroup = OptionGroup(opt_parser, "Maintenance options",
        "Following options are/were handy for one-time fixes in database.")
    opt_parser.add_option_group(oneTimeGroup)

    (options, args) = opt_parser.parse_args()
    if args:
        opt_parser.error("Too many arguments")

    Configuration['loglevel'] = options.loglevel

    def maintenance_option(dest):
        # Missing attribute == option not registered == not requested.
        return getattr(options, dest, None)

    # --- one-shot modes: the first one requested runs, then we return ---
    if maintenance_option("findlostjobs"):
        maintenance.findLostJobs()
        return

    if maintenance_option("deletedjobs"):
        maintenance.findDeletedJobs()
        return

    if maintenance_option("runevents"):
        maintenance.checkEventsRunningJobs()
        return

    if options.gridjobmapfiles:
        for i in options.gridjobmapfiles:
            log(LOG_DEBUG, "Grid job map data will be read from file: %s" % i)
            processGridJobMap(openfile(i))
        return

    if options.updateRJ:
        refreshRunningJobs()
        return

    if maintenance_option("removeall"):
        maintenance.removeContent()
        return

    mergenodesfile = maintenance_option("mergenodesfile")
    if mergenodesfile:
        for i in mergenodesfile:
            log(LOG_DEBUG, "Nodes merge data will be read from file: %s" % i)
            maintenance.mergeNodes(openfile(i))
        return

    mergeusersfile = maintenance_option("mergeusersfile")
    if mergeusersfile:
        for i in mergeusersfile:
            log(LOG_DEBUG, "Users merge data will be read from file: %s" % i)
            maintenance.mergeUsers(openfile(i))
        return

    mergegroupsfile = maintenance_option("mergegroupsfile")
    if mergegroupsfile:
        for i in mergegroupsfile:
            log(LOG_DEBUG, "groups merge data will be read from file: %s" % i)
            maintenance.mergeGroups(openfile(i))
        return

    # --- regular modes: data files or daemon, never both ---
    has_data_files = (options.nodexmlfile or options.jobxmlfile
                      or options.eventfile or options.serverfile)

    # invalid combinations
    if has_data_files and options.daemondir:
        opt_parser.error("You cannot run as daemon and process data files at once. Choose only one mode of running.")
    if not (has_data_files or options.daemondir):
        opt_parser.error("Mode of running is missing. Please specify one of -n, -j -e -s or -d.")

    if options.eventfile:
        for i in options.eventfile:
            log(LOG_DEBUG, "opening file %s" % i)
            feedJobsLog(openfile(i))

    if options.nodexmlfile:
        for i in options.nodexmlfile:
            nodesxml = parse(openfile(i))
            feedNodesXML(nodesxml)

    if options.jobxmlfile:
        for i in options.jobxmlfile:
            jobsxml = parse(openfile(i))
            feedJobsXML(jobsxml)

    # Server files are rejected only after the other file kinds have been
    # processed, matching the order of the checks above.
    if options.serverfile:
        log(LOG_ERROR, "Server file parsing is not supported yet")
        sys.exit(-1)

    if options.daemondir:
        runAsDaemon(options.daemondir)