class JobsManager(object): def __init__(self, defaults={} ): """ Constructur: @defaults: default options """ # Command line options parser = OptionParser(option_list=[ make_option("--processes", action="callback", callback=Load(), type="string", dest="processes", default={}, help="List of datasets to be analyzed"), make_option("--load", # special option to load whole configuaration from JSON action="callback",callback=Load(),dest="__opts__", type="string", help="load JSON file with configuration",metavar="CONFIG.json" ), make_option("-n","--njobs",dest="njobs",type="int",default=0, help="number of jobs to run"), make_option("-q","--queue",dest="queue",type="string",default=None, help="LSF queue to use. default: %default"), make_option("--sync-lsf",dest="asyncLsf",action="store_false",default=True, help="Run LSF jobs in sync mode (with -K). This will spawn one thread per job. Use only if you know what you are doing." " default: False"), make_option("--use-tarball",dest="use_tarball",action="store_true",default=True, help="Make a sandbox tarball for the task default: %default"), make_option("--no-use-tarball",dest="useTarball",action="store_false",default=True, help="Do not make a sandbox tarball for the task."), make_option("--stage-to",dest="stageTo",action="store",default=None,type="string", help="Stage output to folder. default: %default"), make_option("--stage-cmd",dest="stageCmd",action="store",default="guess",type="string", help="Stage out command. (use 'guess' to have the script guessing the command from the output folder) default : %default"), make_option("--summary",dest="summary",action="store_true",default=False, help="Print jobs summary and exit"), make_option("-o","--output",dest="output",type="string", default="output.root", help="output file name. default: %default"), make_option("-d","--outputDir",dest="outputDir",type="string", default=None, help="output folder. default: %default"), make_option("-x","--jobEx",dest="jobExe",type="string", default=None, help="job executable. default: %default"), make_option("-c","--cmdLine",dest="cmdLine",type="string", default=None, help="job command line. The script arguments will be prepended. default: %default"), make_option("--dumpCfg", action="store_true", default=False, help="dump configuaration and exit. default: %default"), make_option("-v","--verbose", action="store_true", dest="verbose", default=False, help="default: %default"), make_option("-m","--max-resubmissions",dest="maxResub", type="int",default=3), make_option("-N","--ncpu",dest="ncpu", type="int",default=cpu_count()), make_option("-H","--hadd",dest="hadd",default=False, action="store_true", help="hadd output files when all jobs are finished." ), make_option("-D","--hadd-dateset",dest="hadd_dataset",default=False, action="store_true", help="hadd output per dataset when all jobs are finished." ), make_option("-P","--hadd-process",dest="hadd_process",default=False, action="store_true", help="hadd output per process when all jobs are finished." ), make_option("--dry-run",dest="dry_run",default=False, action="store_true", help="do not actually run the jobs." ), make_option("-C","--cont",dest="cont",default=False, action="store_true", help="continue interrupted task." ), make_option("-b","--batch-system",dest="batchSystem",type="string", default="auto",help="Batch system name. Currently supported: sge lsf, default: %default" ), ] ) # parse the command line (self.options, self.args) = parser.parse_args() self.maxResub = self.options.maxResub if self.options.cmdLine: self.args = self.args+shell_args(str(self.options.cmdLine)) if self.options.jobExe: self.options.jobExe = shell_expand(self.options.jobExe) if not self.args[0] == self.options.jobExe: self.args = [self.options.jobExe]+self.args self.uniqueNames = {} # ------------------------------------------------------------------------------------------------------------------- def __call__(self): """ __call__ Run all jobs. """ if self.options.summary: self.options.dry_run = True self.options.cont = True self.jobFactory = TarballJobFactory(self.options.stageTo,self.options.stageCmd,job_outdir=self.options.outputDir, batchSystem=self.options.batchSystem) self.parallel = Parallel(self.options.ncpu,lsfQueue=self.options.queue,lsfJobName="%s/runJobs" % self.options.outputDir, asyncLsf=self.options.asyncLsf,jobDriver=self.jobFactory,batchSystem=self.options.batchSystem) self.jobs = None if self.options.cont: if self.options.asyncLsf: self.loadLsfMon() else: self.firstRun() self.monitor() self.parallel.stop() # ------------------------------------------------------------------------------------------------------------------- def loadLsfMon(self): with open("%s/task_config.json" % (self.options.outputDir), "r" ) as cfin: task_config = json.loads(cfin.read()) jobs = task_config["jobs"] if self.options.useTarball: if not "tarball" in task_config: print print "You asked to run the jobs using a sandbox tarball, but the tarball name was not found in the task configuration" print " If you specified the --use-tarball now but not in the original submission, please remove it." print " Otherwise the task configuration may have been corrupted." print sys.exit(-1) self.jobFactory.setTarball(task_config["tarball"]) if not self.options.stageTo: self.jobFactory.stageDest( os.path.abspath(self.options.outputDir) ) self.parallel.setJobId(task_config.get("last_job_id",1)) for job in jobs: cmd, args, outfile, nsub, ret, batchId = job if type(batchId) == tuple or type(batchId) == list: jobName,batchId = batchId else: jobName=None if ret != 0 and nsub <= self.options.maxResub: self.parallel.addJob(cmd,args,batchId,jobName) # ------------------------------------------------------------------------------------------------------------------- def firstRun(self): (options,args) = (self.options, self.args) parallel = self.parallel task_config = {} outputPfx = options.output.replace(".root","") if not options.outputDir: sys.exit("\nPlease specify an output folder using the -d option\n") if options.dumpCfg: print ( dumpCfg(options) ) sys.exit(0) if not os.path.exists(options.outputDir): os.mkdir(options.outputDir) outputPfx = "%s/%s" % ( options.outputDir, outputPfx ) args.append("processIdMap=%s/config.json" % os.path.abspath(options.outputDir)) pset = args[0] if not options.jobExe else args[1] with open(pset,"r") as pin: with open("%s/%s" % ( options.outputDir, os.path.basename(pset) ), "w+" ) as pout: pout.write(pin.read()) pout.close() if not options.jobExe: os.chmod( "%s/%s" % ( options.outputDir, os.path.basename(pset)), 0755 ) pin.close() pset = "%s/%s" % ( options.outputDir, os.path.basename(pset) ) pset = os.path.abspath(pset) if options.useTarball: apset = os.path.abspath(pset) self.jobFactory.mkTarball("%s/sandbox.tgz" % os.path.abspath(options.outputDir), tarball_entries=[apset,"python","lib","bin"],tarball_patterns={"src/*":"data"}, tarball_transform="'s,%s,pset.py,'" % (apset.lstrip("/")) ) if not options.queue: print "\nWARNING: You specified the --use-tarball option but no batch queue. The tarball was created but the jobs won't actually use it." print " To avoid this printout run with --no-use-tarball or specify a batch queue using the --queue option.\n" options.useTarball = False task_config["tarball"] = self.jobFactory.tarball if not options.stageTo: self.jobFactory.stageDest( os.path.abspath(options.outputDir) ) options.stageTo = os.path.abspath(options.outputDir) print "\nWill stage output to %s using the command '%s'\n" % ( self.jobFactory.stage_dest, self.jobFactory.getStageCmd() ) if options.jobExe: args[1] = pset else: args[0] = pset with open("%s/config.json" % (options.outputDir), "w+" ) as fout: fout.write( dumpCfg(options,skip=["dry_run","summary"]) ) # store cmdLine options.cmdLine = str(" ".join(args)) outfiles = [] doutfiles = {} poutfiles = {} jobs = [] for name,datasets in options.processes.iteritems(): poutfiles[name] = ( "%s_%s.root" % ( outputPfx,name), [] ) for dset in datasets: job = args[0] if self.options.jobExe: pyjob = "" else: pyjob = job if type(dset) == list: dset,dopts = dset else: dopts = {} jobargs = copy(args[1:]) dsetName = dset.lstrip("/").replace("/","_") dsetName = self.getUniqueName(dsetName) outfile = "%s_%s.root" % ( outputPfx, dsetName ) doutfiles[dsetName] = ( str(outfile),[] ) jobargs.extend( ["dataset=%s" % dset, "outputFile=%s" % outfile ] ) # add (and replace) per-dataset job arguments dargs = dopts.get("args",[]) if type(dargs) != list: print "\nERROR : dataset-specific arguments should be list not %s" % (type(dargs)) print " dataset %s" % dset sys.exit(-1) if len(dargs) > 0: replace = {} for arg in dargs: aname,val = arg.split("=") replace[aname] = arg newargs = [] anames = [] for arg in jobargs: if not "=" in arg: newargs.append(arg) continue aname,val = arg.split("=") if aname in replace: newargs.append( replace.pop(aname) ) else: newargs.append(arg) jobargs = newargs for aname,arg in replace.iteritems(): jobargs.append(arg) print "running: %s %s" % ( job, " ".join(jobargs) ) njobs = dopts.get("njobs",options.njobs) if options.njobs != 0 else 0 if njobs != 0: print "splitting in (up to) %d jobs\n checking how many are needed... " % njobs, dnjobs = 0 dargs = jobargs+shell_args("nJobs=%d" % (njobs)) ret,out = parallel.run("python %s" % pyjob,dargs+shell_args("dryRun=1 getMaxJobs=1 dumpPython=%s.py" % os.path.join(options.outputDir,dsetName) ),interactive=True)[2] maxJobs = self.getMaxJobs(out) print maxJobs if maxJobs < 0: print "Error getting number of jobs to be submitted" print out hadd = self.getHadd(out,outfile) print " now submitting jobs", for ijob in range(maxJobs): ## FIXME allow specific job selection iargs = jobargs+shell_args("nJobs=%d jobId=%d" % (maxJobs, ijob)) dnjobs += 1 batchId = -1 if not options.dry_run: ret,out = parallel.run(job,iargs)[-1] if self.options.queue and self.options.asyncLsf: batchId = out[1] print ".", output = hadd.replace(".root","_%d.root" % ijob) outfiles.append( output ) doutfiles[dsetName][1].append( outfiles[-1] ) poutfiles[name][1].append( outfiles[-1] ) jobs.append( (job,iargs,output,0,-1,batchId) ) print "\n %d jobs submitted" % dnjobs else: ret,out = parallel.run("python %s" % pyjob,jobargs+shell_args("dryRun=1 dumpPython=%s.py" % os.path.join(options.outputDir,dsetName)),interactive=True)[2] if ret != 0: print ret,out continue output = self.getHadd(out,outfile) batchId = -1 if not options.dry_run: ret,out = parallel.run(job,jobargs)[-1] if self.options.queue and self.options.asyncLsf: batchId = out[1] outfiles.append( output ) jobs.append( (job,jobargs,output,0,-1,batchId) ) poutfiles[name][1].append( outfiles[-1] ) print task_config["jobs"] = jobs task_config["datasets_output"] = doutfiles task_config["process_output"] = poutfiles task_config["output"] = outfiles task_config["outputPfx"] = outputPfx self.storeTaskConfig(task_config) # ------------------------------------------------------------------------------------------------------------------- def storeTaskConfig(self,task_config): with open("%s/task_config.json" % (self.options.outputDir), "w+" ) as cfout: task_config["last_job_id"] = self.parallel.currJobId() cfout.write( json.dumps(task_config,indent=4) ) cfout.close() # ------------------------------------------------------------------------------------------------------------------- def getUniqueName(self,basename): if basename in self.uniqueNames: self.uniqueNames[basename] += 1 else: self.uniqueNames[basename] = 0 return basename return "%s%d" % (basename,self.uniqueNames[basename]) # ------------------------------------------------------------------------------------------------------------------- def monitor(self): (options,args) = (self.options, self.args) parallel = self.parallel with open("%s/task_config.json" % (options.outputDir), "r" ) as cfin: task_config = json.loads(cfin.read()) doutfiles = task_config["datasets_output"] poutfiles = task_config["process_output"] outfiles = task_config["output"] outputPfx = task_config["outputPfx"] self.task_config = task_config if options.summary: self.printSummary() return if not options.dry_run: ## FIXME: job resubmission returns = self.wait(parallel,self) if options.hadd: print "All jobs finished. Merging output." p = Parallel(options.ncpu) hadd = "hadd -f " if options.hadd_process: for proc,out in poutfiles.iteritems(): outfile,outfiles = out p.run("%s %s" % (hadd, outfile), outfiles ) if options.hadd_dataset: if options.hadd_process: hadd += " -T" for dset,out in doutfiles.iteritems(): outfile,outfiles = out p.run("%s %s" % (hadd,outfile), outfiles) if not (options.hadd_process or options.hadd_dataset): p.run("%s %s.root" % (hadd,outputPfx), outfiles) self.wait(p) self.storeTaskConfig(task_config) self.parallel.stop() # ------------------------------------------------------------------------------------------------------------------- def wait(self,parallel,handler=None): return parallel.wait(handler) # ------------------------------------------------------------------------------------------------------------------- def handleJobOutput(self,job,jobargs,ret): print "------------" print "Job finished: (exit code %d) '%s' '%s'" % ( ret[0], job, " ".join(jobargs) ) print "Job output: " print lines = ret[1].split("\n") if self.options.queue and self.options.asyncLsf: lines = lines[-10:] for line in lines: print line print jobargs = shell_args(" ".join(jobargs)) job = jobargs[0] jobargs = jobargs[1:] for ijob in self.task_config["jobs"]: inam,iargs = ijob[0:2] if inam == job and iargs == jobargs: ijob[4] = ret[0] if ret[0] != 0: print "" print "Job failed. Number of resubmissions: %d / %d. " % (ijob[3], self.maxResub), if ijob[3] < self.maxResub: print "Resubmitting." ijob[3] += 1 if ijob[3] == self.maxResub: iargs.append("lastAttempt=1") jobName = ijob[5][0] if self.options.queue else None out = self.parallel.run(inam,iargs,jobName=jobName) if self.options.queue and self.options.asyncLsf: ijob[5] = out[-1][1][1] self.storeTaskConfig(self.task_config) print "------------" return 1 else: print "Giving up." self.storeTaskConfig(self.task_config) print "------------" return 0 # ------------------------------------------------------------------------------------------------------------------- def getHadd(self,stg,fallback): for line in stg.split("\n"): if line.startswith("hadd:"): return line.replace("hadd:","") return fallback # ------------------------------------------------------------------------------------------------------------------- def getMaxJobs(self,stg): for line in stg.split("\n"): if line.startswith("maxJobs:"): return int(line.replace("maxJobs:","")) return -1 # ------------------------------------------------------------------------------------------------------------------- def printSummary(self): jobs = self.task_config["jobs"] procs = self.task_config["datasets_output"] status = {} for job in jobs: cmd, args, outfile, nsub, ret, batchId = job status[outfile] = (nsub,ret) for proc,out in procs.iteritems(): outfile,outfiles = out finished = [] missing = {} for jfile in outfiles: nsub,ret = status[jfile] if ret != 0: if not nsub in missing: missing[nsub] = [] missing[nsub].append( jfile ) else: finished.append(jfile) print "----------" print "process: %s " % outfile.replace(".root","") print "njobs: %d " % len(outfiles) print "finished: %d " % len(finished) for nsub,lst in missing.iteritems(): print "submitted %d times: %d" % (nsub+1, len(lst)) print