def monitor(self):
    """Monitor submitted jobs and merge their output when they finish.

    Reloads the task bookkeeping from <outputDir>/task_config.json,
    optionally prints a summary and returns, otherwise waits for the
    jobs in self.parallel and, if --hadd was requested, merges the
    produced ROOT files per process, per dataset, or globally.
    """
    (options,args) = (self.options, self.args)
    parallel = self.parallel

    # Task bookkeeping was written at submission time; reload it from disk.
    with open("%s/task_config.json" % (options.outputDir), "r" ) as cfin:
        task_config = json.loads(cfin.read())

    doutfiles = task_config["datasets_output"]   # dataset name -> (merged file, [job output files])
    poutfiles = task_config["process_output"]    # process name -> (merged file, [job output files])
    outfiles  = task_config["output"]            # flat list of all job output files
    outputPfx = task_config["outputPfx"]

    # Keep the configuration on the instance: handleJobOutput and
    # printSummary read self.task_config.
    self.task_config = task_config

    if options.summary:
        self.printSummary()
        return

    if not options.dry_run:
        ## FIXME: job resubmission
        returns = self.wait(parallel,self)

    if options.hadd:
        print "All jobs finished. Merging output."
        p = Parallel(options.ncpu)
        hadd = "hadd -f "
        if options.hadd_process:
            # One merged file per process (keys of process_output).
            for proc,out in poutfiles.iteritems():
                outfile,outfiles = out
                p.run("%s %s" % (hadd, outfile), outfiles )
        if options.hadd_dataset:
            # NOTE(review): -T is appended only when the per-process merge
            # already ran -- presumably to avoid re-merging trees; confirm
            # against the hadd documentation.
            if options.hadd_process:
                hadd += " -T"
            for dset,out in doutfiles.iteritems():
                outfile,outfiles = out
                p.run("%s %s" % (hadd,outfile), outfiles)
        if not (options.hadd_process or options.hadd_dataset):
            # Neither grouping requested: single global merge.
            p.run("%s %s.root" % (hadd,outputPfx), outfiles)
        self.wait(p)

    # Persist any state changed while waiting (e.g. resubmission counts).
    self.storeTaskConfig(task_config)
    self.parallel.stop()
def monitor(self):
    """Wait for the submitted jobs and optionally hadd-merge their output.

    Restores the job list from <outputDir>/task_config.json, blocks until
    self.parallel has drained (unless --dry-run), runs the requested
    merges and writes the (possibly updated) task configuration back.
    """
    (options,args) = (self.options, self.args)
    parallel = self.parallel

    with open("%s/task_config.json" % (options.outputDir), "r" ) as cfin:
        task_config = json.loads(cfin.read())

    doutfiles = task_config["datasets_output"]   # dataset name -> (merged file, [job output files])
    poutfiles = task_config["process_output"]    # process name -> (merged file, [job output files])
    outfiles  = task_config["output"]            # flat list of all job output files
    outputPfx = task_config["outputPfx"]

    if not options.dry_run:
        ## FIXME: job resubmission
        # handleJobOutput mutates self.jobs while we wait, so mirror the
        # job list onto the instance and copy it back afterwards.
        self.jobs = task_config["jobs"]
        returns = self.wait(parallel,self)
        task_config["jobs"] = self.jobs

    if options.hadd:
        print "All jobs finished. Merging output."
        p = Parallel(options.ncpu)
        hadd = "hadd -f "
        if options.hadd_process:
            # One merged file per process.
            for proc,out in poutfiles.iteritems():
                outfile,outfiles = out
                p.run("%s %s" % (hadd, outfile), outfiles )
        if options.hadd_dataset:
            # NOTE(review): -T presumably avoids re-merging trees already
            # combined by the per-process merge -- confirm with hadd docs.
            if options.hadd_process:
                hadd += " -T"
            for dset,out in doutfiles.iteritems():
                outfile,outfiles = out
                p.run("%s %s" % (hadd,outfile), outfiles)
        if not (options.hadd_process or options.hadd_dataset):
            # Single global merge of every job output.
            p.run("%s %s.root" % (hadd,outputPfx), outfiles)
        self.wait(p)

    # Write back the updated bookkeeping (resubmission counts, return codes).
    with open("%s/task_config.json" % (options.outputDir), "w+" ) as cfout:
        cfout.write( json.dumps(task_config,indent=4) )
        cfout.close()

    self.parallel.stop()
def monitor(self): (options, args) = (self.options, self.args) parallel = self.parallel with open("%s/task_config.json" % (options.outputDir), "r") as cfin: task_config = json.loads(cfin.read()) doutfiles = task_config["datasets_output"] poutfiles = task_config["process_output"] outfiles = task_config["output"] outputPfx = task_config["outputPfx"] if not options.dry_run: ## FIXME: job resubmission self.jobs = task_config["jobs"] returns = self.wait(parallel, self) task_config["jobs"] = self.jobs if options.hadd: print "All jobs finished. Merging output." p = Parallel(options.ncpu) hadd = "hadd -f " if options.hadd_process: for proc, out in poutfiles.iteritems(): outfile, outfiles = out p.run("%s %s" % (hadd, outfile), outfiles) if options.hadd_dataset: if options.hadd_process: hadd += " -T" for dset, out in doutfiles.iteritems(): outfile, outfiles = out p.run("%s %s" % (hadd, outfile), outfiles) if not (options.hadd_process or options.hadd_dataset): p.run("%s %s.root" % (hadd, outputPfx), outfiles) self.wait(p) with open("%s/task_config.json" % (options.outputDir), "w+") as cfout: cfout.write(json.dumps(task_config, indent=4)) cfout.close() self.parallel.stop()
def run(self, genome_files, output_dir, called_genes=False, translation_table=None, meta=False, closed_ends=False):
    """Call genes on the given nucleotide FASTA files with Prodigal.

    The run configuration is recorded on the instance so the producer and
    consumer callbacks can read it; genomes are then processed in parallel
    and per-genome summary statistics are returned. If processing fails
    (empty result), the output directory is removed before returning.

    Parameters
    ----------
    genome_files : list of str
        Nucleotide fasta files to call genes on.
    output_dir : str
        Directory to store called genes.
    called_genes : boolean
        Flag indicating if genes are already called.
    translation_table : int
        Desired translation table; None selects between tables 4 and 11.
    meta : boolean
        Flag indicating if prodigal should use the metagenomics procedure.
    closed_ends : boolean
        If True, do not allow genes to run off edges (throws -c flag).

    Returns
    -------
    d[genome_id] -> namedtuple(best_translation_table coding_density_4 coding_density_11)
        Summary statistics of called genes for each genome.
    """
    # Stash the configuration on the instance for the worker callbacks.
    self.called_genes = called_genes
    self.translation_table = translation_table
    self.meta = meta
    self.closed_ends = closed_ends
    self.output_dir = output_dir
    make_sure_path_exists(self.output_dir)

    progress_callback = None
    if self.verbose:
        if meta:
            # In metagenome mode report per-file progress; use the first
            # input's basename as the label when any files were given.
            label = ntpath.basename(genome_files[0]) if len(genome_files) else 'scaffolds'
            self.progress_str = ' Finished processing %d of %d (%.2f%%) files.'
        else:
            label = 'genomes'
            self.progress_str = ' Finished processing %d of %d (%.2f%%) genomes.'
        self.logger.info('Identifying genes within %s: ' % label)
        progress_callback = self._progress

    worker_pool = Parallel(self.cpus)
    summary_stats = worker_pool.run(self._producer,
                                    self._consumer,
                                    genome_files,
                                    progress_callback)

    # An error was encountered during Prodigal processing, clean up.
    if not summary_stats:
        shutil.rmtree(self.output_dir)

    return summary_stats
class JobsManager(object):
    """Submit, monitor and resubmit batch jobs (LSF/SGE), optionally using
    a sandbox tarball, and hadd-merge the produced ROOT output files.

    All configuration comes from the command line (optparse); task state
    is persisted to <outputDir>/task_config.json between invocations.
    """

    def __init__(self, defaults={} ):
        """
        Constructur:
        @defaults: default options
        """

        # Command line options
        parser = OptionParser(option_list=[
            make_option("--processes", action="callback", callback=Load(), type="string", dest="processes",
                        default={}, help="List of datasets to be analyzed"),
            make_option("--load",  # special option to load whole configuaration from JSON
                        action="callback", callback=Load(), dest="__opts__", type="string",
                        help="load JSON file with configuration", metavar="CONFIG.json"
                        ),
            make_option("-n","--njobs",dest="njobs",type="int",default=0,
                        help="number of jobs to run"),
            make_option("-q","--queue",dest="queue",type="string",default=None,
                        help="LSF queue to use. default: %default"),
            make_option("--sync-lsf",dest="asyncLsf",action="store_false",default=True,
                        help="Run LSF jobs in sync mode (with -K). This will spawn one thread per job. Use only if you know what you are doing."
                        " default: False"),
            # NOTE(review): "--use-tarball" stores into dest="use_tarball" while
            # "--no-use-tarball" (and the rest of the code) uses "useTarball" --
            # looks like a dest mismatch; confirm before relying on --use-tarball.
            make_option("--use-tarball",dest="use_tarball",action="store_true",default=True,
                        help="Make a sandbox tarball for the task default: %default"),
            make_option("--no-use-tarball",dest="useTarball",action="store_false",default=True,
                        help="Do not make a sandbox tarball for the task."),
            make_option("--stage-to",dest="stageTo",action="store",default=None,type="string",
                        help="Stage output to folder. default: %default"),
            make_option("--stage-cmd",dest="stageCmd",action="store",default="guess",type="string",
                        help="Stage out command. (use 'guess' to have the script guessing the command from the output folder) default : %default"),
            make_option("--summary",dest="summary",action="store_true",default=False,
                        help="Print jobs summary and exit"),
            make_option("-o","--output",dest="output",type="string",
                        default="output.root", help="output file name. default: %default"),
            make_option("-d","--outputDir",dest="outputDir",type="string",
                        default=None, help="output folder. default: %default"),
            make_option("-x","--jobEx",dest="jobExe",type="string",
                        default=None, help="job executable. default: %default"),
            make_option("-c","--cmdLine",dest="cmdLine",type="string",
                        default=None, help="job command line. The script arguments will be prepended. default: %default"),
            make_option("--dumpCfg",
                        action="store_true",
                        default=False,
                        help="dump configuaration and exit. default: %default"),
            make_option("-v","--verbose",
                        action="store_true", dest="verbose",
                        default=False,
                        help="default: %default"),
            make_option("-m","--max-resubmissions",dest="maxResub", type="int",default=3),
            make_option("-N","--ncpu",dest="ncpu", type="int",default=cpu_count()),
            make_option("-H","--hadd",dest="hadd",default=False, action="store_true",
                        help="hadd output files when all jobs are finished."
                        ),
            make_option("-D","--hadd-dateset",dest="hadd_dataset",default=False, action="store_true",
                        help="hadd output per dataset when all jobs are finished."
                        ),
            make_option("-P","--hadd-process",dest="hadd_process",default=False, action="store_true",
                        help="hadd output per process when all jobs are finished."
                        ),
            make_option("--dry-run",dest="dry_run",default=False, action="store_true",
                        help="do not actually run the jobs."
                        ),
            make_option("-C","--cont",dest="cont",default=False, action="store_true",
                        help="continue interrupted task."
                        ),
            make_option("-b","--batch-system",dest="batchSystem",type="string",
                        default="auto", help="Batch system name. Currently supported: sge lsf, default: %default"
                        ),
            ]
                              )

        # parse the command line
        (self.options, self.args) = parser.parse_args()
        self.maxResub = self.options.maxResub

        # Extra job command line arguments are appended to the positional args.
        if self.options.cmdLine:
            self.args = self.args+shell_args(str(self.options.cmdLine))

        # Make sure the job executable is the first argument.
        if self.options.jobExe:
            self.options.jobExe = shell_expand(self.options.jobExe)
            if not self.args[0] == self.options.jobExe:
                self.args = [self.options.jobExe]+self.args

        # Used by getUniqueName to disambiguate repeated dataset names.
        self.uniqueNames = {}

    # -------------------------------------------------------------------------------------------------------------------
    def __call__(self):
        """
        __call__
        Run all jobs.
        """
        # --summary implies: do not run anything, just reload and report.
        if self.options.summary:
            self.options.dry_run = True
            self.options.cont = True

        self.jobFactory = TarballJobFactory(self.options.stageTo,self.options.stageCmd,job_outdir=self.options.outputDir,
                                            batchSystem=self.options.batchSystem)
        self.parallel = Parallel(self.options.ncpu,lsfQueue=self.options.queue,lsfJobName="%s/runJobs" % self.options.outputDir,
                                 asyncLsf=self.options.asyncLsf,jobDriver=self.jobFactory,batchSystem=self.options.batchSystem)

        self.jobs = None
        if self.options.cont:
            # Continuing an interrupted task: re-attach to async LSF jobs.
            if self.options.asyncLsf:
                self.loadLsfMon()
        else:
            self.firstRun()

        self.monitor()
        self.parallel.stop()

    # -------------------------------------------------------------------------------------------------------------------
    def loadLsfMon(self):
        """Re-attach to previously submitted (async LSF) jobs from the
        persisted task configuration and re-queue unfinished ones."""

        with open("%s/task_config.json" % (self.options.outputDir), "r" ) as cfin:
            task_config = json.loads(cfin.read())
        jobs = task_config["jobs"]

        if self.options.useTarball:
            if not "tarball" in task_config:
                print
                print "You asked to run the jobs using a sandbox tarball, but the tarball name was not found in the task configuration"
                print " If you specified the --use-tarball now but not in the original submission, please remove it."
                print " Otherwise the task configuration may have been corrupted."
                print
                sys.exit(-1)
            self.jobFactory.setTarball(task_config["tarball"])
            if not self.options.stageTo:
                self.jobFactory.stageDest( os.path.abspath(self.options.outputDir) )

        # Resume job-id numbering where the previous run left off.
        self.parallel.setJobId(task_config.get("last_job_id",1))

        for job in jobs:
            cmd, args, outfile, nsub, ret, batchId = job
            # batchId may be stored as (jobName, batchId) or a bare id.
            if type(batchId) == tuple or type(batchId) == list:
                jobName,batchId = batchId
            else:
                jobName=None
            # Only re-attach jobs that have not succeeded and are still
            # within the resubmission budget.
            if ret != 0 and nsub <= self.options.maxResub:
                self.parallel.addJob(cmd,args,batchId,jobName)

    # -------------------------------------------------------------------------------------------------------------------
    def firstRun(self):
        """First submission of the task: prepare the output folder, the
        sandbox tarball (optional), split datasets into jobs, submit them
        and persist the bookkeeping to task_config.json."""
        (options,args) = (self.options, self.args)
        parallel = self.parallel

        task_config = {}

        outputPfx = options.output.replace(".root","")

        if not options.outputDir:
            sys.exit("\nPlease specify an output folder using the -d option\n")

        if options.dumpCfg:
            print ( dumpCfg(options) )
            sys.exit(0)

        if not os.path.exists(options.outputDir):
            os.mkdir(options.outputDir)
        outputPfx = "%s/%s" % ( options.outputDir, outputPfx )

        args.append("processIdMap=%s/config.json" % os.path.abspath(options.outputDir))

        # Copy the job configuration file (pset) into the output folder so
        # the task is self-contained; it is args[0] unless a separate job
        # executable was given (then it is args[1]).
        pset = args[0] if not options.jobExe else args[1]
        with open(pset,"r") as pin:
            with open("%s/%s" % ( options.outputDir, os.path.basename(pset) ), "w+" ) as pout:
                pout.write(pin.read())
                pout.close()
            if not options.jobExe:
                # The pset itself is the executable: make it runnable.
                os.chmod( "%s/%s" % ( options.outputDir, os.path.basename(pset)), 0755 )
            pin.close()
        pset = "%s/%s" % ( options.outputDir, os.path.basename(pset) )
        pset = os.path.abspath(pset)

        if options.useTarball:
            apset = os.path.abspath(pset)
            self.jobFactory.mkTarball("%s/sandbox.tgz" % os.path.abspath(options.outputDir),
                                      tarball_entries=[apset,"python","lib","bin"],tarball_patterns={"src/*":"data"},
                                      tarball_transform="'s,%s,pset.py,'" % (apset.lstrip("/"))
                                      )
            if not options.queue:
                print "\nWARNING: You specified the --use-tarball option but no batch queue. The tarball was created but the jobs won't actually use it."
                print " To avoid this printout run with --no-use-tarball or specify a batch queue using the --queue option.\n"
                options.useTarball = False
            task_config["tarball"] = self.jobFactory.tarball

            if not options.stageTo:
                # Default staging destination is the task output folder.
                self.jobFactory.stageDest( os.path.abspath(options.outputDir) )
                options.stageTo = os.path.abspath(options.outputDir)
            print "\nWill stage output to %s using the command '%s'\n" % ( self.jobFactory.stage_dest, self.jobFactory.getStageCmd() )

        # Point the job arguments at the copied pset.
        if options.jobExe:
            args[1] = pset
        else:
            args[0] = pset

        with open("%s/config.json" % (options.outputDir), "w+" ) as fout:
            fout.write( dumpCfg(options,skip=["dry_run","summary"]) )

        # store cmdLine
        options.cmdLine = str(" ".join(args))

        outfiles = []
        doutfiles = {}
        poutfiles = {}
        jobs = []
        for name,datasets in options.processes.iteritems():
            poutfiles[name] = ( "%s_%s.root" % ( outputPfx,name), [] )

            for dset in datasets:
                job = args[0]
                if self.options.jobExe:
                    pyjob = ""
                else:
                    pyjob = job
                # A dataset entry may be [name, options-dict] or a bare name.
                if type(dset) == list:
                    dset,dopts = dset
                else:
                    dopts = {}
                jobargs = copy(args[1:])
                dsetName = dset.lstrip("/").replace("/","_")
                dsetName = self.getUniqueName(dsetName)
                outfile = "%s_%s.root" % ( outputPfx, dsetName )
                doutfiles[dsetName] = ( str(outfile),[] )
                jobargs.extend( ["dataset=%s" % dset, "outputFile=%s" % outfile ] )

                # add (and replace) per-dataset job arguments
                dargs = dopts.get("args",[])
                if type(dargs) != list:
                    print "\nERROR : dataset-specific arguments should be list not %s" % (type(dargs))
                    print " dataset %s" % dset
                    sys.exit(-1)
                if len(dargs) > 0:
                    # Per-dataset "key=value" args override same-key common args.
                    replace = {}
                    for arg in dargs:
                        aname,val = arg.split("=")
                        replace[aname] = arg
                    newargs = []
                    anames = []
                    for arg in jobargs:
                        if not "=" in arg:
                            newargs.append(arg)
                            continue
                        aname,val = arg.split("=")
                        if aname in replace:
                            newargs.append( replace.pop(aname) )
                        else:
                            newargs.append(arg)
                    jobargs = newargs
                    # Remaining per-dataset args had no counterpart: append them.
                    for aname,arg in replace.iteritems():
                        jobargs.append(arg)

                print "running: %s %s" % ( job, " ".join(jobargs) )
                njobs = dopts.get("njobs",options.njobs) if options.njobs != 0 else 0
                if njobs != 0:
                    print "splitting in (up to) %d jobs\n checking how many are needed... " % njobs,
                    dnjobs = 0
                    dargs = jobargs+shell_args("nJobs=%d" % (njobs))
                    # Dry-run the job once to learn how many splits it wants.
                    ret,out = parallel.run("python %s" % pyjob,dargs+shell_args("dryRun=1 getMaxJobs=1 dumpPython=%s.py" % os.path.join(options.outputDir,dsetName) ),interactive=True)[2]
                    maxJobs = self.getMaxJobs(out)
                    print maxJobs
                    if maxJobs < 0:
                        print "Error getting number of jobs to be submitted"
                        print out
                    hadd = self.getHadd(out,outfile)
                    print " now submitting jobs",
                    for ijob in range(maxJobs):
                        ## FIXME allow specific job selection
                        iargs = jobargs+shell_args("nJobs=%d jobId=%d" % (maxJobs, ijob))
                        dnjobs += 1
                        batchId = -1
                        if not options.dry_run:
                            ret,out = parallel.run(job,iargs)[-1]
                            if self.options.queue and self.options.asyncLsf:
                                batchId = out[1]
                            print ".",
                        output = hadd.replace(".root","_%d.root" % ijob)
                        outfiles.append( output )
                        doutfiles[dsetName][1].append( outfiles[-1] )
                        poutfiles[name][1].append( outfiles[-1] )
                        # Job record: (cmd, args, outfile, nsub, ret, batchId).
                        jobs.append( (job,iargs,output,0,-1,batchId) )
                    print "\n %d jobs submitted" % dnjobs
                else:
                    # No splitting: single job per dataset.
                    ret,out = parallel.run("python %s" % pyjob,jobargs+shell_args("dryRun=1 dumpPython=%s.py" % os.path.join(options.outputDir,dsetName)),interactive=True)[2]
                    if ret != 0:
                        print ret,out
                        continue
                    output = self.getHadd(out,outfile)
                    batchId = -1
                    if not options.dry_run:
                        ret,out = parallel.run(job,jobargs)[-1]
                        if self.options.queue and self.options.asyncLsf:
                            batchId = out[1]
                    outfiles.append( output )
                    jobs.append( (job,jobargs,output,0,-1,batchId) )
                    poutfiles[name][1].append( outfiles[-1] )
                print

        task_config["jobs"] = jobs
        task_config["datasets_output"] = doutfiles
        task_config["process_output"] = poutfiles
        task_config["output"] = outfiles
        task_config["outputPfx"] = outputPfx

        self.storeTaskConfig(task_config)

    # -------------------------------------------------------------------------------------------------------------------
    def storeTaskConfig(self,task_config):
        """Persist the task configuration (plus the current job-id counter)
        to <outputDir>/task_config.json."""
        with open("%s/task_config.json" % (self.options.outputDir), "w+" ) as cfout:
            task_config["last_job_id"] = self.parallel.currJobId()
            cfout.write( json.dumps(task_config,indent=4) )
            cfout.close()

    # -------------------------------------------------------------------------------------------------------------------
    def getUniqueName(self,basename):
        """Return basename the first time it is seen, then basename+counter
        on subsequent calls with the same name."""
        if basename in self.uniqueNames:
            self.uniqueNames[basename] += 1
        else:
            self.uniqueNames[basename] = 0
            return basename
        return "%s%d" % (basename,self.uniqueNames[basename])

    # -------------------------------------------------------------------------------------------------------------------
    def monitor(self):
        """Wait for all submitted jobs and run the requested hadd merges;
        with --summary just print the task status and return."""
        (options,args) = (self.options, self.args)
        parallel = self.parallel

        with open("%s/task_config.json" % (options.outputDir), "r" ) as cfin:
            task_config = json.loads(cfin.read())

        doutfiles = task_config["datasets_output"]   # dataset -> (merged file, [job files])
        poutfiles = task_config["process_output"]    # process -> (merged file, [job files])
        outfiles = task_config["output"]
        outputPfx = task_config["outputPfx"]

        # handleJobOutput and printSummary read self.task_config.
        self.task_config = task_config

        if options.summary:
            self.printSummary()
            return

        if not options.dry_run:
            ## FIXME: job resubmission
            returns = self.wait(parallel,self)

        if options.hadd:
            print "All jobs finished. Merging output."
            p = Parallel(options.ncpu)
            hadd = "hadd -f "
            if options.hadd_process:
                for proc,out in poutfiles.iteritems():
                    outfile,outfiles = out
                    p.run("%s %s" % (hadd, outfile), outfiles )
            if options.hadd_dataset:
                # NOTE(review): -T presumably avoids re-merging trees already
                # combined per process -- confirm with hadd docs.
                if options.hadd_process:
                    hadd += " -T"
                for dset,out in doutfiles.iteritems():
                    outfile,outfiles = out
                    p.run("%s %s" % (hadd,outfile), outfiles)
            if not (options.hadd_process or options.hadd_dataset):
                p.run("%s %s.root" % (hadd,outputPfx), outfiles)
            self.wait(p)

        self.storeTaskConfig(task_config)
        self.parallel.stop()

    # -------------------------------------------------------------------------------------------------------------------
    def wait(self,parallel,handler=None):
        """Block until all jobs in `parallel` finish; per-job results are
        dispatched to handler.handleJobOutput when a handler is given."""
        return parallel.wait(handler)

    # -------------------------------------------------------------------------------------------------------------------
    def handleJobOutput(self,job,jobargs,ret):
        """Callback invoked when a job returns.

        ret is (exit code, output text). Prints the (tail of the) job
        output, records the exit code in the matching task_config job
        entry and resubmits failed jobs up to maxResub times.
        Returns 1 if the job was resubmitted, 0 otherwise.
        """
        print "------------"
        print "Job finished: (exit code %d) '%s' '%s'" % ( ret[0], job, " ".join(jobargs) )
        print "Job output: "
        print
        lines = ret[1].split("\n")
        # Async LSF output can be long: show only the last few lines.
        if self.options.queue and self.options.asyncLsf:
            lines = lines[-10:]
        for line in lines:
            print line
        print

        # Normalize through shell_args so the comparison below matches the
        # form stored in task_config.
        jobargs = shell_args(" ".join(jobargs))
        job = jobargs[0]
        jobargs = jobargs[1:]
        for ijob in self.task_config["jobs"]:
            inam,iargs = ijob[0:2]
            if inam == job and iargs == jobargs:
                # ijob layout: [cmd, args, outfile, nsub, ret, batchId].
                ijob[4] = ret[0]
                if ret[0] != 0:
                    print ""
                    print "Job failed. Number of resubmissions: %d / %d. " % (ijob[3], self.maxResub),
                    if ijob[3] < self.maxResub:
                        print "Resubmitting."
                        ijob[3] += 1
                        if ijob[3] == self.maxResub:
                            # Let the job know this is its final attempt.
                            iargs.append("lastAttempt=1")
                        jobName = ijob[5][0] if self.options.queue else None
                        out = self.parallel.run(inam,iargs,jobName=jobName)
                        if self.options.queue and self.options.asyncLsf:
                            # Store the new batch id for the resubmitted job.
                            ijob[5] = out[-1][1][1]
                        self.storeTaskConfig(self.task_config)
                        print "------------"
                        return 1
                    else:
                        print "Giving up."
                        self.storeTaskConfig(self.task_config)
                        print "------------"
                        return 0

    # -------------------------------------------------------------------------------------------------------------------
    def getHadd(self,stg,fallback):
        """Extract the 'hadd:' line from the job dry-run output; return
        `fallback` if none is found."""
        for line in stg.split("\n"):
            if line.startswith("hadd:"):
                return line.replace("hadd:","")
        return fallback

    # -------------------------------------------------------------------------------------------------------------------
    def getMaxJobs(self,stg):
        """Extract the 'maxJobs:' count from the job dry-run output;
        return -1 if it is missing."""
        for line in stg.split("\n"):
            if line.startswith("maxJobs:"):
                return int(line.replace("maxJobs:",""))
        return -1

    # -------------------------------------------------------------------------------------------------------------------
    def printSummary(self):
        """Print per-process counts of finished and still-failing jobs,
        grouped by number of submissions."""
        jobs = self.task_config["jobs"]
        procs = self.task_config["datasets_output"]

        status = {}
        for job in jobs:
            cmd, args, outfile, nsub, ret, batchId = job
            status[outfile] = (nsub,ret)

        for proc,out in procs.iteritems():
            outfile,outfiles = out
            finished = []
            missing = {}
            for jfile in outfiles:
                nsub,ret = status[jfile]
                if ret != 0:
                    # Bucket unfinished jobs by how often they were submitted.
                    if not nsub in missing:
                        missing[nsub] = []
                    missing[nsub].append( jfile )
                else:
                    finished.append(jfile)
            print "----------"
            print "process: %s " % outfile.replace(".root","")
            print "njobs: %d " % len(outfiles)
            print "finished: %d " % len(finished)
            for nsub,lst in missing.iteritems():
                print "submitted %d times: %d" % (nsub+1, len(lst))
            print
class JobsManager(object):
    """Submit and monitor analysis jobs (optionally via LSF) and
    hadd-merge the produced ROOT output files.

    Configuration comes from the command line (optparse); the job
    bookkeeping is persisted to <outputDir>/task_config.json.
    """

    def __init__(self, defaults={}):
        """
        Constructur:
        @defaults: default options
        """
        # Command line options
        parser = OptionParser(option_list=[
            make_option("--processes",
                        action="callback",
                        callback=Load(),
                        type="string",
                        dest="processes",
                        default={},
                        help="List of datasets to be analyzed"),
            make_option(
                "--load",  # special option to load whole configuaration from JSON
                action="callback",
                callback=Load(),
                dest="__opts__",
                type="string",
                help="load JSON file with configuration",
                metavar="CONFIG.json"),
            make_option("-n",
                        "--njobs",
                        dest="njobs",
                        type="int",
                        default=0,
                        help="number of jobs to run"),
            make_option("-q",
                        "--queue",
                        dest="queue",
                        type="string",
                        default=None,
                        help="LSF queue to use. default: %default"),
            make_option("-o",
                        "--output",
                        dest="output",
                        type="string",
                        default="output.root",
                        help="output file name. default: %default"),
            make_option("-d",
                        "--outputDir",
                        dest="outputDir",
                        type="string",
                        default=None,
                        help="output folder. default: %default"),
            make_option("-x",
                        "--jobEx",
                        dest="jobExe",
                        type="string",
                        default=None,
                        help="job executable. default: %default"),
            make_option(
                "-c",
                "--cmdLine",
                dest="cmdLine",
                type="string",
                default=None,
                help=
                "job command line. The script arguments will be prepended. default: %default"
            ),
            make_option(
                "--dumpCfg",
                action="store_true",
                default=False,
                help="dump configuaration and exit. default: %default"),
            make_option("-v",
                        "--verbose",
                        action="store_true",
                        dest="verbose",
                        default=False,
                        help="default: %default"),
            make_option("-m",
                        "--max-resubmissions",
                        dest="maxResub",
                        type="int",
                        default=3),
            make_option(
                "-N", "--ncpu", dest="ncpu", type="int", default=cpu_count()),
            make_option("-H",
                        "--hadd",
                        dest="hadd",
                        default=False,
                        action="store_true",
                        help="hadd output files when all jobs are finished."),
            make_option(
                "-D",
                "--hadd-dateset",
                dest="hadd_dataset",
                default=False,
                action="store_true",
                help="hadd output per dataset when all jobs are finished."),
            make_option(
                "-P",
                "--hadd-process",
                dest="hadd_process",
                default=False,
                action="store_true",
                help="hadd output per process when all jobs are finished."),
            make_option("--dry-run",
                        dest="dry_run",
                        default=False,
                        action="store_true",
                        help="do not actually run the jobs."),
            make_option("-C",
                        "--cont",
                        dest="cont",
                        default=False,
                        action="store_true",
                        help="continue interrupted task."),
        ])

        # parse the command line
        (self.options, self.args) = parser.parse_args()
        self.maxResub = self.options.maxResub

        # Extra job command line arguments are appended to the positional args.
        if self.options.cmdLine:
            self.args = self.args + shell_args(str(self.options.cmdLine))

        # Prepend the (expanded) job executable so it is args[0].
        if self.options.jobExe:
            self.args = [shell_expand(self.options.jobExe)] + self.args

    def __call__(self):
        """
        __call__
        Run all jobs.
        """
        self.parallel = Parallel(self.options.ncpu,
                                 lsfQueue=self.options.queue,
                                 lsfJobName="%s/runJobs" % self.options.outputDir,
                                 asyncLsf=False)

        self.jobs = None
        if self.options.cont:
            # Continuing an interrupted task: nothing to submit, go
            # straight to monitoring.
            pass
        else:
            self.firstRun()

        self.monitor()

    def firstRun(self):
        """First submission of the task: prepare the output folder, split
        datasets into jobs, submit them and persist the bookkeeping to
        task_config.json."""
        (options, args) = (self.options, self.args)
        parallel = self.parallel

        outputPfx = options.output.replace(".root", "")

        if not options.outputDir:
            sys.exit("Please specify an output folder")

        if options.dumpCfg:
            print(dumpCfg(options))
            sys.exit(0)

        if not os.path.exists(options.outputDir):
            os.mkdir(options.outputDir)
        outputPfx = "%s/%s" % (options.outputDir, outputPfx)

        args.append("processIdMap=%s/config.json" % options.outputDir)

        ## options.cmdLine += " %s" % (" ".join(args))
        options.cmdLine = str(" ".join(args))
        with open("%s/config.json" % (options.outputDir), "w+") as fout:
            fout.write(dumpCfg(options))

        outfiles = []
        doutfiles = {}
        poutfiles = {}
        jobs = []
        for name, datasets in options.processes.iteritems():
            poutfiles[name] = ("%s_%s.root" % (outputPfx, name), [])

            for dset in datasets:
                job = args[0]
                # pyjob is the script to dry-run with "python ..."; empty
                # when a dedicated job executable is used.
                if self.options.jobExe:
                    pyjob = ""
                else:
                    pyjob = job
                jobargs = copy(args[1:])
                dsetName = dset.lstrip("/").replace("/", "_")
                outfile = "%s_%s.root" % (outputPfx, dsetName)
                doutfiles[dset] = (str(outfile), [])
                jobargs.extend(
                    ["dataset=%s" % dset,
                     "outputFile=%s" % outfile])
                print "running: %s %s" % (job, " ".join(jobargs))
                if options.njobs != 0:
                    print "splitting in (up to) %d jobs\n checking how many are needed... " % options.njobs
                    dnjobs = 0
                    dargs = jobargs + shell_args("nJobs=%d" % (options.njobs))
                    # Dry-run once to learn how many splits the job wants.
                    ret, out = parallel.run(
                        "python %s" % pyjob,
                        dargs + shell_args(
                            "dryRun=1 getMaxJobs=1 dumpPython=%s.py" %
                            os.path.join(options.outputDir, dsetName)),
                        interactive=True)[2]
                    maxJobs = self.getMaxJobs(out)
                    if maxJobs < 0:
                        print "Error getting numer of jobs to be submitted"
                        print out
                    hadd = self.getHadd(out, outfile)
                    ## for ijob in range(options.njobs):
                    for ijob in range(maxJobs):
                        ## FIXME allow specific job selection
                        ## iargs = dargs+shell_args("jobId=%d" % (ijob))
                        iargs = jobargs + shell_args("nJobs=%d jobId=%d" %
                                                     (maxJobs, ijob))
                        ## # run python <command-line> dryRun=1 to check if the job needs to be run
                        ## ret,out = parallel.run("python %s" % pyjob,iargs+shell_args("dryRun=1"),interactive=True)[2]
                        ## if ret != 0:
                        ##     continue
                        dnjobs += 1
                        if not options.dry_run:
                            ## FIXME:
                            ##  - handle output
                            ##  - store log files
                            parallel.run(job, iargs)
                        ## outfiles.append( outfile.replace(".root","_%d.root" % ijob) )
                        ## output = self.getHadd(out,outfile.replace(".root","_%d.root" % ijob))
                        output = hadd.replace(".root", "_%d.root" % ijob)
                        outfiles.append(output)
                        doutfiles[dset][1].append(outfiles[-1])
                        poutfiles[name][1].append(outfiles[-1])
                        # Job record: (cmd, args, outfile, nsub, ret).
                        jobs.append((job, iargs, output, 0, -1))
                    print " %d jobs actually submitted" % dnjobs
                else:
                    # No splitting: single job per dataset.
                    ret, out = parallel.run(
                        "python %s" % pyjob,
                        jobargs + shell_args(
                            "dryRun=1 dumpPython=%s.py" %
                            os.path.join(options.outputDir, dsetName)),
                        interactive=True)[2]
                    if ret != 0:
                        print ret, out
                        continue
                    if not options.dry_run:
                        parallel.run(job, jobargs)
                    ## outfiles.append( outfile )
                    output = self.getHadd(out, outfile)
                    outfiles.append(output)
                    jobs.append((job, jobargs, output, 0, -1))
                    poutfiles[name][1].append(outfiles[-1])
                print

        task_config = {
            "jobs": jobs,
            "datasets_output": doutfiles,
            "process_output": poutfiles,
            "output": outfiles,
            "outputPfx": outputPfx
        }

        with open("%s/task_config.json" % (options.outputDir), "w+") as cfout:
            cfout.write(json.dumps(task_config, indent=4))
            cfout.close()

    def monitor(self):
        """Wait for the submitted jobs, run the requested hadd merges and
        write the updated task configuration back to disk."""
        (options, args) = (self.options, self.args)
        parallel = self.parallel

        with open("%s/task_config.json" % (options.outputDir), "r") as cfin:
            task_config = json.loads(cfin.read())

        doutfiles = task_config["datasets_output"]  # dataset -> (merged file, [job files])
        poutfiles = task_config["process_output"]   # process -> (merged file, [job files])
        outfiles = task_config["output"]
        outputPfx = task_config["outputPfx"]

        if not options.dry_run:
            ## FIXME: job resubmission
            # handleJobOutput mutates self.jobs while waiting.
            self.jobs = task_config["jobs"]
            returns = self.wait(parallel, self)
            task_config["jobs"] = self.jobs

        if options.hadd:
            print "All jobs finished. Merging output."
            p = Parallel(options.ncpu)
            hadd = "hadd -f "
            if options.hadd_process:
                for proc, out in poutfiles.iteritems():
                    outfile, outfiles = out
                    p.run("%s %s" % (hadd, outfile), outfiles)
            if options.hadd_dataset:
                # NOTE(review): -T presumably avoids re-merging trees already
                # combined per process -- confirm with hadd docs.
                if options.hadd_process:
                    hadd += " -T"
                for dset, out in doutfiles.iteritems():
                    outfile, outfiles = out
                    p.run("%s %s" % (hadd, outfile), outfiles)
            if not (options.hadd_process or options.hadd_dataset):
                p.run("%s %s.root" % (hadd, outputPfx), outfiles)
            self.wait(p)

        with open("%s/task_config.json" % (options.outputDir), "w+") as cfout:
            cfout.write(json.dumps(task_config, indent=4))
            cfout.close()

        self.parallel.stop()

    def wait(self, parallel, handler=None):
        """Block until all jobs in `parallel` finish; per-job results are
        dispatched to handler.handleJobOutput when a handler is given."""
        return parallel.wait(handler)
        ### for i in range(parallel.njobs):
        ###     print "Finished jobs: %d. Total jobs: %d" % (i, parallel.njobs)
        ###     job, jobargs, ret = parallel.returned.get()
        ###     print "finished: %s %s" % ( job, " ".join(jobargs) )
        ###     for line in ret[1].split("\n"):
        ###         print line

    def handleJobOutput(self, job, jobargs, ret):
        """Callback invoked when a job returns.

        ret is (exit code, output text). Prints the job output, records
        the exit code in the matching self.jobs entry and resubmits
        failed jobs up to maxResub times. Returns 1 if the job was
        resubmitted, 0 otherwise.
        """
        print "------------"
        print "Job finished: (exit code %d) '%s' '%s'" % (ret[0], job,
                                                          " ".join(jobargs))
        print "Job output: "
        print
        for line in ret[1].split("\n"):
            print line
        print
        # Normalize through shell_args so the comparison below matches the
        # form stored in self.jobs.
        jobargs = shell_args(" ".join(jobargs))
        job = jobargs[0]
        jobargs = jobargs[1:]
        for ijob in self.jobs:
            inam, iargs = ijob[0:2]
            ### print inam, job, inam == job
            ### for i,a in enumerate(iargs):
            ###     b = jobargs[i]
            ###     print a, b, a == b
            if inam == job and iargs == jobargs:
                # ijob layout: [cmd, args, outfile, nsub, ret].
                ijob[4] = ret[0]
                if ret[0] != 0:
                    print ""
                    print "Job failed. Number of resubmissions: %d / %d. " % (
                        ijob[3], self.maxResub),
                    if ijob[3] < self.maxResub:
                        print "Resubmitting."
                        self.parallel.run(inam, iargs)
                        ijob[3] += 1
                        print "------------"
                        return 1
                    else:
                        print "Giving up."
                        print "------------"
                        return 0

    def getHadd(self, stg, fallback):
        """Extract the 'hadd:' line from the job dry-run output; return
        `fallback` if none is found."""
        for line in stg.split("\n"):
            if line.startswith("hadd:"):
                return line.replace("hadd:", "")
        return fallback

    def getMaxJobs(self, stg):
        """Extract the 'maxJobs:' count from the job dry-run output;
        return -1 if it is missing."""
        for line in stg.split("\n"):
            if line.startswith("maxJobs:"):
                return int(line.replace("maxJobs:", ""))
        return -1
class JobsManager(object):
    """Parse command-line options, submit analysis jobs, monitor them and merge their output."""

    def __init__(self, defaults={}):
        """
        Constructor:
        @defaults: default options
        """
        # NOTE(review): `defaults` is a mutable default argument and does not
        # appear to be used below — confirm before removing.
        # Command line options
        parser = OptionParser(option_list=[
            make_option("--processes", action="callback", callback=Load(), type="string", dest="processes",
                        default={}, help="List of datasets to be analyzed"),
            make_option("--load",  # special option to load whole configuaration from JSON
                        action="callback", callback=Load(), dest="__opts__", type="string",
                        help="load JSON file with configuration", metavar="CONFIG.json"),
            make_option("-n", "--njobs", dest="njobs", type="int", default=0,
                        help="number of jobs to run"),
            make_option("-q", "--queue", dest="queue", type="string", default=None,
                        help="LSF queue to use. default: %default"),
            make_option("-o", "--output", dest="output", type="string",
                        default="output.root", help="output file name. default: %default"),
            make_option("-d", "--outputDir", dest="outputDir", type="string",
                        default=None, help="output folder. default: %default"),
            make_option("-x", "--jobEx", dest="jobExe", type="string",
                        default=None, help="job executable. default: %default"),
            make_option("-c", "--cmdLine", dest="cmdLine", type="string",
                        default=None, help="job command line. The script arguments will be prepended. default: %default"),
            make_option("--dumpCfg",
                        action="store_true", default=False,
                        help="dump configuaration and exit. default: %default"),
            make_option("-v", "--verbose",
                        action="store_true", dest="verbose",
                        default=False, help="default: %default"),
            make_option("-m", "--max-resubmissions", dest="maxResub", type="int", default=3),
            make_option("-N", "--ncpu", dest="ncpu", type="int", default=cpu_count()),
            make_option("-H", "--hadd", dest="hadd", default=False, action="store_true",
                        help="hadd output files when all jobs are finished."),
            # NOTE(review): the long option below reads "--hadd-dateset" (sic);
            # kept byte-identical since callers/scripts may rely on it.
            make_option("-D", "--hadd-dateset", dest="hadd_dataset", default=False, action="store_true",
                        help="hadd output per dataset when all jobs are finished."),
            make_option("-P", "--hadd-process", dest="hadd_process", default=False, action="store_true",
                        help="hadd output per process when all jobs are finished."),
            make_option("--dry-run", dest="dry_run", default=False, action="store_true",
                        help="do not actually run the jobs."),
            make_option("-C", "--cont", dest="cont", default=False, action="store_true",
                        help="continue interrupted task."),
        ])

        # parse the command line
        (self.options, self.args) = parser.parse_args()
        self.maxResub = self.options.maxResub

        # extra command-line text is appended to the positional arguments
        if self.options.cmdLine:
            self.args = self.args + shell_args(str(self.options.cmdLine))

        # an explicit executable is prepended to the argument list
        if self.options.jobExe:
            self.args = [shell_expand(self.options.jobExe)] + self.args

    def __call__(self):
        """
        __call__
        Run all jobs.
        """
        self.parallel = Parallel(self.options.ncpu, lsfQueue=self.options.queue,
                                 lsfJobName="%s/runJobs" % self.options.outputDir, asyncLsf=False)

        self.jobs = None
        if self.options.cont:
            # continuing an interrupted task: skip submission, go straight to monitoring
            pass
        else:
            self.firstRun()

        self.monitor()

    def firstRun(self):
        """Prepare the output area, split the work into jobs, submit them and write task_config.json."""
        (options, args) = (self.options, self.args)
        parallel = self.parallel

        outputPfx = options.output.replace(".root", "")

        if not options.outputDir:
            sys.exit("Please specify an output folder")

        if options.dumpCfg:
            print (dumpCfg(options))
            sys.exit(0)

        if not os.path.exists(options.outputDir):
            os.mkdir(options.outputDir)

        outputPfx = "%s/%s" % (options.outputDir, outputPfx)

        args.append("processIdMap=%s/config.json" % options.outputDir)
        ## options.cmdLine += " %s" % (" ".join(args))
        options.cmdLine = str(" ".join(args))
        with open("%s/config.json" % (options.outputDir), "w+") as fout:
            fout.write(dumpCfg(options))

        # bookkeeping: all outputs, outputs per dataset, outputs per process
        outfiles = []
        doutfiles = {}
        poutfiles = {}

        jobs = []
        for name, datasets in options.processes.iteritems():
            poutfiles[name] = ("%s_%s.root" % (outputPfx, name), [])

            for dset in datasets:
                job = args[0]
                if self.options.jobExe:
                    pyjob = ""
                else:
                    pyjob = job
                jobargs = copy(args[1:])
                dsetName = dset.lstrip("/").replace("/", "_")
                outfile = "%s_%s.root" % (outputPfx, dsetName)
                doutfiles[dset] = (str(outfile), [])
                jobargs.extend(["dataset=%s" % dset, "outputFile=%s" % outfile])
                print "running: %s %s" % (job, " ".join(jobargs))

                if options.njobs != 0:
                    print "splitting in (up to) %d jobs\n checking how many are needed... " % options.njobs
                    dnjobs = 0
                    dargs = jobargs + shell_args("nJobs=%d" % (options.njobs))
                    # dry run to find out how many jobs are actually needed
                    ret, out = parallel.run("python %s" % pyjob,
                                            dargs + shell_args("dryRun=1 getMaxJobs=1 dumpPython=%s.py"
                                                               % os.path.join(options.outputDir, dsetName)),
                                            interactive=True)[2]
                    maxJobs = self.getMaxJobs(out)
                    if maxJobs < 0:
                        print "Error getting numer of jobs to be submitted"
                        print out
                    hadd = self.getHadd(out, outfile)
                    ## for ijob in range(options.njobs):
                    for ijob in range(maxJobs):
                        ## FIXME allow specific job selection
                        ## iargs = dargs+shell_args("jobId=%d" % (ijob))
                        iargs = jobargs + shell_args("nJobs=%d jobId=%d" % (maxJobs, ijob))
                        ## # run python <command-line> dryRun=1 to check if the job needs to be run
                        ## ret,out = parallel.run("python %s" % pyjob,iargs+shell_args("dryRun=1"),interactive=True)[2]
                        ## if ret != 0:
                        ##     continue
                        dnjobs += 1
                        if not options.dry_run:
                            ## FIXME:
                            ##  - handle output
                            ##  - store log files
                            parallel.run(job, iargs)
                        ## outfiles.append( outfile.replace(".root","_%d.root" % ijob) )
                        ## output = self.getHadd(out,outfile.replace(".root","_%d.root" % ijob))
                        output = hadd.replace(".root", "_%d.root" % ijob)
                        outfiles.append(output)
                        doutfiles[dset][1].append(outfiles[-1])
                        poutfiles[name][1].append(outfiles[-1])
                        # job record: (executable, args, output, nResubmissions, exitCode);
                        # becomes a mutable list after the JSON round-trip in monitor()
                        jobs.append((job, iargs, output, 0, -1))
                    print " %d jobs actually submitted" % dnjobs
                else:
                    ret, out = parallel.run("python %s" % pyjob,
                                            jobargs + shell_args("dryRun=1 dumpPython=%s.py"
                                                                 % os.path.join(options.outputDir, dsetName)),
                                            interactive=True)[2]
                    if ret != 0:
                        print ret, out
                        continue
                    if not options.dry_run:
                        parallel.run(job, jobargs)
                    ## outfiles.append( outfile )
                    output = self.getHadd(out, outfile)
                    outfiles.append(output)
                    jobs.append((job, jobargs, output, 0, -1))
                    poutfiles[name][1].append(outfiles[-1])
                print

        task_config = {
            "jobs": jobs,
            "datasets_output": doutfiles,
            "process_output": poutfiles,
            "output": outfiles,
            "outputPfx": outputPfx
        }
        with open("%s/task_config.json" % (options.outputDir), "w+") as cfout:
            cfout.write(json.dumps(task_config, indent=4))
            cfout.close()

    def monitor(self):
        """Wait for the submitted jobs, optionally hadd their outputs and persist the task state."""
        (options, args) = (self.options, self.args)
        parallel = self.parallel

        # reload the task configuration written by firstRun
        with open("%s/task_config.json" % (options.outputDir), "r") as cfin:
            task_config = json.loads(cfin.read())

        doutfiles = task_config["datasets_output"]
        poutfiles = task_config["process_output"]
        outfiles = task_config["output"]
        outputPfx = task_config["outputPfx"]

        if not options.dry_run:
            ## FIXME: job resubmission
            # expose the job list to handleJobOutput, which mutates it in place
            self.jobs = task_config["jobs"]
            returns = self.wait(parallel, self)
            task_config["jobs"] = self.jobs

        if options.hadd:
            print "All jobs finished. Merging output."
            p = Parallel(options.ncpu)
            hadd = "hadd -f "
            if options.hadd_process:
                # one merged file per physics process
                for proc, out in poutfiles.iteritems():
                    outfile, outfiles = out
                    p.run("%s %s" % (hadd, outfile), outfiles)
            if options.hadd_dataset:
                if options.hadd_process:
                    hadd += " -T"
                # one merged file per dataset
                for dset, out in doutfiles.iteritems():
                    outfile, outfiles = out
                    p.run("%s %s" % (hadd, outfile), outfiles)
            if not (options.hadd_process or options.hadd_dataset):
                # single merged file for the whole task
                p.run("%s %s.root" % (hadd, outputPfx), outfiles)
            self.wait(p)

        # store the (possibly updated) task configuration
        with open("%s/task_config.json" % (options.outputDir), "w+") as cfout:
            cfout.write(json.dumps(task_config, indent=4))
            cfout.close()

        self.parallel.stop()

    def wait(self, parallel, handler=None):
        """Block until all jobs in @parallel return; @handler (if given) receives each result."""
        return parallel.wait(handler)
        ### for i in range(parallel.njobs):
        ###     print "Finished jobs: %d. Total jobs: %d" % (i, parallel.njobs)
        ###     job, jobargs, ret = parallel.returned.get()
        ###     print "finished: %s %s" % ( job, " ".join(jobargs) )
        ###     for line in ret[1].split("\n"):
        ###         print line

    def handleJobOutput(self, job, jobargs, ret):
        """
        Callback invoked for each finished job.
        @job: executable name
        @jobargs: job arguments
        @ret: (exit code, output text) pair
        Records the exit code in self.jobs and resubmits failed jobs up to
        self.maxResub times. Returns 1 if the job was resubmitted, 0 otherwise.
        """
        print "------------"
        print "Job finished: (exit code %d) '%s' '%s'" % (ret[0], job, " ".join(jobargs))
        print "Job output: "
        print
        for line in ret[1].split("\n"):
            print line
        print
        # re-tokenize the command line and split executable from arguments
        jobargs = shell_args(" ".join(jobargs))
        job = jobargs[0]
        jobargs = jobargs[1:]
        for ijob in self.jobs:
            inam, iargs = ijob[0:2]
            ### print inam, job, inam == job
            ### for i,a in enumerate(iargs):
            ###     b = jobargs[i]
            ###     print a, b, a == b
            if inam == job and iargs == jobargs:
                ijob[4] = ret[0]  # store the exit code
                if ret[0] != 0:
                    print ""
                    print "Job failed. Number of resubmissions: %d / %d. " % (ijob[3], self.maxResub),
                    if ijob[3] < self.maxResub:
                        print "Resubmitting."
                        self.parallel.run(inam, iargs)
                        ijob[3] += 1
                        print "------------"
                        return 1
                    else:
                        print "Giving up."
        print "------------"
        return 0

    def getHadd(self, stg, fallback):
        """Extract the value of a 'hadd:' line from the job output @stg, or return @fallback."""
        for line in stg.split("\n"):
            if line.startswith("hadd:"):
                return line.replace("hadd:", "")
        return fallback

    def getMaxJobs(self, stg):
        """Extract the value of a 'maxJobs:' line from the job output @stg, or -1 if absent."""
        for line in stg.split("\n"):
            if line.startswith("maxJobs:"):
                return int(line.replace("maxJobs:", ""))
        return -1
class SamplesManager(object):
    """
    Manage a JSON catalog of datasets: import from DAS or EOS, remove duplicate
    job outputs, run per-file validity checks and interactively review content.
    NOTE(review): this file defines SamplesManager more than once; at import
    time the last definition shadows the earlier ones — confirm which is live.
    """

    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None, maxThreads=200, force=False, doContinue=False
                 ):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json files where samples cross sections are stored
        @dbs_instance: DBS instance to use
        @queue: batch queue for check jobs (None runs locally)
        @maxThreads: maximum number of concurrent threads
        @force: re-check files and overwrite catalog entries even when present
        @doContinue: reuse existing temporary check outputs instead of re-running
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance

        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update(json.loads(open(fname).read()))

        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_

        self.queue_ = queue
        self.maxThreads_ = maxThreads
        self.force_ = force
        self.continue_ = doContinue

    def importFromDAS(self, list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: dataset names and/or wildcards to be used in the DAS query
        """
        catalog = self.readCatalog()

        print "Importing from das %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                # expand the wildcard through a DAS query
                response = das_query("https://cmsweb.cern.ch",
                                     "dataset dataset=%s | grep dataset.name" % dataset,
                                     0, 0, False, self.dbs_instance_)
                for d in response["data"]:
                    datasets.append(d["dataset"][0]["name"])
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            self.addToDataset(catalog, dsetName, files)
            ## if dsetName in catalog:
            ##     if self.force_:
            ##         catalog[ dsetName ]["files"] = files
            ##     else:
            ##         self.mergeDataset(catalog[ dsetName ],{ "files" : files })
            ## else:
            ##     catalog[ dsetName ] = { "files" : files }

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self, dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        Returns a list of {"name": ..., "nevents": ...} dictionaries.
        """
        response = das_query("https://cmsweb.cern.ch",
                             "file dataset=%s | grep file.name,file.nevents" % dsetName,
                             0, 0, False, self.dbs_instance_)

        files = []
        for d in response["data"]:
            # keep the first record that actually carries the event count
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({"name": jf["name"], "nevents": jf["nevents"]})
                    break
            ## files.append( { "name" : d["file"][0]["name"], "nevents" : d["file"][0]["nevents"] } )
        return files

    def importFromEOS(self, folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: list of EOS folders to be imported; the dataset name is either
        guessed from the folder path or asked interactively.
        """
        catalog = self.readCatalog()
        auto = False       # guess dataset names from the folder path
        assumeOk = False   # accept guesses without confirmation
        for folder in folders:
            dsetName = ""
            print
            print "importing folder\n %s" % folder
            # a valid dataset name has exactly three "/"-separated fields
            while not len(dsetName.split("/")) == 4:
                if auto:
                    splitFolder = folder.split("/")
                    prim, sec = splitFolder[-4:-2]
                    dsetName = "/%s/%s/USER" % (prim, sec)
                    print "guessed dataset name ", dsetName
                    if not assumeOk:
                        resp = ask_user("ok?", ["y", "n", "a"])
                        if resp == "n":
                            dsetName = ""
                            auto = False
                        elif resp == "a":
                            assumeOk = True
                if not auto:
                    print "enter dataset name (auto/noauto to enables/disables automatic guessing) ",
                    dsetName = raw_input()
                    if (dsetName == "auto"):
                        auto = True
                    elif (dsetName == "noauto"):
                        auto = False

            print "Importing %s as %s" % (folder, dsetName)
            files = self.getFilesFomEOS(folder)
            self.addToDataset(catalog, dsetName, files)
            ## if dsetName in catalog:
            ##     catalog[ dsetName ]["files"] = files
            ## else:
            ##     catalog[ dsetName ] = { "files" : files }

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomEOS(self, dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: EOS folder to crawl
        Returns a list of {"name": ..., "nevents": 0} dictionaries.
        """
        if not self.parallel_:
            self.parallel_ = Parallel(200, self.queue_, maxThreads=self.maxThreads_, asyncLsf=True)

        ret, out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",
                                      ["find", dsetName], interactive=True)[2]
        ## print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                # store LFN-style names: strip the local /eos/cms prefix
                files.append({"name": line.replace("/eos/cms", ""), "nevents": 0})

        return files

    def findDuplicates(self, dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def invalidateBadFiles(self, dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self, match=None, light=False):
        """
        Look for corrupted files in the whole catalog.
        @match: fnmatch pattern restricting which datasets are checked
        @light: only do duplicates removal, skip the per-file content checks
        """
        catalog = self.readCatalog()

        self.parallel_ = Parallel(50, self.queue_, maxThreads=self.maxThreads_, asyncLsf=True, lsfJobName=".fgg/job")
        ## self.parallel_ = Parallel(1,self.queue_)

        print "Checking all datasets"
        self.outcomes = []
        for dataset in catalog.keys():
            if match and not fnmatch(dataset, match):
                continue
            self.checkDatasetFiles(dataset, catalog, light=light)
        # write catalog to avoid redoing duplicates removal
        self.writeCatalog(catalog)

        if self.queue_:
            # batch mode: outcomes are collected through the handler callback
            self.parallel_.wait(printOutput=True, handler=self)
            outcomes = self.outcomes
        else:
            outcomes = self.parallel_.wait(printOutput=False)

        ## for dsetName,ifile,fName,ret,out in outcomes:
        nfailed = 0
        for oc in outcomes:
            ign1, ign2, outcome = oc
            ## for ign1, ign2, outcome in outcomes:
            dsetName, ifile, fName, ret, out = outcome
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName, ifile, fName, ret, out
            else:
                if ret != 0:
                    info["bad"] = True
                    nfailed += 1
                else:
                    info["bad"] = False
                    extraInfo = json.loads(str(out))
                    # an empty JSON payload means the check produced no output
                    if len(extraInfo.keys()) == 0:
                        nfailed += 1
                        info["bad"] = True
                    for key, val in extraInfo.iteritems():
                        info[key] = val

        self.parallel_.stop()

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

        if nfailed > 0:
            print
            print "WARNING: some of the check jobs failed or did not return any output."
            print " Those (%d) files were marked a bad and won't be usable for analysis." % nfailed
            print " Re-running the check command may recover the temporary failures."
            print

        if self.queue_:
            print
            print "Note: log files may have been written in ./.fgg"
            print " it's up to you to clean up though..."

    def checkDatasetFiles(self, dsetName, catalog=None, light=False):
        """
        Look for duplicate and corrupted files in a dataset.
        @dsetName: dataset name
        @catalog: catalog to work on (read from disk when None)
        @light: only do duplicates removal, skip the per-file content checks
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True

        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16, self.queue_, maxThreads=self.maxThreads_, asyncLsf=True)
            wait = True

        print
        print "Checking dataset", dsetName
        info = catalog[dsetName]
        files = info["files"]
        print "Number of files: ", len(files)

        # skip duplicates removal if the dataset was already vetted (unless forced)
        if self.force_ or not catalog[dsetName].get("vetted", False):
            toremove = []
            keep_wildcard = None
            for ifil, eifil in enumerate(files):
                if ifil in toremove:
                    continue
                for jfil, ejfil in enumerate(files[ifil + 1:]):
                    # NOTE(review): jfil enumerates from 0 over files[ifil+1:], so the
                    # absolute index of ejfil is ifil+1+jfil; "ifil+jfil" below looks
                    # off by one — confirm before changing.
                    if ifil + jfil in toremove:
                        continue
                    if eifil["name"] == ejfil["name"]:
                        toremove.append(ifil)
                    else:
                        # compare the trailing _<index> of the two file names
                        iid = eifil["name"].rstrip(".root").rsplit("_", 1)[-1]
                        jid = ejfil["name"].rstrip(".root").rsplit("_", 1)[-1]
                        if iid == jid:
                            if not keep_wildcard:
                                print "duplicated file index ", iid
                                print eifil["name"]
                                print ejfil["name"]
                                reply = ask_user("keep both (yes/no/matching)? ", ["y", "n", "m"])
                                if reply == "m":
                                    while not keep_wildcard:
                                        print "enter wildcard matching expression",
                                        keep_wildcard = raw_input()
                                        if ask_user("keep all files matching '%s'?" % keep_wildcard) == "n":
                                            keep_wildcard = None
                            if keep_wildcard:
                                # keep whichever of the two matches the wildcard
                                imatch = fnmatch(eifil["name"], keep_wildcard)
                                jmatch = fnmatch(ejfil["name"], keep_wildcard)
                                if imatch != jmatch:
                                    if imatch:
                                        toremove.append(ifil + jfil)
                                    else:
                                        toremove.append(ifil)
                                    continue
                            else:
                                print "duplicated file index ", iid
                                print eifil["name"]
                                print ejfil["name"]
                                reply = ask_user("keep both? ")
                                if reply == "n":
                                    if ask_user("keep %s? " % ejfil["name"]) == "n":
                                        ## files.pop(ifil+jfil)
                                        toremove.append(ifil + jfil)
                                    if ask_user("keep %s? " % eifil["name"]) == "n":
                                        toremove.append(ifil)
                                        ## files.pop(ifil)

            # remove from the back so earlier indexes stay valid
            for ifile in sorted(toremove, reverse=True):
                ## print ifile
                files.pop(ifile)

        print "After duplicates removal: ", len(files)
        nsub = 0
        catalog[dsetName]["vetted"] = True

        if not light:
            info = catalog[dsetName]["files"] = files
            for ifile, finfo in enumerate(files):
                name = finfo["name"]
                # only (re-)check files that carry no check results yet
                if self.force_ or not "weights" in finfo:
                    nsub += 1
                    self.parallel_.run(SamplesManager.checkFile, [self, name, dsetName, ifile],
                                       interactive=(self.queue_ != None))

        if nsub == 0:
            print "No files needed to be checked"
        else:
            print "Submitted %d check jobs" % nsub

        if wait:
            self.parallel_.wait(printOutput=False)
            self.parallel_ = None

        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        """Interactively prune the catalog and merge datasets sharing a primary name."""
        datasets, catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y", "n", "a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a":
                    keepAll = True

            # group datasets by their primary name (first "/" field)
            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[primary] = []

            primaries[primary].append(d)

        # resolve primaries with more than one dataset
        for name, val in primaries.iteritems():
            if len(val) == 1:
                continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all (yes/no/merge)?" % (name, "\n ".join(val)),
                             ["y", "n", "m"])
            if reply == "m":
                dst = val[0]
                for merge in val[1:]:
                    self.mergeDataset(catalog[dst], catalog[merge])
                    catalog.pop(merge)
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)

        self.writeCatalog(catalog)

    def mergeDataset(self, dst, merge):
        """Append to @dst the files of @merge it does not already contain; mark @dst un-vetted."""
        dst["vetted"] = False
        dstFiles = dst["files"]
        mergeFiles = merge["files"]
        for fil in mergeFiles:
            skip = False
            for dfil in dstFiles:
                if dfil["name"] == fil["name"]:
                    skip = True
            if not skip:
                dstFiles.append(fil)

    def addToDataset(self, catalog, dsetName, files):
        """Add @files to @dsetName in @catalog: replace when forced, merge otherwise."""
        if dsetName in catalog:
            if self.force_:
                catalog[dsetName]["files"] = files
            else:
                self.mergeDataset(catalog[dsetName], {"files": files})
        else:
            catalog[dsetName] = {"files": files}

    def checkFile(self, fileName, dsetName, ifile):
        """
        Check if file is valid.
        @fileName: file name
        @dsetName: dataset the file belongs to
        @ifile: index of the file within the dataset
        Runs fggCheckFile.py (asynchronously on a batch queue, or interactively
        otherwise) and returns the outcome tuple in the interactive case.
        """
        fName = fileName
        tmp = ".tmp%s_%d.json" % (dsetName.replace("/", "_"), ifile)

        if self.continue_:
            if os.path.exists(tmp):
                # job output already there: reuse it instead of re-running the check
                print "%s already exists" % tmp
                outcome = self.readJobOutput(tmp, 0, "", dsetName, fileName, ifile)
                if self.queue_:
                    self.outcomes.append((None, None, outcome))
                else:
                    return outcome
                return None

        if self.queue_:
            # batch mode: result is collected later through the handler
            self.parallel_.run("fggCheckFile.py", [fName, tmp, dsetName, str(ifile), "2>/dev/null"],
                               interactive=False)
        else:
            ret, out = self.parallel_.run("fggCheckFile.py", [fName, tmp, dsetName, str(ifile), "2>/dev/null"],
                                          interactive=True)[2]
            return self.readJobOutput(tmp, ret, out, dsetName, fileName, ifile)
        ### try:
        ###     fout = open(tmp)
        ###     out = fout.read()
        ###     fout.close()
        ### except IOError, e:
        ###     print ret, out
        ###     print e
        ###     out = "{}"
        ###
        ### os.remove(tmp)
        ### return dsetName,ifile,fileName,ret,out

    def readJobOutput(self, tmp, ret, out, dsetName, fileName, ifile):
        """Read and delete the temporary JSON written by a check job; return the outcome tuple."""
        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
            os.remove(tmp)
        except Exception, e:
            # missing/unreadable output: report and fall back to an empty payload
            print ret, out
            print e
            out = "{}"

        return dsetName, int(ifile), fileName, ret, out
class SamplesManager(object):
    """
    Manage a JSON catalog of datasets: import from DAS or EOS and run
    per-file validity checks.
    NOTE(review): this file defines SamplesManager more than once; at import
    time the last definition shadows this one — confirm which is live.
    """

    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None
                 ):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json files where samples cross sections are stored
        @dbs_instance: DBS instance to use
        @queue: batch queue for check jobs (None runs locally)
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance

        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update(json.loads(open(fname).read()))

        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_

        self.queue_ = queue

    def importFromDAS(self, datasets):
        """
        Import datasets from DAS to the catalog.
        @datasets: dataset names, or a wildcard to be used in the DAS query
        """
        catalog = self.readCatalog()

        print "Importing from das %s" % datasets
        if "*" in datasets:
            # expand the wildcard through a DAS query
            response = das_query("https://cmsweb.cern.ch",
                                 "dataset dataset=%s | grep dataset.name" % datasets,
                                 0, 0, False, self.dbs_instance_)

            datasets = []
            for d in response["data"]:
                datasets.append(d["dataset"][0]["name"])

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)

            # existing entries are overwritten
            if dsetName in catalog:
                catalog[dsetName]["files"] = files
            else:
                catalog[dsetName] = {"files": files}

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self, dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        Returns a list of {"name": ..., "nevents": ...} dictionaries.
        """
        response = das_query("https://cmsweb.cern.ch",
                             "file dataset=%s | grep file.name,file.nevents" % dsetName,
                             0, 0, False, self.dbs_instance_)

        files = []
        for d in response["data"]:
            # keep the first record that actually carries the event count
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({"name": jf["name"], "nevents": jf["nevents"]})
                    break
            ## files.append( { "name" : d["file"][0]["name"], "nevents" : d["file"][0]["nevents"] } )
        return files

    def importFromEOS(self, folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: list of EOS folders; the dataset name is asked interactively.
        """
        catalog = self.readCatalog()
        for folder in folders:
            dsetName = ""
            # a valid dataset name has exactly three "/"-separated fields
            while not len(dsetName.split("/")) == 4:
                print "enter dataset name for folder %s" % folder,
                dsetName = raw_input()

            print "Importing %s as %s" % (folder, dsetName)
            files = self.getFilesFomEOS(folder)

            # existing entries are overwritten
            if dsetName in catalog:
                catalog[dsetName]["files"] = files
            else:
                catalog[dsetName] = {"files": files}

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomEOS(self, dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: EOS folder to crawl
        Returns a list of {"name": ..., "nevents": 0} dictionaries.
        """
        if not self.parallel_:
            self.parallel_ = Parallel(200, self.queue_)

        ret, out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",
                                      ["find", dsetName], interactive=True)[2]
        print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                # store LFN-style names: strip the local /eos/cms prefix
                files.append({"name": line.replace("/eos/cms", ""), "nevents": 0})

        return files

    def findDuplicates(self, dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def invalidateBadFiles(self, dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self):
        """
        Look for corrupted files in the whole catalog and store the results in it.
        """
        catalog = self.readCatalog()

        self.parallel_ = Parallel(50, self.queue_)
        ## self.parallel_ = Parallel(1,self.queue_)

        print "Checking all datasets"
        for dataset in catalog.keys():
            self.checkDatasetFiles(dataset, catalog)

        outcomes = self.parallel_.wait()
        for dsetName, ifile, fName, ret, out in outcomes:
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName, ifile, fName, ret, out
            else:
                if ret != 0:
                    info["bad"] = True
                else:
                    # merge the extra information returned by the check job
                    extraInfo = json.loads(str(out))
                    for key, val in extraInfo.iteritems():
                        info[key] = val

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def checkDatasetFiles(self, dsetName, catalog=None):
        """
        Look for corrupted files in a dataset.
        @dsetName: dataset name
        @catalog: catalog to work on (read from disk when None)
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True

        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16, self.queue_)
            wait = True

        print "Checking dataset", dsetName
        info = catalog[dsetName]
        files = info["files"]
        print len(files)
        for ifile, finfo in enumerate(files):
            name = finfo["name"]
            self.parallel_.run(SamplesManager.checkFile, [self, name, dsetName, ifile])

        if wait:
            self.parallel_.wait()
            self.parallel_ = None

        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        """Interactively prune the catalog, grouping datasets by primary name."""
        datasets, catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y", "n", "a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a":
                    keepAll = True

            # group datasets by their primary name (first "/" field)
            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[primary] = []

            primaries[primary].append(d)

        # resolve primaries with more than one dataset
        for name, val in primaries.iteritems():
            if len(val) == 1:
                continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all?" % (name, "\n ".join(val)))
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)

        self.writeCatalog(catalog)

    def checkFile(self, fileName, dsetName, ifile):
        """
        Check if file is valid.
        @fileName: file name
        @dsetName: dataset the file belongs to
        @ifile: index of the file within the dataset
        Runs fggCheckFile.py interactively and returns
        (dsetName, ifile, fileName, exit code, JSON output).
        """
        ## fName = "root://eoscms//eos/cms%s" % fileName
        fName = fileName
        tmp = ".tmp%s_%d.json" % (dsetName.replace("/", "_"), ifile)
        ## print "fggCheckFile.py",[fName,tmp,"2>/dev/null"]
        ret, out = self.parallel_.run("fggCheckFile.py", [fName, tmp, "2>/dev/null"], interactive=True)[2]

        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
        except IOError, e:
            # missing output file: report and fall back to an empty payload
            print ret, out
            print e
            out = "{}"

        os.remove(tmp)
        return dsetName, ifile, fileName, ret, out
class SamplesManager(object):
    """
    Manage a JSON catalog of datasets: import from DAS or EOS, remove duplicate
    job outputs, run per-file validity checks and interactively review content.
    NOTE(review): this file defines SamplesManager more than once; being the
    last definition, this one shadows the earlier ones at import time.
    """

    def __init__(self,
                 catalog,
                 cross_sections=["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"],
                 dbs_instance="prod/phys03",
                 queue=None
                 ):
        """
        Constructor:
        @catalog: json file used to read/write dataset information
        @cross_sections: json files where samples cross sections are stored
        @dbs_instance: DBS instance to use
        @queue: batch queue for check jobs (None runs locally)
        """
        self.cross_sections_ = {}
        self.dbs_instance_ = dbs_instance

        for xsecFile in cross_sections:
            fname = shell_expand(xsecFile)
            self.cross_sections_.update(json.loads(open(fname).read()))

        self.catalog_ = shell_expand(catalog)

        self.parallel_ = None
        self.sem_ = Semaphore()

        print "Will use the following datasets catalog:"
        print self.catalog_

        self.queue_ = queue

    def importFromDAS(self, list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: dataset names and/or wildcards to be used in the DAS query
        """
        catalog = self.readCatalog()

        print "Importing from das %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                # expand the wildcard through a DAS query
                response = das_query("https://cmsweb.cern.ch",
                                     "dataset dataset=%s | grep dataset.name" % dataset,
                                     0, 0, False, self.dbs_instance_)
                for d in response["data"]:
                    datasets.append(d["dataset"][0]["name"])
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)

            # existing entries are overwritten
            if dsetName in catalog:
                catalog[dsetName]["files"] = files
            else:
                catalog[dsetName] = {"files": files}

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomDAS(self, dsetName):
        """
        Read dataset files from DAS.
        @dsetName: dataset name
        Returns a list of {"name": ..., "nevents": ...} dictionaries.
        """
        response = das_query("https://cmsweb.cern.ch",
                             "file dataset=%s | grep file.name,file.nevents" % dsetName,
                             0, 0, False, self.dbs_instance_)

        files = []
        for d in response["data"]:
            # keep the first record that actually carries the event count
            for jf in d["file"]:
                if "nevents" in jf:
                    files.append({"name": jf["name"], "nevents": jf["nevents"]})
                    break
            ## files.append( { "name" : d["file"][0]["name"], "nevents" : d["file"][0]["nevents"] } )
        return files

    def importFromEOS(self, folders):
        """
        Import datasets from EOS folders to the catalog.
        @folders: list of EOS folders to be imported; the dataset name is either
        guessed from the folder path or asked interactively.
        """
        catalog = self.readCatalog()
        auto = False       # guess dataset names from the folder path
        assumeOk = False   # accept guesses without confirmation
        for folder in folders:
            dsetName = ""
            print
            print "importing folder\n %s" % folder
            # a valid dataset name has exactly three "/"-separated fields
            while not len(dsetName.split("/")) == 4:
                if auto:
                    splitFolder = folder.split("/")
                    prim, sec = splitFolder[-4:-2]
                    dsetName = "/%s/%s/USER" % (prim, sec)
                    print "guessed dataset name ", dsetName
                    if not assumeOk:
                        resp = ask_user("ok?", ["y", "n", "a"])
                        if resp == "n":
                            dsetName = ""
                            auto = False
                        elif resp == "a":
                            assumeOk = True
                if not auto:
                    print "enter dataset name (auto/noauto to enables/disables automatic guessing) ",
                    dsetName = raw_input()
                    if (dsetName == "auto"):
                        auto = True
                    elif (dsetName == "noauto"):
                        auto = False

            print "Importing %s as %s" % (folder, dsetName)
            files = self.getFilesFomEOS(folder)

            # existing entries are overwritten
            if dsetName in catalog:
                catalog[dsetName]["files"] = files
            else:
                catalog[dsetName] = {"files": files}

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def getFilesFomEOS(self, dsetName):
        """
        Read dataset files crawling EOS.
        @dsetName: EOS folder to crawl
        Returns a list of {"name": ..., "nevents": 0} dictionaries.
        """
        if not self.parallel_:
            self.parallel_ = Parallel(200, self.queue_)

        ret, out = self.parallel_.run("/afs/cern.ch/project/eos/installation/0.3.15/bin/eos.select",
                                      ["find", dsetName], interactive=True)[2]
        ## print out
        files = []
        for line in out.split("\n"):
            if line.endswith(".root"):
                # store LFN-style names: strip the local /eos/cms prefix
                files.append({"name": line.replace("/eos/cms", ""), "nevents": 0})

        return files

    def findDuplicates(self, dsetName):
        """
        Find duplicate job outputs in dataset.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def invalidateBadFiles(self, dsetName):
        """
        Invalidate duplicate job output and corrupted files in DAS.
        @dsetName: dataset name
        Note: not implemented
        """
        pass

    def checkAllDatasets(self):
        """
        Look for corrupted files in the whole catalog and store the results in it.
        """
        catalog = self.readCatalog()

        self.parallel_ = Parallel(50, self.queue_)
        ## self.parallel_ = Parallel(1,self.queue_)

        print "Checking all datasets"
        for dataset in catalog.keys():
            self.checkDatasetFiles(dataset, catalog)

        outcomes = self.parallel_.wait(printOutput=False)
        ## for dsetName,ifile,fName,ret,out in outcomes:
        for ign1, ign2, outcome in outcomes:
            dsetName, ifile, fName, ret, out = outcome
            info = catalog[dsetName]["files"][ifile]
            if info["name"] != fName:
                print "Inconsistent outcome ", info["name"], dsetName, ifile, fName, ret, out
            else:
                if ret != 0:
                    info["bad"] = True
                else:
                    # merge the extra information returned by the check job
                    extraInfo = json.loads(str(out))
                    for key, val in extraInfo.iteritems():
                        info[key] = val

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"

    def checkDatasetFiles(self, dsetName, catalog=None):
        """
        Look for duplicate and corrupted files in a dataset.
        @dsetName: dataset name
        @catalog: catalog to work on (read from disk when None)
        """
        writeCatalog = False
        if not catalog:
            catalog = self.readCatalog()
            writeCatalog = True

        wait = False
        if not self.parallel_:
            self.parallel_ = Parallel(16, self.queue_)
            wait = True

        print
        print "Checking dataset", dsetName
        info = catalog[dsetName]
        files = info["files"]
        print "Number of files: ", len(files)

        toremove = []
        for ifil, eifil in enumerate(files):
            if ifil in toremove:
                continue
            for jfil, ejfil in enumerate(files[ifil + 1:]):
                # NOTE(review): jfil enumerates from 0 over files[ifil+1:], so the
                # absolute index of ejfil is ifil+1+jfil; "ifil+jfil" below looks
                # off by one — confirm before changing.
                if ifil + jfil in toremove:
                    continue
                if eifil["name"] == ejfil["name"]:
                    toremove.append(ifil)
                else:
                    # compare the trailing _<index> of the two file names
                    iid = eifil["name"].rstrip(".root").rsplit("_", 1)[-1]
                    jid = ejfil["name"].rstrip(".root").rsplit("_", 1)[-1]
                    if iid == jid:
                        print "duplicated file index ", iid
                        print eifil["name"]
                        print ejfil["name"]
                        reply = ask_user("keep both? ")
                        if reply == "n":
                            if ask_user("keep %s? " % ejfil["name"]) == "n":
                                ## files.pop(ifil+jfil)
                                toremove.append(ifil + jfil)
                            if ask_user("keep %s? " % eifil["name"]) == "n":
                                toremove.append(ifil)
                                ## files.pop(ifil)

        # remove from the back so earlier indexes stay valid
        for ifile in sorted(toremove, reverse=True):
            ## print ifile
            files.pop(ifile)

        print "After duplicates removal: ", len(files)
        info = catalog[dsetName]["files"] = files

        for ifile, finfo in enumerate(files):
            name = finfo["name"]
            self.parallel_.run(SamplesManager.checkFile, [self, name, dsetName, ifile])

        if wait:
            self.parallel_.wait(printOutput=False)
            self.parallel_ = None

        if writeCatalog:
            self.writeCatalog(catalog)

    def reviewCatalog(self):
        """Interactively prune the catalog and merge datasets sharing a primary name."""
        datasets, catalog = self.getAllDatasets()

        primaries = {}
        keepAll = False
        for d in datasets:
            if not keepAll:
                reply = ask_user("keep this dataset (yes/no/all)?\n %s\n" % d, ["y", "n", "a"])
                if reply == "n":
                    catalog.pop(d)
                    continue
                if reply == "a":
                    keepAll = True

            # group datasets by their primary name (first "/" field)
            primary = d.split("/")[1]
            if not primary in primaries:
                primaries[primary] = []

            primaries[primary].append(d)

        # resolve primaries with more than one dataset
        for name, val in primaries.iteritems():
            if len(val) == 1:
                continue
            reply = ask_user("More than one sample for %s:\n %s\nKeep all?" % (name, "\n ".join(val)),
                             ["y", "n", "m"])
            if reply == "m":
                dst = val[0]
                for merge in val[1:]:
                    self.mergeDataset(catalog[dst], catalog[merge])
                    catalog.pop(merge)
            if reply == "n":
                for d in val:
                    reply = ask_user("keep this dataset?\n %s\n" % d)
                    if reply == "n":
                        catalog.pop(d)

        self.writeCatalog(catalog)

    def mergeDataset(self, dst, merge):
        """Append to @dst the files of @merge that it does not already contain."""
        dstFiles = dst["files"]
        mergeFiles = merge["files"]
        for fil in mergeFiles:
            skip = False
            for dfil in dstFiles:
                if dfil["name"] == fil["name"]:
                    skip = True
            if not skip:
                dstFiles.append(fil)

    def checkFile(self, fileName, dsetName, ifile):
        """
        Check if file is valid.
        @fileName: file name
        @dsetName: dataset the file belongs to
        @ifile: index of the file within the dataset
        Runs fggCheckFile.py interactively and returns
        (dsetName, ifile, fileName, exit code, JSON output).
        """
        ## fName = "root://eoscms//eos/cms%s" % fileName
        fName = fileName
        tmp = ".tmp%s_%d.json" % (dsetName.replace("/", "_"), ifile)
        ## print "fggCheckFile.py",[fName,tmp,"2>/dev/null"]
        ret, out = self.parallel_.run("fggCheckFile.py", [fName, tmp, "2>/dev/null"], interactive=True)[2]

        try:
            fout = open(tmp)
            out = fout.read()
            fout.close()
        except IOError, e:
            # missing output file: report and fall back to an empty payload
            print ret, out
            print e
            out = "{}"

        os.remove(tmp)
        return dsetName, ifile, fileName, ret, out