def getfiles(self, refresh=False, url=True, verb=0):
    """Get list of files from DAS.

    Returns the cached file list unless this sample is refreshable and
    the cache is empty (or refresh=True), in which case the list is
    rebuilt from the storage system or from DAS and cached on self.files.
    """
    # Serve from cache when a rebuild is not allowed or not needed
    if not self.refreshable or (self.files and not refresh):
        return self.files
    filelist = []
    for path in self.paths:
        if self.storage:
            # get files from storage system
            sepath  = repkey(self.storage, PATH=path).replace('//', '/')
            storage = getstorage(sepath, verb=verb - 1)
            rawlist = storage.getfiles(url=url, verb=verb - 1)
        else:
            # get files from DAS
            dascmd = 'dasgoclient --query="file dataset=%s instance=%s"' % (
                path, self.instance)  #--limit=0
            LOG.verb(repr(dascmd), verb)
            rawlist = execute(dascmd, verb=verb - 1).split(os.linesep)
        # filter root files, skipping blacklisted ones
        for fname in rawlist:
            fname = fname.strip()
            if not fname.endswith('.root'):
                continue
            if any(black.endswith(fname) for black in self.blacklist):
                continue
            if url and self.url not in fname and 'root://' not in fname:
                fname = self.url + fname  # prepend XRootD redirector
            filelist.append(fname)
    filelist.sort()  # for consistent list order
    self.files = filelist
    return filelist
def getsamples(era, channel="", tag="", dtype=None, filter=None, veto=None, moddict=None, verb=0):
    """Help function to get samples from a sample list and filter if needed.

    Parameters:
      era     - data-taking era key, used to look up the sample list in the configuration
      channel - analysis channel; a 'skim' channel splits samples with multiple DAS paths
      tag     - tag used to expand the sample-list filename
      dtype   - data type(s) to select ('mc', 'data', 'embed'); string or list
      filter  - sample pattern(s) to select; string or list
      veto    - sample pattern(s) to reject; string or list
      moddict - optional dict caching already-imported sample-list modules (filled in place)
      verb    - verbosity level
    Returns a list of Sample objects with unique short names.
    """
    # BUG FIX: defaults used to be mutable ([], {}), sharing state between calls;
    # use None sentinels instead. Dropping the implicit moddict cache is cheap,
    # since importlib.import_module() itself caches modules via sys.modules.
    if moddict is None:
        moddict = {}
    CONFIG  = GLOB.getconfig(verb=verb)
    filters = [] if filter is None else (filter if isinstance(filter, list) else [filter])
    vetoes  = [] if veto is None else (veto if isinstance(veto, list) else [veto])
    dtypes  = [] if dtype is None else (dtype if isinstance(dtype, list) else [dtype])
    sampfile = ensurefile(
        "samples", repkey(CONFIG.eras[era], ERA=era, CHANNEL=channel, TAG=tag))
    samppath = sampfile.replace('.py', '').replace('/', '.')
    if samppath not in moddict:
        moddict[samppath] = importlib.import_module(
            samppath)  # save time by loading once
    if not hasattr(moddict[samppath], 'samples'):
        LOG.throw(
            IOError,
            "Module '%s' must have a list of Sample objects called 'samples'!" % (samppath))
    samplelist = moddict[samppath].samples
    samples = []
    sampledict = {}  # ensure unique short names
    for sample in samplelist:
        if filters and not sample.match(filters, verb):
            continue
        if vetoes and sample.match(vetoes, verb):
            continue
        if dtypes and sample.dtype not in dtypes:
            continue
        if sample.name in sampledict:
            LOG.throw(
                IOError,
                "Sample short names should be unique. Found two samples '%s'!\n\t%s\n\t%s" %
                (sample.name, ','.join(
                    sampledict[sample.name].paths), ','.join(sample.paths)))
        if 'skim' in channel and len(sample.paths) >= 2:
            for subsample in sample.split():
                samples.append(
                    subsample
                )  # keep correspondence sample to one sample in DAS
        else:
            samples.append(sample)
        sampledict[sample.name] = sample
    return samples
def __init__(self, group, name, *paths, **kwargs): """Container class for CMSSW samples, e.g.: - group: DY (used to group similar samples in final output) - name: DYJetsToLL_M-50 (used as shorthand and jobname) - path: /DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18NanoAODv6_Nano25Oct2019_102X_mcRun2/NANOAODSIM - dtype: 'mc', 'data', 'embed' """ # PATH assert len(paths) >= 1, "Need at least one path to create a sample..." if len(paths) == 1 and isinstance(paths[0], list): paths = paths[0] for path in paths: assert path.count('/') >= 3 and path.startswith( '/'), "Path %s has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT." #sample = '/'.join(line.split('/')[-3:]) # DATA TYPE dtype = kwargs.get('dtype', None) dtypes = ['mc', 'data', 'embed'] if dtype == None: # automatic recognition path = paths[0] if 'Embed' in path: dtype = 'embed' elif path.endswith('SIM') or any(g in path for g in ['pythia', 'madgraph']): dtype = 'mc' elif re.search(r"/Run20\d\d", path): dtype = 'data' assert dtype in dtypes, "Given data type '%s' is not recongized! Please choose from %s..." 
% ( dtype, ', '.join(dtypes)) # ATTRIBUTES self.group = group self.name = name self.paths = paths # DAS path self.dtype = dtype self.channels = kwargs.get('channels', None) self.storage = kwargs.get('store', None) # if stored elsewhere than DAS self.url = kwargs.get('url', None) self.blacklist = kwargs.get('blacklist', []) # black list file self.instance = kwargs.get( 'instance', 'prod/phys03' if path.endswith('USER') else 'prod/global') self.nfilesperjob = kwargs.get('nfilesperjob', -1) self.subtry = kwargs.get('subtry', 0) # to help keep track of resubmission self.jobcfg = kwargs.get('jobcfg', {}) # to help keep track of resubmission self.nevents = kwargs.get('nevents', 0) self.files = kwargs.get( 'files', []) # list of ROOT files, OR text file with list of files self.era = kwargs.get('era', "") # for expansion of $ERA variable self.verbosity = kwargs.get('verbosity', 0) # verbosity level for debugging self.refreshable = not self.files # allow refresh on file list in getfiles() # STORAGE & URL DEFAULTS if self.storage: self.storage = repkey(self.storage, ERA=self.era, GROUP=self.group, SAMPLE=self.name) if not self.url: if self.storage: from TauFW.PicoProducer.storage.StorageSystem import Local storage = getstorage(repkey(self.storage, PATH=self.paths[0])) if isinstance(storage, Local): self.url = "root://cms-xrd-global.cern.ch/" else: self.url = storage.fileurl else: self.url = "root://cms-xrd-global.cern.ch/" # GET FILE LIST FROM TEXT FILE if isinstance(self.files, str): filename = repkey(self.files, ERA=self.era, GROUP=self.group, SAMPLE=self.name) if self.verbosity >= 1: print ">>> Loading sample files from '%r'" % (filename) if self.verbosity >= 2: print ">>> %-14s = %s" % ('filelist', self.files) print ">>> %-14s = %s" % ('filename', filename) filelist = [] with open(filename, 'r') as file: for line in file: line = line.strip().split() if not line: continue infile = line[0].strip() if infile[0] == '#': continue if infile.endswith('.root'): 
filelist.append(infile) self.files = filelist self.files.sort()
def main_status(args):
    """Check status of jobs (successful/pending/failed/missing), or hadd job output.

    Driven by the parsed command-line arguments: for 'hadd' the per-chunk job
    output of each selected sample is merged into one ROOT file on the storage
    system (optionally cleaning up the inputs); otherwise only the chunk status
    is checked and reported.
    """
    if args.verbosity>=1:
        print ">>> main_status", args

    # SETTING
    eras           = args.eras
    channels       = args.channels
    tag            = args.tag
    checkdas       = args.checkdas
    checkqueue     = args.checkqueue
    dtypes         = args.dtypes
    filters        = args.samples
    vetoes         = args.vetoes
    force          = args.force
    hadd           = args.subcommand=='hadd'      # merge job output instead of only checking
    cleanup        = args.cleanup if hadd else False  # cleanup only makes sense after hadd
    dryrun         = args.dryrun
    verbosity      = args.verbosity
    cmdverb        = max(1,verbosity)
    outdirformat   = CONFIG.outdir
    jobdirformat   = CONFIG.jobdir
    storedirformat = CONFIG.picodir
    jobs           = [ ]  # cached batch-queue job list; filled at most once

    # LOOP over ERAS
    for era in eras:
        # LOOP over CHANNELS
        for channel in channels:
            print header("%s, %s"%(era,channel))

            # GET SAMPLES
            # glob pattern for previously written job configuration files
            jobcfgs = repkey(os.path.join(jobdirformat,"config/jobconfig_$CHANNEL$TAG_try[0-9]*.json"),
                             ERA=era,SAMPLE='*',GROUP='*',CHANNEL=channel,TAG=tag)
            if verbosity>=1:
                print ">>> %-12s = %s"%('cwd',os.getcwd())
                print ">>> %-12s = %s"%('jobcfgs',jobcfgs)
                print ">>> %-12s = %s"%('filters',filters)
                print ">>> %-12s = %s"%('vetoes',vetoes)
                print ">>> %-12s = %s"%('dtypes',dtypes)
            samples = getcfgsamples(jobcfgs,filter=filters,veto=vetoes,dtype=dtypes,verb=verbosity)
            if verbosity>=2:
                print ">>> Found samples: "+", ".join(repr(s.name) for s in samples)
            if hadd and 'skim' in channel:
                # skimmed nanoAOD is not merged into a single file
                LOG.warning("Hadding into one file not available for skimming...")
                print
                continue

            # SAMPLE over SAMPLES
            found = False
            for sample in samples:
                if sample.channels and channel not in sample.channels: continue
                found = True
                print ">>> %s"%(bold(sample.name))
                for path in sample.paths:
                    print ">>> %s"%(bold(path))

                # CHECK JOBS ONLY ONCE
                if checkqueue==1 and not jobs:
                    batch = getbatch(CONFIG,verb=verbosity)
                    jobs  = batch.jobs(verb=verbosity-1)

                # HADD
                if hadd:
                    jobdir   = sample.jobcfg['jobdir']
                    outdir   = sample.jobcfg['outdir']
                    storedir = repkey(storedirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name,
                                      DAS=sample.paths[0].strip('/'),GROUP=sample.group)
                    storage  = getstorage(storedir,ensure=True,verb=verbosity)
                    outfile  = '%s_%s%s.root'%(sample.name,channel,tag)
                    # glob patterns for job output, config and log files
                    infiles  = os.path.join(outdir,'*_%s%s_[0-9]*.root'%(channel,tag))
                    cfgfiles = os.path.join(sample.jobcfg['cfgdir'],'job*_%s%s_try[0-9]*.*'%(channel,tag))
                    logfiles = os.path.join(sample.jobcfg['logdir'],'*_%s%s_try[0-9]*.*.*.log'%(channel,tag))
                    if verbosity>=1:
                        print ">>> Hadd'ing job output for '%s'"%(sample.name)
                        print ">>> %-12s = %r"%('jobdir',jobdir)
                        print ">>> %-12s = %r"%('outdir',outdir)
                        print ">>> %-12s = %r"%('storedir',storedir)
                        print ">>> %-12s = %s"%('infiles',infiles)
                        print ">>> %-12s = %r"%('outfile',outfile)
                    # refuse to hadd while chunks are missing/failed, unless forced
                    resubfiles, chunkdict = checkchuncks(sample,channel=channel,tag=tag,jobs=jobs,
                                                         checkqueue=checkqueue,das=checkdas,verb=verbosity)
                    if len(resubfiles)>0 and not force:
                        LOG.warning("Cannot hadd job output because %d chunks need to be resubmitted..."%(len(resubfiles))+
                                    "Please use -f or --force to hadd anyway.")
                        continue
                    #haddcmd = 'hadd -f %s %s'%(outfile,infiles)
                    #haddout = execute(haddcmd,dry=dryrun,verb=max(1,verbosity))
                    haddout = storage.hadd(infiles,outfile,dry=dryrun,verb=cmdverb)
                    #os.system(haddcmd)

                    # CLEAN UP
                    # TODO: check if hadd was succesful with isvalid
                    if cleanup:
                        # remove job output, config and log files that were merged
                        rmfiles   = ""
                        rmfileset = [infiles,cfgfiles,logfiles]
                        for files in rmfileset:
                            if len(glob.glob(files))>0:
                                rmfiles += ' '+files
                        if verbosity>=2:
                            print ">>> %-12s = %s"%('rmfileset',rmfileset)
                            print ">>> %-12s = %s"%('rmfiles',rmfiles)
                        if rmfiles:
                            rmcmd = "rm %s"%(rmfiles)
                            rmout = execute(rmcmd,dry=dryrun,verb=cmdverb)

                # ONLY CHECK STATUS
                else:
                    outdir = sample.jobcfg['outdir']
                    if verbosity>=1:
                        print ">>> Checking job status for '%s'"%(sample.name)
                        print ">>> %-12s = %r"%('outdir',outdir)
                    checkchuncks(sample,channel=channel,tag=tag,jobs=jobs,
                                 checkqueue=checkqueue,das=checkdas,verb=verbosity)
                print

            if not found:
                print ">>> Did not find any samples."
                print
def main_get(args): """Get information of given variable.""" if args.verbosity>=1: print ">>> main_get", args variable = args.variable eras = args.eras dtypes = args.dtypes filters = args.samples vetoes = args.vetoes channels = args.channels or [""] checkdas = args.checkdas writedir = args.write tag = args.tag verbosity = args.verbosity cfgname = CONFIG._path if verbosity>=1: print '-'*80 print ">>> %-14s = %s"%('variable',variable) print ">>> %-14s = %s"%('cfgname',cfgname) print ">>> %-14s = %s"%('config',CONFIG) print '-'*80 # SAMPLES if variable=='files': # LOOP over ERAS & CHANNELS if not eras: LOG.warning("Please specify an era to get a sample for.") for era in eras: for channel in channels: # VERBOSE if verbosity>=1: print ">>> %-12s = %r"%('channel',channel) # GET SAMPLES assert era in CONFIG.eras, "Era '%s' not found in the configuration file. Available: %s"%(era,CONFIG.eras) samples = getsamples(era,channel=channel,dtype=dtypes,filter=filters,veto=vetoes,verb=verbosity) # LOOP over SAMPLES for sample in samples: print ">>> %s"%(bold(sample.name)) for path in sample.paths: print ">>> %s"%(bold(path)) infiles = sample.getfiles(url=False,verb=verbosity+1) if checkdas: ndasevents = sample.getnevents(verb=verbosity+1) print ">>> %-12s = %s"%('ndasevents',ndasevents) print ">>> %-12s = %r"%('url',sample.url) print ">>> %-12s = %s"%('nfiles',len(infiles)) print ">>> %-12s = [ "%('infiles') for file in infiles: print ">>> %r"%file print ">>> ]" if writedir: flistname = repkey(writedir,ERA=era,GROUP=sample.group,SAMPLE=sample.name,TAG=tag) print ">>> Write list to %r..."%(flistname) ensuredir(os.path.dirname(flistname)) with open(flistname,'w+') as flist: for infile in infiles: flist.write(infile+'\n') # CONFIGURATION else: if variable in CONFIG: print ">>> Configuration of %r: %s"%(variable,color(CONFIG[variable])) else: print ">>> Did not find %r in the configuration"%(variable)
def preparejobs(args): """Help function to iterate over samples per given channel and era and prepare job config and list.""" if args.verbosity>=1: print ">>> preparejobs", args resubmit = args.subcommand=='resubmit' eras = args.eras channels = args.channels tag = args.tag dtypes = args.dtypes filters = args.samples vetoes = args.vetoes checkdas = args.checkdas checkqueue = args.checkqueue prefetch = args.prefetch nfilesperjob = args.nfilesperjob split_nfpj = args.split_nfpj verbosity = args.verbosity jobs = [ ] # LOOP over ERAS for era in eras: moddict = { } # save time by loading samples and get their file list only once # LOOP over CHANNELS for channel in channels: print header("%s, %s"%(era,channel)) # CHANNEL -> MODULE assert channel in CONFIG.channels, "Channel '%s' not found in the configuration file. Available: %s"%(channel,CONFIG.channels) module = CONFIG.channels[channel] if channel!='test' and 'skim' not in channel: ensuremodule(module) if verbosity>=1: print '-'*80 print ">>> %-12s = %r"%('channel',channel) print ">>> %-12s = %r"%('module',module) print ">>> %-12s = %s"%('filters',filters) print ">>> %-12s = %s"%('vetoes',vetoes) print ">>> %-12s = %r"%('dtypes',dtypes) # PROCESSOR if 'skim' in channel: processor = module elif channel=='test': processor = module else: processor = "picojob.py" procpath = os.path.join("python/processors",processor) if not os.path.isfile(procpath): LOG.throw(IOError,"Processor '%s' does not exist in '%s'..."%(processor,procpath)) processor = os.path.abspath(procpath) if verbosity>=1: print ">>> %-12s = %r"%('processor',processor) print '-'*80 # GET SAMPLES jobdirformat = CONFIG.jobdir # for job config & log files outdirformat = CONFIG.nanodir if 'skim' in channel else CONFIG.outdir # for job output if resubmit: # TODO: allow user to resubmit given config file jobcfgs = repkey(os.path.join(jobdirformat,"config/jobconfig_$SAMPLE$TAG_try[0-9]*.json"), ERA=era,SAMPLE='*',CHANNEL=channel,TAG=tag) if verbosity>=2: print ">>> 
%-12s = %s"%('cwd',os.getcwd()) print ">>> %-12s = %s"%('jobcfgs',jobcfgs) samples = getcfgsamples(jobcfgs,filter=filters,veto=vetoes,dtype=dtypes,verb=verbosity) else: assert era in CONFIG.eras, "Era '%s' not found in the configuration file. Available: %s"%(era,CONFIG.eras) samples = getsamples(era,channel=channel,tag=tag,dtype=dtypes,filter=filters,veto=vetoes,moddict=moddict,verb=verbosity) if verbosity>=2: print ">>> Found samples: "+", ".join(repr(s.name) for s in samples) # SAMPLE over SAMPLES found = False for sample in samples: if sample.channels and channel not in sample.channels: continue found = True print ">>> %s"%(bold(sample.name)) for path in sample.paths: print ">>> %s"%(bold(path)) # DIRECTORIES subtry = sample.subtry+1 if resubmit else 1 jobids = sample.jobcfg.get('jobids',[ ]) postfix = "_%s%s"%(channel,tag) jobtag = '_%s%s_try%d'%(channel,tag,subtry) jobname = sample.name+jobtag.rstrip('try1').rstrip('_') nfilesperjob_ = sample.nfilesperjob if sample.nfilesperjob>0 else nfilesperjob if split_nfpj>1: nfilesperjob_ = min(1,nfilesperjob_/split_nfpj) outdir = repkey(outdirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name, DAS=sample.paths[0].strip('/'),GROUP=sample.group) jobdir = ensuredir(repkey(jobdirformat,ERA=era,CHANNEL=channel,TAG=tag,SAMPLE=sample.name, DAS=sample.paths[0].strip('/'),GROUP=sample.group)) cfgdir = ensuredir(jobdir,"config") logdir = ensuredir(jobdir,"log") cfgname = "%s/jobconfig%s.json"%(cfgdir,jobtag) joblist = '%s/jobarglist%s.txt'%(cfgdir,jobtag) if verbosity==1: print ">>> %-12s = %s"%('cfgname',cfgname) print ">>> %-12s = %s"%('joblist',joblist) elif verbosity>=2: print '-'*80 print ">>> Preparing job %ssubmission for '%s'"%("re" if resubmit else "",sample.name) print ">>> %-12s = %r"%('processor',processor) print ">>> %-12s = %r"%('jobname',jobname) print ">>> %-12s = %r"%('jobtag',jobtag) print ">>> %-12s = %r"%('postfix',postfix) print ">>> %-12s = %r"%('outdir',outdir) print ">>> %-12s = 
%r"%('cfgdir',cfgdir) print ">>> %-12s = %r"%('logdir',logdir) print ">>> %-12s = %r"%('cfgname',cfgname) print ">>> %-12s = %r"%('joblist',joblist) print ">>> %-12s = %s"%('try',subtry) print ">>> %-12s = %r"%('jobids',jobids) # CHECKS if os.path.isfile(cfgname): # TODO: check for running jobs LOG.warning("Job configuration '%s' already exists and will be overwritten! "+ "Beware of conflicting job output!"%(cfgname)) if not resubmit: cfgpattern = re.sub(r"(?<=try)\d+(?=.json$)",r"*",cfgname) cfgnames = [f for f in glob.glob(cfgpattern) if not f.endswith("_try1.json")] if cfgnames: LOG.warning("Job configurations for resubmission already exists! This can cause conflicting job output!"+ "If you are sure you want to submit from scratch, please remove these files:\n>>> "+"\n>>> ".join(cfgnames)) storage = getstorage(outdir,verb=verbosity,ensure=True) # GET FILES nevents = 0 if resubmit: # resubmission if checkqueue==0 and not jobs: # check jobs only once batch = getbatch(CONFIG,verb=verbosity) jobs = batch.jobs(verb=verbosity-1) infiles, chunkdict = checkchuncks(sample,outdir=outdir,channel=channel,tag=tag,jobs=jobs, checkqueue=checkqueue,das=checkdas,verb=verbosity) nevents = sample.jobcfg['nevents'] # updated in checkchuncks else: # first-time submission infiles = sample.getfiles(verb=verbosity-1) if checkdas: nevents = sample.getnevents() chunkdict = { } if args.testrun: infiles = infiles[:2] if verbosity==1: print ">>> %-12s = %s"%('nfilesperjob',nfilesperjob_) print ">>> %-12s = %s"%('nfiles',len(infiles)) elif verbosity>=2: print ">>> %-12s = %s"%('nfilesperjob',nfilesperjob_) print ">>> %-12s = %s"%('nfiles',len(infiles)) print ">>> %-12s = [ "%('infiles') for file in infiles: print ">>> %r"%file print ">>> ]" print ">>> %-12s = %s"%('nevents',nevents) # CHUNKS infiles.sort() # to have consistent order with resubmission chunks = [ ] # chunk indices fchunks = chunkify(infiles,nfilesperjob_) # file chunks nfiles = len(infiles) nchunks = len(fchunks) if 
verbosity>=1: print ">>> %-12s = %s"%('nchunks',nchunks) if verbosity>=2: print '-'*80 # WRITE JOB LIST with arguments per job if args.verbosity>=1: print ">>> Creating job list %s..."%(joblist) with open(joblist,'w') as listfile: ichunk = 0 for fchunk in fchunks: while ichunk in chunkdict: ichunk += 1 # allows for different nfilesperjob on resubmission continue jobfiles = ' '.join(fchunk) # list of input files filetag = postfix if 'skim' not in channel: filetag += "_%d"%(ichunk) jobcmd = processor if 'skim' in channel: jobcmd += " -y %s --copydir %s -t %s --jec-sys"%(era,outdir,filetag) elif 'test' in channel: jobcmd += " -o %s -t %s -i %s"%(outdir,filetag) else: jobcmd += " -y %s -c %s -M %s --copydir %s -t %s"%(era,channel,module,outdir,filetag) if prefetch: jobcmd += " -p" jobcmd += " -i %s"%(jobfiles) # add last if args.verbosity>=1: print jobcmd listfile.write(jobcmd+'\n') chunkdict[ichunk] = fchunk chunks.append(ichunk) # JSON CONFIG jobcfg = OrderedDict([ ('time',str(datetime.now())), ('group',sample.group), ('paths',sample.paths), ('name',sample.name), ('nevents',nevents), ('channel',channel), ('module',module), ('jobname',jobname), ('jobtag',jobtag), ('tag',tag), ('postfix',postfix), ('try',subtry), ('jobids',jobids), ('outdir',outdir), ('jobdir',jobdir), ('cfgdir',cfgdir), ('logdir',logdir), ('cfgname',cfgname), ('joblist',joblist), ('nfiles',nfiles), ('files',infiles), ('nfilesperjob',nfilesperjob_), #('nchunks',nchunks), ('nchunks',nchunks), ('chunks',chunks), ('chunkdict',chunkdict), ]) # YIELD yield jobcfg print #if args.testrun: # break # only run one sample if not found: print ">>> Did not find any samples." if verbosity>=1: print ">>> %-8s = %s"%('filters',filters) print ">>> %-8s = %s"%('vetoes',vetoes)