def expand_file_list(fileEntries):
    for fileEntry in fileEntries:
        if fileEntry.find("*") != -1:
            for file in castor.nslsl(clean_name(fileEntry)):
                yield "rfio:" + file['path']
        else:
            yield fileEntry
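All of the snippets in this listing consume the dictionaries produced by castor.nslsl; judging from how they index the results, each entry exposes at least 'path', 'file', 'size' and 'time' keys. A minimal sketch of iterating over a directory listing, assuming those keys and a hypothetical /castor path:

import TauAnalysis.Configuration.tools.castor as castor

# Hypothetical directory; any readable /castor path would do.
example_dir = "/castor/cern.ch/user/someuser/somedir/"
for file_info in castor.nslsl(example_dir):
    # 'path' = full CASTOR path, 'file' = basename,
    # 'size' in bytes, 'time' = last-modification time (assumed keys)
    print file_info['path'], file_info['size'], file_info['time']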
def local_version_current(castor_file):
    ''' Check if the local copy of [castor_file] exists and is up to date '''
    local_file = local_version(castor_file)
    if not os.path.exists(local_file):
        return False
    local_stat = os.stat(local_file)
    # Get last mod time of local file
    #local_mtime = time.ctime(local_stat.st_mtime)
    local_mtime = time.localtime(local_stat.st_mtime)
    local_size = local_stat.st_size
    # This call is memoized
    castor_stat = list(castor.nslsl(castor_file))[0]
    castor_size = castor_stat["size"]
    #castor_mtime = time.mktime(
    #    unixtime_from_timestamp(castor_stat["Last modify"]))
    castor_mtime = castor_stat['time']
    #print local_mtime, castor_mtime
    # Check sizes are same
    if local_size != castor_size:
        print "Local copy of", castor_file, " is the wrong size: %i != %i" % (
            local_size, castor_size)
        return False
    # Check local file is newer
    if local_mtime < castor_mtime:
        print "Local copy of", castor_file, " is outdated!"
        print "local:", time.asctime(local_mtime), \
                "castor:", time.asctime(castor_mtime)
        return False
    return True
def local_version_current(castor_file, local_directory = LOCAL_DIRECTORY):
    ''' Check if the local copy of [castor_file] exists and is up to date '''
    local_file = local_version(castor_file, local_directory)
    if not os.path.exists(local_file):
        return False
    local_stat = os.stat(local_file)
    # Get last mod time of local file
    #local_mtime = time.ctime(local_stat.st_mtime)
    local_mtime = time.localtime(local_stat.st_mtime)
    local_size = local_stat.st_size
    # This call is memoized
    castor_stat = None
    if is_on_castor(castor_file):
        castor_stat = list(castor.nslsl(castor_file))[0]
    elif is_on_eos(castor_file):
        castor_stat = list(eos.lsl(castor_file))[0]
    else:
        raise ValueError("Invalid fileName = %s !!" % castor_file)
    castor_size = castor_stat["size"]
    #castor_mtime = time.mktime(
    #    unixtime_from_timestamp(castor_stat["Last modify"]))
    castor_mtime = castor_stat['time']
    #print local_mtime, castor_mtime
    # Check sizes are same
    if local_size != castor_size:
        print "Local copy of", castor_file, " is the wrong size: %i != %i" % (local_size, castor_size)
        return False
    # Check local file is newer
    if local_mtime < castor_mtime:
        print "Local copy of", castor_file, " is outdated!"
        print "local:", time.asctime(local_mtime), \
                "castor:", time.asctime(castor_mtime)
        return False
    return True
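A possible driver for the check above, assuming the local_version() helper used inside the function and a made-up list of CASTOR files to mirror; rfcp is the standard CASTOR copy command:

# Hypothetical file list, not taken from the original scripts.
castor_files = [
    "/castor/cern.ch/user/someuser/somedir/skim_example_chunk_1.root",
]
for castor_file in castor_files:
    if not local_version_current(castor_file):
        # re-fetch missing or stale copies into the local mirror directory
        os.system('rfcp %s %s' % (castor_file, local_version(castor_file)))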
def castor_source(directory):
    " Build a generator that lists files in a castor directory, sorted by time "
    print "<castor_source>", directory
    # First sort by time
    files = list(castor.nslsl(directory))
    # Sort by time
    files.sort(key=lambda x: x['time'])
    for file_info in files:
        if not file_info['size']:
            print "Warning <castor_source>: file %s has size 0" % \
                    file_info['path']
        yield file_info
def crabdir_source(directory):
    #print "Getting list of files from crab dir:", directory
    crab_files = list(crab.map_lfns_to_castor(crab.lfns(directory)))
    good_ids = set(get_crab_id(file) for file in crab_files)
    # Get good crab 'ids' - a tuple of the crab job, retry, and random code
    # Figure out what castor directory we are in so we can get all the
    # information.
    #print "getting all files"
    if crab_files:
        castor_dir = os.path.dirname(crab_files[0]) + '/'
        castor_files_info = castor.nslsl(castor_dir)
        for file_info in castor_files_info:
            if get_crab_id(file_info['file']) in good_ids:
                yield file_info
samplesToAnalyze = recoSampleDefinitionsTauIdCommissioning_7TeV['SAMPLES_TO_RUN']
if len(eventSelectionsToAnalyze) == 0:
    eventSelectionsToAnalyze = eventSelections.keys()

print "samplesToAnalyze = %s" % samplesToAnalyze
print "eventSelectionsToAnalyze = %s" % eventSelectionsToAnalyze

def runCommand(commandLine):
    sys.stdout.write("%s\n" % commandLine)
    args = shlex.split(commandLine)
    retVal = subprocess.Popen(args, stdout = subprocess.PIPE)
    retVal.wait()
    return retVal

# find and delete "bad" files
files = [ file_info for file_info in castor.nslsl(harvestingFilePath) ]
for file in files:
    if file['size'] < 1000:
        runCommand("%s %s" % (executable_rfrm, file['path']))

#--------------------------------------------------------------------------------
#
# build config files for running FWLiteTauFakeRateAnalyzer macro on lxbatch
#
fileNames_FWLiteTauFakeRateAnalyzer = {}
bsubJobNames_FWLiteTauFakeRateAnalyzer = {}
bjobListFileNames_FWLiteTauFakeRateAnalyzer = {}
for sampleToAnalyze in samplesToAnalyze:
    fileNames_FWLiteTauFakeRateAnalyzer[sampleToAnalyze] = {}
    bsubJobNames_FWLiteTauFakeRateAnalyzer[sampleToAnalyze] = {}
    bjobListFileNames_FWLiteTauFakeRateAnalyzer[sampleToAnalyze] = {}
    return retVal.stdout.readlines()

def format_vstring(list_of_strings):
    retVal = ""
    for i, string_i in enumerate(list_of_strings):
        if i > 0:
            retVal += " "
        retVal += string_i
    return retVal

inputFileNames = []
if inputFilePath.find('/castor/') != -1:
    inputFileNames = [ '%s' % file_info['path'] for file_info in castor.nslsl(inputFilePath) ]
elif inputFilePath.find("/store") != -1:
    inputFileNames = [ file_info['path'] for file_info in eos.lsl(inputFilePath) ]
else:
    inputFileNames = [ '%s' % os.path.join(inputFilePath, file_name) for file_name in os.listdir(inputFilePath) ]
#print "inputFileNames = %s" % inputFileNames

inputFileNames_matched = []
for inputFileName in inputFileNames:
if sample_type == 'Z':
    inputFilePath = '/data1/veelken/CMSSW_5_2_x/skims/genHtautauLeptonPairAcc/user/veelken/CMSSW_5_2_x/skims/'
    inputFile_regex = \
      r"[a-zA-Z0-9_/:.]*genTauLeptonsPairAccSkim_ZplusJets_%s_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root" % channel
elif sample_type == 'Higgs':
    inputFilePath = '/data1/veelken/CMSSW_5_2_x/skims/genHtautauLeptonPairAcc/user/v/veelken/CMSSW_5_2_x/skims/'
    inputFile_regex = \
      r"[a-zA-Z0-9_/:.]*genTauLeptonsPairAccSkim_(ggHiggs|ggPhi|vbfHiggs)%s_%s_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root" % (massPoint, channel)
else:
    raise ValueError("Invalid sample type = %s !!" % sample_type)

# check if name of inputFile matches regular expression
inputFileNames = []
files = None
if inputFilePath.startswith('/castor/'):
    files = [ "".join([ "rfio:", file_info['path'] ]) for file_info in castor.nslsl(inputFilePath) ]
elif inputFilePath.startswith('/store/'):
    files = [ file_info['path'] for file_info in eos.lsl(inputFilePath) ]
else:
    files = [ "".join([ "file:", inputFilePath, file ]) for file in os.listdir(inputFilePath) ]
for file in files:
    #print "file = %s" % file
    inputFile_matcher = re.compile(inputFile_regex)
    if inputFile_matcher.match(file):
        inputFileNames.append(file)
#print "inputFileNames = %s" % inputFileNames

process.source.fileNames = cms.untracked.vstring(inputFileNames)
#--------------------------------------------------------------------------------

process.testSVfitTrackLikelihoodProductionSequence = cms.Sequence()
# (to avoid exception from castor that inputFilePath does not exist)
if not evtSel in recoSampleDefinitionsTauIdCommissioning_7TeV['RECO_SAMPLES'][sample]['jobs']:
    continue

inputFilePath = os.path.join(castorFilePath, evtSel, version, sample) + '/' # CV: add trailing '/'
outputFilePath = inputFilePath

print "harvesting files in inputFilePath = %s," % inputFilePath \
  + " copying harvested files to outputFilePath = %s..." % outputFilePath

plot_regex = r"dont match anything"
skim_regex = r"%s" % recoSampleDefinitionsTauIdCommissioning_7TeV['ROOT_FILE_NAMES'][evtSel].replace(
    ".root", "_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root")

if deleteOldHarvestFiles:
    print "deleting old harvest files..."
    files = [ file_info['path'] for file_info in castor.nslsl(outputFilePath) ]
    harvest_regex = r"[a-zA-Z0-9_/:.]*skim__%s_%s_%s_chunk_(?P<jobId>\d*)_(?P<hash>[a-zA-Z0-9]*).root" % (sample, evtSel, version)
    harvest_regex_matcher = re.compile(harvest_regex)
    for file in files:
        if harvest_regex_matcher.match(file):
            print "deleting file = %s" % file
            os.system('rfrm %s' % file)

def matches_either(files):
    # Check if the file matches either of the regexes we are interested in.
    # We do this to skip extra files in the directories before we pass them to
    # clean_by_crab_id
    skim_matcher = re.compile(skim_regex)
    for file in files:
        #print " unmatched file: %s" % file['path']
        if skim_matcher.match(file['file']):
    if metResolution is not None:
        retVal = "MEtRes%1.0f" % metResolution
        retVal = retVal.replace(".", "_")
    return retVal

# CV: fill mapping of fileName to number of events contained in file into temporary cache
#     in order to reduce castor file I/O
print "initializing mapping of fileNames to number of events contained in each file..."
numEventsMap = {}
fileNamesToMap = []
for sampleToAnalyze in samplesToAnalyze:
    for channelToAnalyze in channelsToAnalyze:
        inputFilePath_channel = os.path.join(inputFilePath, version, channelToAnalyze)
        inputFileNames = None
        if inputFilePath_channel.find('/castor/') != -1:
            inputFileNames = [ file_info['path'] for file_info in castor.nslsl(inputFilePath_channel) ]
        else:
            inputFileNames = os.listdir(inputFilePath_channel)
        for inputFileName in inputFileNames:
            if inputFileName.find("".join(['_', sampleToAnalyze, '_'])) != -1 or \
               inputFileName.find("".join(['/', sampleToAnalyze, '_'])) != -1:
                fileNamesToMap.append(inputFileName)
                # CV: request inputFiles located on castor to be prestaged
                #     in order to speed-up computation of numbers of events contained in each file
                #     by 'buildConfigFile_SVfitEventHypothesisAnalyzer' function later
                if inputFilePath_channel.find('/castor/') != -1:
                    commandLine = '%s -M %s -U myfiles' % (executable_stager, inputFileName)
                    runCommand(commandLine)
print " done."
#--------------------------------------------------------------------------------
import subprocess

channel = 'ZtoMuTau_tauIdEff'
configFile = 'produceTauPtResPATTuple_cfg.py'
analysisFilePath = getAnalysisFilePath(channel)
jobId = '2011Aug18'
version = 'V2exp'

samplesToAnalyze = ['Ztautau_powheg']

outputFilePath = "/castor/cern.ch/user/v/veelken/CMSSW_4_2_x/PATtuples/TauPtRes/V2exp/"

# Get all the skim files from the castor directory
skimFilePath = getBatchHarvestLocation(channel)
skim_files = [file_info['path'] for file_info in castor.nslsl(skimFilePath)]

if not os.path.isdir("lxbatch_pattuple"):
    print 'Creating directory to store the lxbatch jobs: lxbatch_pattuple'
    os.mkdir('lxbatch_pattuple')

if not os.path.isdir("lxbatch_pat_log"):
    print 'Creating directory to store the lxbatch logs: lxbatch_pat_log'
    os.mkdir('lxbatch_pat_log')

inputFile_regex = \
  r"[a-zA-Z0-9_/:.]*skim_ZtoMuTau_tauIdEff_(?P<sample>\w+)_%s_chunk_(?P<gridJob>\d*)_(?P<gridTry>\d*).root" % jobId
inputFile_matcher = re.compile(inputFile_regex)

# Function that maps a sample name to its skim file
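For illustration, the matcher built above can be applied to each skim file to recover the sample name and grid job number from the file name; the example path below is made up:

# Hypothetical skim file name following the pattern in inputFile_regex
example = "/castor/cern.ch/user/v/veelken/skim_ZtoMuTau_tauIdEff_Ztautau_powheg_2011Aug18_chunk_12_3.root"
match = inputFile_matcher.match(example)
if match:
    print match.group('sample'), match.group('gridJob')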
def submitAnalysisToLXBatch(configFile = None, channel = None, samples = None,
                            samplesToAnalyze = None, samplesToSkip = None,
                            disableFactorization = False,
                            disableSysUncertainties = False,
                            disableZrecoilCorrections = False,
                            script_directory = None, cfgdir = 'lxbatch',
                            inputFileMap = None, outputFileMap = None,
                            outputDirectory = None, queue = '1nd',
                            enableEventDumps = False, enableFakeRates = False,
                            processName = None, changeTauId = None,
                            saveFinalEvents = False, jobExtention = ''):
    """
    Submit analysis job (event selection, filling of histogram)
    to local machine
    """

    # check that configFile, channel, samples and jobId
    # parameters are defined and non-empty
    for param in ["configFile", "channel", "samples", "outputDirectory"]:
        if locals()[param] is None:
            raise ValueError("Undefined '%s' parameter!!" % param)

    jobId = reg.getJobId(channel)

    # If not specified take script directory from user preferences.
    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Make sure our output file for the scripts is okay
    if not os.path.exists(script_directory):
        os.makedirs(script_directory)

    # Get all the files in our output directory that have non-zero size
    tmp_files = set(x['file'] for x in castor.nslsl(outputDirectory) if x['size'])

    # Keep track of the files we care about
    relevant_files = set([])

    submit_file_name = 'submit_lxbatch_analysis_' + jobId + '.sh'
    with open(submit_file_name, 'w') as submit_file:
        # Loop over the samples to be analyzed
        for sample in samples['SAMPLES_TO_ANALYZE']:
            write_comment_header(submit_file, " Sample: " + sample)
            # Skip submitting crab job in case
            #  o list of samples for which crab jobs are to be submitted has been
            #    explicitely specified
            #  o sample has explicitely been requested to be skipped
            if samplesToAnalyze:
                if sample not in samplesToAnalyze:
                    print "Skipping", sample
                    continue
            if samplesToSkip:
                if sample in samplesToSkip:
                    print "Skipping", sample
                    continue

            sample_info = samples['RECO_SAMPLES'][sample]

            # Make job info
            jobInfo = {
                'channel' : channel,
                'sample'  : sample,
                'id'      : jobId
            }

            # Now build the scripts to feed to bsub
            # Find the input files
            input_files = list(inputFileMap(channel, sample, jobId))
            if len(input_files) > 0:
                print("Submitting %s in %i part(s)" % (sample, len(input_files)))
            else:
                print("No local input files for %s found !!" % sample)

            for job, file in enumerate(input_files):
                input_files = [ file ]
                # The None in the tuple indicates this file has no dependencies in
                # the batch job.
                input_files_and_jobs = [ (None, file) for file in input_files ]

                # Need to prepend file:, and strip off the directory since we
                # always have bsub rfcp the input files to the working
                # directory.
                input_files_for_cfgOptions = [ 'file:' + os.path.basename(file) for file in input_files ]

                output_file = outputFileMap(channel, sample, jobId)
                input_file_hash = jobtools.hash_files(input_files, add_time = False)
                # Add the hash of the input file so we know the provenance of all
                # files
                output_file = os.path.join(outputDirectory, output_file.replace(
                    '.root', '_' + str(job) + '_' + input_file_hash + '.root'))
                relevant_files.add(os.path.basename(output_file))

                # Uncomment to skip rerunning of old jobs
                #if os.path.basename(output_file) in tmp_files:
                #    print " done; skipping", output_file
                #    continue

                # First, prepare the configuration file
                newConfigFile = getNewConfigFileName(configFile, cfgdir, sample, jobId,
                                                     index = job, label = "@lxbatch")
                write_comment_header(submit_file, " cfg: " + newConfigFile)
                #--------------------------------------------------------------------
                # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
                jobCustomizations = []
                jobCustomizations.append("if hasattr(process, 'ntupleOutputModule'):")
                jobCustomizations.append("    process.ntupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'patTupleOutputModule'):")
                jobCustomizations.append("    process.patTupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
                jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % os.path.basename(output_file))
                HLTprocessName = 'HLT'
                if 'hlt' in samples['RECO_SAMPLES'][sample].keys():
                    HLTprocessName = samples['RECO_SAMPLES'][sample]['hlt'].getProcessName()
                jobCustomizations.append("if hasattr(process, 'hltMu'):")
                jobCustomizations.append("    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')" % HLTprocessName)
                jobCustomizations.append("if hasattr(process, 'patTrigger'):")
                jobCustomizations.append("    process.patTrigger.processName = '%s'" % HLTprocessName)
                jobCustomizations.append("if hasattr(process, 'patTriggerEvent'):")
                jobCustomizations.append("    process.patTriggerEvent.processName = '%s'" % HLTprocessName)
                if samples['RECO_SAMPLES'][sample]['type'] == 'Data':
                    jobCustomizations.append("if hasattr(process, 'prePatProductionSequence')"
                                            + " and hasattr(process, 'prePatProductionSequenceGen'):")
                    jobCustomizations.append("    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)")
                    jobCustomizations.append("if hasattr(process, 'ntupleProducer'):")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'tauGenJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')")
                jobCustomizations.append("if hasattr(process, 'patTriggerEventSequence') and hasattr(process, 'patTriggerSequence'):")
                jobCustomizations.append("    process.patDefaultSequence.replace(process.patTriggerEventSequence,")
                jobCustomizations.append("                                       process.patTriggerSequence + process.patTriggerEventSequence)")
                #jobCustomizations.append("print process.dumpPython()")
                #--------------------------------------------------------------------

                prepareConfigFile(
                    configFile = configFile, jobInfo = jobInfo, newConfigFile = newConfigFile,
                    sample_infos = samples,
                    disableFactorization = disableFactorization,
                    disableSysUncertainties = disableSysUncertainties,
                    disableZrecoilCorrections = disableZrecoilCorrections,
                    # We always copy the input files to the local directory
                    # before running cmsRun, so just take the basename
                    input_files = input_files_for_cfgOptions,
                    output_file = os.path.basename(output_file),
                    enableEventDumps = enableEventDumps,
                    enableFakeRates = enableFakeRates,
                    processName = processName,
                    saveFinalEvents = saveFinalEvents,
                    changeTauId = changeTauId,
                    customizations = jobCustomizations)

                # Build a function that constructs our log file name given the
                # job file hash.
                if not os.path.exists('lxbatch_log'):
                    os.makedirs('lxbatch_log')
                def log_file_maker(job_hash):
                    return os.path.join('lxbatch_log',
                        "_".join(['run', channel, sample, jobId, job_hash]) + '.log')

                # Build our batch job
                jobname, script = jobtools.make_bsub_script(
                    output_file, input_files_and_jobs, log_file_maker,
                    "cmsRun %s" % newConfigFile, pass_io_files = False)

                bsub_script_file = os.path.join(script_directory,
                    "_".join(['analyze' + jobExtention, sample, 'job',
                              str(job), input_file_hash]) + '.sh')
                with open(bsub_script_file, 'w') as bsub_script:
                    bsub_script.write(script)

                # Add this bsub to our submission script
                submit_file.write("bsub -q %s < %s\n" % (queue, bsub_script_file))

    print len(tmp_files)
    garbage = tmp_files - relevant_files
    print len(garbage)
    if garbage:
        print "Found %i files not generated by this job!!" % len(garbage)
        print " You should really run:"
        print " cat ana_garbage.txt | xargs -n 1 -P 10 rfrm"
        with open('ana_garbage.txt', 'w') as garbage_script:
            for file in garbage:
                garbage_script.write('%s\n' % os.path.join(outputDirectory, file))

    print "Run ./%s to submit jobs" % submit_file_name
    os.chmod(submit_file_name, 0755)

    return submit_file_name
def buildConfigFile_FWLiteTauFakeRateAnalyzer(sampleToAnalyze, evtSel, version, inputFilePath,
                                              tauIds, tauJetCandSelection, srcTauJetCandidates, srcMET,
                                              intLumiData, hltPaths, srcWeights,
                                              configFilePath, logFilePath, outputFilePath,
                                              recoSampleDefinitions):
    """Build cfg.py file to run FWLiteTauFakeRateAnalyzer macro on PAT-tuples
       and fill histograms for passed/failed samples"""

    print "inputFilePath = %s" % inputFilePath

    inputFileNames = None
    if inputFilePath.find('/castor/') != -1:
        inputFileNames = [ file_info['path'] for file_info in castor.nslsl(inputFilePath) ]
    else:
        inputFileNames = os.listdir(inputFilePath)
    #print "inputFileNames = %s" % inputFileNames

    # check if inputFile is PAT-tuple and
    # matches sampleToAnalyze, jobId
    inputFileNames_sample = []
    for inputFileName in inputFileNames:
        if inputFileName.find("chunk") != -1 and \
           inputFileName.find("".join(['_', sampleToAnalyze, '_'])) != -1:
            # CV: assume that input file gets copied to local directory before FWLiteTauFakeRateAnalyzer macro gets started
            inputFileNames_sample.append(os.path.basename(inputFileName))

    #print(sampleToAnalyze)
    #print(inputFiles_sample)

    if len(inputFileNames_sample) == 0:
        print("Sample %s, evtSel = %s has no input files --> skipping !!" % (sampleToAnalyze, evtSel))
        return

    # find name of associated "process"
    process_matched = None
    processes = recoSampleDefinitions['MERGE_SAMPLES'].keys()
    for process in processes:
        for sample in recoSampleDefinitions['MERGE_SAMPLES'][process]['samples']:
            if sample == sampleToAnalyze:
                process_matched = process

    if not process_matched:
        print("No process associated to sample %s --> skipping !!" % sampleToAnalyze)
        return

    print("building config file(s) for sample %s, evtSel %s..." % (sampleToAnalyze, evtSel))

    processType = recoSampleDefinitions['RECO_SAMPLES'][sampleToAnalyze]['type']

    tauIds_string = make_tauIds_string(tauIds)

    hltPaths_string = None
    if isinstance(hltPaths, dict):
        hltPaths_string = make_inputFileNames_vstring(hltPaths[processType])
    else:
        hltPaths_string = make_inputFileNames_vstring(hltPaths)

    weights_string = make_inputFileNames_vstring(srcWeights[processType])

    configFileNames = []
    outputFileNames = []
    logFileNames    = []

    for inputFileName_sample in inputFileNames_sample:

        inputFileName_regex = r"[a-zA-Z0-9_./]*skim_(?P<sample>\w+?)_chunk_(?P<jobId>\d*)_(?P<hash>[a-zA-Z0-9]*).root"
        inputFileName_matcher = re.compile(inputFileName_regex)
        match = inputFileName_matcher.match(inputFileName_sample)
        if not match:
            raise ValueError("Failed to parse fileName = %s !!" % inputFileName_sample)
        jobId = match.group('jobId')

        outputFileName = 'analyzeTauFakeRateHistograms_%s_%s_%s_chunk_%s.root' % (evtSel, sampleToAnalyze, version, jobId)

        allEvents_DBS = -1
        xSection = 0.0
        if not recoSampleDefinitions['MERGE_SAMPLES'][process_matched]['type'] == 'Data':
            allEvents_DBS = recoSampleDefinitions['RECO_SAMPLES'][sampleToAnalyze]['events_processed']
            xSection = recoSampleDefinitions['RECO_SAMPLES'][sampleToAnalyze]['x_sec']

        config = \
"""
import FWCore.ParameterSet.Config as cms

process = cms.PSet()

process.fwliteInput = cms.PSet(
    fileNames = cms.vstring('%s'),
    maxEvents = cms.int32(-1),
    outputEvery = cms.uint32(1000)
)

process.fwliteOutput = cms.PSet(
    fileName = cms.string('%s')
)

process.tauFakeRateAnalyzer = cms.PSet(
    process = cms.string('%s'),
    type = cms.string('%s'),
    evtSel = cms.string('%s'),
    regions = cms.vstring(
        'P',
        'F',
        'A'
    ),
    tauIds = cms.VPSet(
%s
    ),
    srcTauJetCandidates = cms.InputTag('%s'),
    tauJetCandSelection = cms.vstring(
%s
    ),
    srcTrigger = cms.InputTag('patTriggerEvent'),
    hltPaths = cms.vstring(%s),
    srcMET = cms.InputTag('%s'),
    srcVertices = cms.InputTag('selectedPrimaryVertexPosition'),
    weights = cms.VInputTag(%s),
    # CV: 'srcEventCounter' is defined in TauAnalysis/Skimming/test/skimTauIdEffSample_cfg.py
    srcEventCounter = cms.InputTag('totalEventsProcessed'),
    allEvents_DBS = cms.int32(%i),
    xSection = cms.double(%f),
    intLumiData = cms.double(%f),
    srcLumiProducer = cms.InputTag('lumiProducer')
)
""" % (inputFileName_sample, outputFileName,
       process_matched, processType, evtSel, tauIds_string,
       srcTauJetCandidates, tauJetCandSelection, hltPaths_string,
       srcMET, weights_string,
       allEvents_DBS, xSection, intLumiData)

        outputFileNames.append(outputFileName)

        configFileName = "analyzeTauFakeRatePATtuple_%s_%s_%s_cfg.py" % (evtSel, sampleToAnalyze, jobId)
        configFileName_full = os.path.join(configFilePath, configFileName)
        configFile = open(configFileName_full, "w")
        configFile.write(config)
        configFile.close()
        configFileNames.append(configFileName)

        logFileName = configFileName.replace('_cfg.py', '.log')
        logFileName_full = os.path.join(logFilePath, logFileName)
        logFileNames.append(logFileName)

    retVal = {}
    retVal['inputFileNames']  = inputFileNames_sample
    retVal['configFileNames'] = configFileNames
    retVal['outputFileNames'] = outputFileNames
    retVal['logFileNames']    = logFileNames

    #print " inputFileNames = %s" % inputFileNames_sample
    #print " configFileNames = %s" % configFileNames
    #print " outputFileNames = %s" % outputFileNames
    #print " logFileNames = %s" % logFileNames

    return retVal
def buildConfigFile_SVfitEventHypothesisAnalyzer(sampleToAnalyze, channelToAnalyze, metResolution,
                                                 configFileName_template, inputFilePath,
                                                 numInputFilesPerJob, maxEventsPerJob,
                                                 configFilePath, logFilePath, outputFilePath,
                                                 numEventsMap = None):
    """Build cfg.py file to run SVfit algorithm and fill histograms of SVfit reconstructed mass"""

    #print "inputFilePath = %s" % inputFilePath

    inputFileNames = None
    if inputFilePath.find('/castor/') != -1:
        inputFileNames = [ file_info['path'] for file_info in castor.nslsl(inputFilePath) ]
    else:
        inputFileNames = os.listdir(inputFilePath)
    #print "inputFileNames = %s" % inputFileNames

    # check if inputFile matches sampleToAnalyze
    inputFileNames_sample = []
    for inputFileName in inputFileNames:
        if inputFileName.find("".join(['_', sampleToAnalyze, '_'])) != -1 or \
           inputFileName.find("".join(['/', sampleToAnalyze, '_'])) != -1:
            # CV: assume that input file gets copied to local directory before cmsRun gets started
            inputFileNames_sample.append(os.path.basename(inputFileName))

    #print(sampleToAnalyze)
    #print(inputFileNames_sample)

    if len(inputFileNames_sample) == 0:
        print("Sample %s, channel = %s has no input files --> skipping !!" % (sampleToAnalyze, channelToAnalyze))
        return

    # CV: restrict the number of input files to 50 in order to balance event statistics
    #     (and computing time) for different mass-points
    if len(inputFileNames_sample) > 50:
        inputFileNames_sample = inputFileNames_sample[0:50]

    numInputFiles = len(inputFileNames_sample)
    numInputFileGroups = (numInputFiles / numInputFilesPerJob)
    if (numInputFiles % numInputFilesPerJob) != 0:
        numInputFileGroups = numInputFileGroups + 1

    inputFileNameGroups_sample = []
    skipEvents_sample = []
    for fileId in range(numInputFileGroups):
        inputFileIdx_first = fileId*numInputFilesPerJob
        inputFileIdx_last = inputFileIdx_first + numInputFilesPerJob
        if inputFileIdx_last > len(inputFileNames_sample):
            inputFileIdx_last = len(inputFileNames_sample)
        #print "inputFileIdx: first = %i, last = %i" % (inputFileIdx_first, inputFileIdx_last)
        numEvents = 0
        for inputFileIdx in range(inputFileIdx_first, inputFileIdx_last):
            inputFileName = inputFileNames_sample[fileId]
            numEvents_i = None
            if numEventsMap is not None and numEventsMap.has_key(os.path.basename(inputFileName)):
                numEvents_i = numEventsMap[inputFileName]
            else:
                numEvents_i = getNumEvents(os.path.join(inputFilePath, inputFileNames_sample[fileId]))
            numEvents = numEvents + numEvents_i
        numJobsPerGroup = (numEvents / maxEventsPerJob)
        if (numEvents % maxEventsPerJob) != 0:
            numJobsPerGroup = numJobsPerGroup + 1
        print "group of inputFiles = %s contains %i events --> splitting into %i jobs." % \
          (inputFileNames_sample[inputFileIdx_first:inputFileIdx_last], numEvents, numJobsPerGroup)
        for jobId in range(numJobsPerGroup):
            inputFileNameGroups_sample.append(inputFileNames_sample[inputFileIdx_first:inputFileIdx_last])
            skipEvents_sample.append(jobId*maxEventsPerJob)
    #print "inputFileNameGroups_sample = %s" % inputFileNameGroups_sample

    configFileNames = []
    outputFileNames = []
    logFileNames = []
    numJobs = len(inputFileNameGroups_sample)
    for jobId in range(numJobs):
        inputFileNames_string = "[ "
        for inputFileName_sample in inputFileNameGroups_sample[jobId]:
            inputFileNames_string += "'file:%s', " % inputFileName_sample
        inputFileNames_string += " ]"

        sample_type = None
        sample_type_Z_regex = "[Ztautau|ZplusJets|ZToTauTau]"
        sample_type_Z_matcher = re.compile(sample_type_Z_regex)
        sample_type_Higgs_regex = "(((gg|bb|vbf)(Higgs|Phi))|HToTauTau_M-)[0-9]+"
        sample_type_Higgs_matcher = re.compile(sample_type_Higgs_regex)
        if sample_type_Z_matcher.match(sampleToAnalyze):
            sample_type = 'Z'
        elif sample_type_Higgs_matcher.match(sampleToAnalyze):
            sample_type = 'Higgs'
        else:
            raise ValueError("Failed to determine whether sample = %s is Z or Higgs sample !!" % sampleToAnalyze)

        metResolution_string = "None"
        metResolution_label = "pfMEtResMC"
        if metResolution is not None:
            metResolution_string = "%f" % metResolution
            metResolution_label = "pfMEtRes%1.0f" % metResolution
            metResolution_label = metResolution_label.replace(".", "_")

        outputFileName = 'svFitPerformanceAnalysisPlots_%s_%s_%s_%i.root' % \
          (sampleToAnalyze, channelToAnalyze, metResolution_label, jobId + 1)
        outputFileNames.append(outputFileName)

        replacements = []
        replacements.append([ 'sample',         "'%s'" % sampleToAnalyze ])
        replacements.append([ 'sample_type',    "'%s'" % sample_type ])
        replacements.append([ 'channel',        "'%s'" % channelToAnalyze ])
        replacements.append([ 'metResolution',  "%s" % metResolution_string ])
        replacements.append([ 'skipEvents',     "%i" % skipEvents_sample[jobId] ])
        replacements.append([ 'maxEvents',      "%i" % maxEventsPerJob ])
        replacements.append([ 'inputFileNames', "%s" % inputFileNames_string ])
        replacements.append([ 'outputFileName', "'%s'" % outputFileName ])

        configFileName = "svFitPerformanceAnalysisPlots_%s_%s_%s_%i_cfg.py" % \
          (sampleToAnalyze, channelToAnalyze, metResolution_label, jobId)
        configFileName_full = os.path.join(configFilePath, configFileName)
        replaceConfigFileParam(configFileName_template, configFileName_full, replacements)
        configFileNames.append(configFileName)

        logFileName = configFileName.replace('_cfg.py', '.log')
        logFileName_full = os.path.join(logFilePath, logFileName)
        logFileNames.append(logFileName)

    retVal = {}
    retVal['inputFileNames']  = inputFileNameGroups_sample
    retVal['configFileNames'] = configFileNames
    retVal['outputFileNames'] = outputFileNames
    retVal['logFileNames']    = logFileNames

    #print " inputFileNames = %s" % inputFileNames_sample
    #print " configFileNames = %s" % configFileNames
    #print " outputFileNames = %s" % outputFileNames
    #print " logFileNames = %s" % logFileNames

    return retVal
mode = None
if inputFilePath.find('/castor/') == 0:
    mode = 'castor'
elif inputFilePath.find('/store/') == 0:
    mode = 'eos'
else:
    mode = 'local'

if jobId is None:
    reg.overrideJobId(channel, '2011Oct30') # CV: need to overwrite this in order to match Mauro's filenames
jobId = reg.getJobId(channel)
print(" jobId = %s" % jobId)

if mode == 'castor':
    files = [ file_info for file_info in castor.nslsl(inputFilePath) ]
elif mode == 'eos':
    files = [ file_info for file_info in eos.lsl(inputFilePath) ]
else:
    commandLine = '%s %s' % (options['executable_ls'][mode], inputFilePath)
    args = shlex.split(commandLine)
    retval = subprocess.Popen(args, stdout = subprocess.PIPE)
    #retval.wait()
    files = retval.stdout.read().split('\n')
#print(" files = %s" % files)

fileName_regex = r"(?P<fileName_base>[a-zA-Z0-9_]+)_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root"
fileName_matcher = re.compile(fileName_regex)

fileNamesAndProperties_dict = {}
if not numJobs:
    raise ValueError("Failed to read number of jobs from log file %s !!" % crabLogFileName)
print "numJobs = %i" % numJobs

# read list of files existing in output file path
if checkJobOutputFiles:
    outputFilePath = outputFilePath_prefix
    if outputFilePath_suffix:
        if not (outputFilePath.endswith('/') or outputFilePath_suffix.startswith('/')):
            outputFilePath += '/'
        outputFilePath += outputFilePath_suffix
    if not outputFilePath.endswith('/'):
        outputFilePath += '/'
    if outputFilePath.find("/castor/") != -1:
        print("checking castor files in outputFilePath = %s" % outputFilePath)
        outputFileInfos = [ outputFileInfo for outputFileInfo in castor.nslsl(outputFilePath) ]
    elif outputFilePath.find("/dpm/") != -1:
        if publish_data:
            datasetpath_items = datasetpath.split('/')
            for idx in range(len(datasetpath_items)):
                if len(datasetpath_items[idx]) > 0:
                    outputFilePath += datasetpath_items[idx]
                    if not outputFilePath.endswith('/'):
                        outputFilePath += '/'
                    break
            if not publish_data_name:
                raise ValueError("Invalid 'publish_data_name' = %s !!" % publish_data_name)
            outputFilePath += publish_data_name
            if not outputFilePath.endswith('/'):
                outputFilePath += '/'
            print("checking DPM files in outputFilePath = %s" % outputFilePath)
process = cms.PSet()

process.fwliteInput = cms.PSet(
    fileNames = cms.vstring(),
    maxEvents = cms.int32(-1),
    outputEvery = cms.uint32(1000)
)
#--------------------------------------------------------------------------------
inputFilePath = '/data1/veelken/CMSSW_4_2_x/Ntuples/user/v/veelken/CMSSW_4_2_x/Ntuples/neuralMtautauTraining/v1_5'
inputFileNames = []
if inputFilePath.find('/castor/') != -1:
    inputFileNames = [ 'rfio:%s' % file_info['path'] for file_info in castor.nslsl(inputFilePath) ]
else:
    inputFileNames = [ 'file:%s' % os.path.join(inputFilePath, file_name) for file_name in os.listdir(inputFilePath) ]

inputFile_regex = \
  r"[a-zA-Z0-9_/:.]*neuralMtautauNtuple_(?P<sample>[a-zA-Z0-9_]+)_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root"
inputFile_matcher = re.compile(inputFile_regex)

inputFileNames_matched = []
for inputFileName in inputFileNames:
    if inputFile_matcher.match(inputFileName):
        inputFileNames_matched.append(inputFileName)
#print "inputFileNames_matched = %s" % inputFileNames_matched

setattr(process.fwliteInput, "fileNames", cms.vstring(inputFileNames_matched))
    'data_Mu_Run2010B_Nov4ReReco'
]

# If this is a list, only the items in the list will be analyzed.
samplesToAnalyze = []
#samplesToAnalyze = fake_rate_samples

# Where we will send the output on castor
outputPath = reg.getAnalysisFilePath(channel)
jobId = reg.getJobId(channel)

# Figure out where our root files were stored for the desired skim
skimPath = reg.getSkimEvents(channel)

# Get all the skim files from the castor directory
skim_files = [os.path.join(skimPath, file) for file in
              filter(lambda x: x.startswith('skim_'),
                     (file_info['file'] for file_info in castor.nslsl(skimPath)))]

def inputFileMapper(channel, sample, jobId):
    for file in skim_files:
        if file.find('_' + sample + '_') != -1:
            yield file

enableFakeRates = False
enableSystematics = False
changeTauId = None
saveFinalEvents = False
eventList = None

submit.submitAnalysisToLXBatch(
    configFile=configFile, channel=channel,
#!/usr/bin/env python

import TauAnalysis.Configuration.tools.castor as castor
import TauAnalysis.TauIdEfficiency.tools.castor_mirror2 as castor_mirror

import subprocess
import shlex

# Get all the skim files from the castor directory
sourceFilePath = "/castor/cern.ch/user/v/veelken/CMSSW_4_2_x/PATtuples/TauPtRes/V2exp/"
source_files = [ file_info['path'] for file_info in castor.nslsl(sourceFilePath) ]

targetFilePath = "/data2/veelken/CMSSW_4_2_x/PATtuples/TauPtRes/V2exp/"

jobId = "2011Aug18"
version = "V2exp"

samplesToCopy = [
    # modify in case you want to submit jobs for some of the samples only...
]

files_to_copy = []
for source_file in source_files:
    if source_file.find("%s%s" % (jobId, version)) == -1:
        continue
    isSampleToCopy = False
    if len(samplesToCopy) == 0:
        isSampleToCopy = True
            ]
        else:
            raise ValueError("Invalid mass-point = %i !!" % massPoint)
    else:
        raise ValueError("Invalid channel = %s !!" % channel)
    inputFile_regex = \
      r"[a-zA-Z0-9_/:.]*genTauLeptonPairSkim_(ggHiggs|ggPhi|vbfHiggs)%i_%s_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root" % (massPoint, channel)
else:
    raise ValueError("Invalid sample type = %s !!" % sample_type)

# check if name of inputFile matches regular expression
inputFileNames = []
for inputFilePath in inputFilePaths:
    files = None
    if inputFilePath.startswith('/castor/'):
        files = [ "".join([ "rfio:", file_info['path'] ]) for file_info in castor.nslsl(inputFilePath) ]
    elif inputFilePath.startswith('/store/'):
        files = [ file_info['path'] for file_info in eos.lsl(inputFilePath) ]
    else:
        files = [ "".join([ "file:", inputFilePath, file ]) for file in os.listdir(inputFilePath) ]
    for file in files:
        #print "file = %s" % file
        inputFile_matcher = re.compile(inputFile_regex)
        if inputFile_matcher.match(file):
            inputFileNames.append(file)
print "inputFileNames = %s" % inputFileNames

process.source.fileNames = cms.untracked.vstring(inputFileNames)
#--------------------------------------------------------------------------------

process.testSVfitTrackLikelihoodProductionSequence = cms.Sequence()
bsubScriptFileNames = {}
bsubJobNames = {}
for sampleToAnalyze in samplesToAnalyze:
    print "checking sample %s" % sampleToAnalyze

    bsubFileNames[sampleToAnalyze] = {}
    bsubScriptFileNames[sampleToAnalyze] = {}
    bsubJobNames[sampleToAnalyze] = {}

    inputFilePath = samples[sampleToAnalyze]['skimFilePath']
    print " inputFilePath = %s" % inputFilePath

    inputFileNames = None
    if inputFilePath.find("/castor") != -1:
        inputFileNames = [ file_info['path'] for file_info in castor.nslsl(inputFilePath) ]
    elif inputFilePath.find("/store") != -1:
        inputFileNames = [ file_info['path'] for file_info in eos.lsl(inputFilePath) ]
    else:
        inputFileNames = [ file for file in os.listdir(inputFilePath) ]
    #print " inputFileNames = %s" % inputFileNames

    inputFileNames_matched = [
        os.path.basename(input_file) for input_file in input_mapper(inputFileNames, sampleToAnalyze)
    ]
    #print "inputFileNames_matched = %s" % inputFileNames_matched
    print "--> found %i inputFiles" % len(inputFileNames_matched)

    for jobId, inputFileNames_chunk in enumerate(chunks(inputFileNames_matched,
                                                        samples[sampleToAnalyze]['numInputFilesPerJob'])):
        # Build script for batch job submission;
        # the None in the tuple indicates that batch job has no dependencies on other batch jobs
        input_files_and_jobs = \
          [ (None, os.path.join(inputFilePath, inputFileName)) for inputFileName in inputFileNames_chunk ]
def harvestTauIdEffSamples(channel = None, samples = None, inputFilePath = None,
                           outputFilePath = None, jobId = None, tmpFilePath = None):
    # check that channel, samples, inputFilePath, outputFilePath, tmpFilePath and jobId
    # parameters are defined and non-empty
    if channel is None:
        raise ValueError("Undefined channel Parameter !!")
    if samples is None:
        raise ValueError("Undefined samples Parameter !!")
    if inputFilePath is None:
        raise ValueError("Undefined inputFilePath Parameter !!")
    if outputFilePath is None:
        raise ValueError("Undefined outputFilePath Parameter !!")
    if tmpFilePath is None:
        raise ValueError("Undefined tmpFilePath Parameter !!")
    if jobId is None:
        raise ValueError("Undefined jobId Parameter !!")

    if not os.path.exists(tmpFilePath):
        os.mkdir(tmpFilePath)
    if not os.path.exists(outputFilePath):
        os.mkdir(outputFilePath)

    # Use CASTOR to find the files to merge
    print "Finding CASTOR files"
    print(" inputFilePath = " + inputFilePath)
    print(" jobId = " + jobId)
    files_in_castor_info = castor.nslsl(inputFilePath)
    files_and_times = [
        (file_info['time'], file_info['path'])
        for file_info in files_in_castor_info
        if file_info['file'].find('_%s_' % jobId) != -1
    ]

    # Sort files by modified time
    print "Sorting by modified time"
    files_and_times.sort()
    #print "files_and_times", files_and_times

    skim_harvest_jobs = []
    for sample in SAMPLES_TO_ANALYZE:
        print "Finding input files for", sample
        # Get final event skims that need to be merged
        event_files_to_merge = list(
            'rfio:%s' % file for time, file in files_and_times
            if file.find('tauIdEffSample_%s_%s_' % (sample, jobId)) != -1)
        #print "event_files_to_merge", event_files_to_merge
        skim_output_path = os.path.join(
            outputFilePath, "tauIdEffSkim_%s_%s.root" % (sample, jobId))
        skim_harvest_jobs.append(
            (sample, skim_output_path, event_files_to_merge))

    print "Creating Makefile for skimmed event files"
    skim_MakefileName = "Makefile.mergeTauIdEffSkims_%s" % (jobId)
    buildMakefile(skim_harvest_jobs, tmpFilePath, skim_MakefileName,
                  merge_per_job = 7, harvest_tool = 'genericSkimMerger.py')

    print "Makefile built. In order to start harvesting, execute 'make -f %s -j 8 -k'" % skim_MakefileName
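A call to the harvesting function might look like the sketch below; every argument value here is a placeholder chosen for illustration, not taken from the original scripts:

harvestTauIdEffSamples(
    channel = 'ZtoMuTau_tauIdEff',
    samples = recoSampleDefinitionsTauIdCommissioning_7TeV,      # placeholder sample definitions
    inputFilePath = '/castor/cern.ch/user/someuser/harvesting/', # hypothetical CASTOR path
    outputFilePath = '/tmp/someuser/harvested/',
    jobId = '2011Aug18',
    tmpFilePath = '/tmp/someuser/merge_tmp/')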