def submitAnalysisToGrid(configFile = None, channel = None, samples = None,
                         outputFilePath = None, jobId = None,
                         samplesToAnalyze = None, samplesToSkip = None,
                         disableFactorization = False,
                         disableSysUncertainties = False,
                         disableZrecoilCorrections = False,
                         create = True, submit = True, cfgdir = 'crab',
                         inputFileMap = None, outputFileMap = None,
                         enableEventDumps = False, enableFakeRates = False,
                         processName = None, savePlots = True,
                         saveFinalEvents = False, outsideCERN = False,
                         useCastor = True, doApplyCfgOptions = True):
    """
    Submit analysis jobs (event selection, filling of histograms) via CRAB
    """

    # check that the configFile, channel, samples, outputFilePath and jobId
    # parameters are defined and non-empty
    if configFile is None:
        raise ValueError("Undefined 'configFile' Parameter !!")
    if channel is None:
        raise ValueError("Undefined 'channel' Parameter !!")
    if samples is None:
        raise ValueError("Undefined 'samples' Parameter !!")
    if outputFilePath is None:
        raise ValueError("Undefined 'outputFilePath' Parameter !!")
    if jobId is None:
        raise ValueError("Undefined 'jobId' Parameter !!")

    # Loop over the samples to be analyzed
    for sample in samples['SAMPLES_TO_ANALYZE']:
        # Skip submitting the CRAB job in case
        #  o a list of samples for which CRAB jobs are to be submitted has been
        #    explicitly specified
        #  o the sample has explicitly been requested to be skipped
        if samplesToAnalyze:
            if sample not in samplesToAnalyze:
                print "Skipping", sample
                continue
        if samplesToSkip:
            if sample in samplesToSkip:
                print "Skipping", sample
                continue
        print "Submitting", sample

        sample_info = samples['RECO_SAMPLES'][sample]

        # Make job info
        jobInfo = {
            'channel' : channel,
            'sample' : sample,
            'id' : jobId
        }

        #if outsideCERN:
        #    configFile = "%s_cfg.py" % channel

        newConfigFile = getNewConfigFileName(configFile, cfgdir, sample, jobId, label = "@Grid")

        # Check if we want to use special files for the produced cfg file.
        # A file map is a function that takes a sample name and returns the list
        # of files corresponding to that sample. If it returns None, no change
        # is made.
        input_files = None
        if inputFileMap is not None:
            input_files = inputFileMap(sample)
            if input_files is None:
                print "Warning: No special input files specified for sample %s, using default." % sample
        output_file = None
        if outputFileMap is not None:
            output_file = outputFileMap(channel, sample, jobId)

        #--------------------------------------------------------------------
        # CV: temporary "hack" for producing (ED)Ntuples/skims
        jobCustomizations = []
        jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
        jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % output_file)
        #jobCustomizations.append("print process.dumpPython()")
        #--------------------------------------------------------------------

        prepareConfigFile(
            configFile = configFile, jobInfo = jobInfo, newConfigFile = newConfigFile,
            sample_infos = samples,
            disableFactorization = disableFactorization,
            disableSysUncertainties = disableSysUncertainties,
            disableZrecoilCorrections = disableZrecoilCorrections,
            input_files = input_files, output_file = output_file,
            enableEventDumps = enableEventDumps, enableFakeRates = enableFakeRates,
            processName = processName,
            saveFinalEvents = saveFinalEvents,
            customizations = jobCustomizations,
            doApplyOptions = doApplyCfgOptions)

        output_files = []
        if output_file is not None:
            output_files.append(output_file)
        # Always include the plot files
        if savePlots:
            output_files.append("%s_%s_%s_%s.root" % (
                PLOT_FILES_PREFIX, jobInfo['channel'], jobInfo['sample'], jobInfo['id']))
        # Add our final event skim as well
        if saveFinalEvents:
            output_files.append("final_events_%s_%s_%s.root" % (
                jobInfo['channel'], jobInfo['sample'], jobInfo['id']))

        # Check if we need to reformat the output file path
        if outputFilePath.startswith('/castor/cern.ch'):
            outputFilePath = outputFilePath.replace('/castor/cern.ch', '')

        saveNtuple = 0
        if 'saveNtuple' in sample_info and sample_info['saveNtuple'] is True:
            saveNtuple = 1

        # Build CRAB options
        crabOptions = {
            'number_of_jobs' : _number_of_jobs(sample_info),
            'datasetpath' : sample_info['datasetpath'],
            'dbs_url' : sample_info['dbs_url'],
            'user_remote_dir' : outputFilePath,
            'output_file' : ", ".join(output_files),
            'get_edm_output' : saveNtuple,
            # split Data(-like) samples by luminosity section, MC samples by event count
            'split_type' : 'lumis' if sample_info['type'] in ('Data', 'embeddedData') else 'events',
            'lumi_mask' : sample_info['lumi_mask'],
            'runselection' : sample_info['runselection'],
            'SE_white_list' : sample_info['SE_white_list'],
            'SE_black_list' : sample_info['SE_black_list']
        }
        if outsideCERN:
            crabOptions['use_server'] = 0
            crabOptions['scheduler'] = 'condor'
            if not useCastor:
                crabOptions['return_data'] = 1
                crabOptions['copy_data'] = 0

        submitToGrid(newConfigFile, jobInfo, crabOptions,
                     create = create, submit = submit, cfgdir = cfgdir)
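#--------------------------------------------------------------------------
# Example usage of submitAnalysisToGrid (a minimal sketch, kept commented
# out so it is not executed on import): the 'samples' dictionary is assumed
# to follow the SAMPLES_TO_ANALYZE/RECO_SAMPLES layout read above; the cfg
# file name, CASTOR path, jobId and sample names below are all hypothetical.
#
#   submitAnalysisToGrid(
#       configFile = 'runZtoMuTau_cfg.py',               # hypothetical cfg file
#       channel = 'ZtoMuTau',
#       samples = recoSampleDefinitions,                 # hypothetical sample dict
#       outputFilePath = '/castor/cern.ch/user/j/jdoe/ZtoMuTau/',
#       jobId = '2011Jul01',
#       samplesToAnalyze = [ 'Ztautau' ],                # hypothetical sample name
#       outputFileMap = lambda channel, sample, jobId:
#           'ntuple_%s_%s_%s.root' % (channel, sample, jobId))
#--------------------------------------------------------------------------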
def submitAnalysisToLocal(configFile = None, channel = None, samples = None, jobId = None,
                          samplesToAnalyze = None, samplesToSkip = None,
                          disableFactorization = False,
                          disableSysUncertainties = False,
                          disableZrecoilCorrections = False,
                          maxEvents = 25000, maxJobsConcurrently = 8,
                          cfgdir = 'local', submit = False, logFilePath = None,
                          inputFileMap = None, outputFileMap = None,
                          enableEventDumps = False, enableFakeRates = False,
                          processName = None, saveFinalEvents = False):
    """
    Submit analysis jobs (event selection, filling of histograms) to the local machine
    """

    # check that the configFile, channel, samples, jobId and logFilePath
    # parameters are defined and non-empty
    if configFile is None:
        raise ValueError("Undefined 'configFile' Parameter !!")
    if channel is None:
        raise ValueError("Undefined 'channel' Parameter !!")
    if samples is None:
        raise ValueError("Undefined 'samples' Parameter !!")
    if jobId is None:
        raise ValueError("Undefined 'jobId' Parameter !!")
    if logFilePath is None:
        raise ValueError("Undefined 'logFilePath' Parameter !!")

    cfgFiles = []

    # Loop over the samples to be analyzed
    for sample in samples['SAMPLES_TO_ANALYZE']:
        # Skip submitting the job in case
        #  o a list of samples for which jobs are to be submitted has been
        #    explicitly specified
        #  o the sample has explicitly been requested to be skipped
        if samplesToAnalyze:
            if sample not in samplesToAnalyze:
                print "Skipping", sample
                continue
        if samplesToSkip:
            if sample in samplesToSkip:
                print "Skipping", sample
                continue

        sample_info = samples['RECO_SAMPLES'][sample]

        # Make job info
        jobInfo = {
            'channel' : channel,
            'sample' : sample,
            'id' : jobId
        }

        newConfigFile = getNewConfigFileName(configFile, cfgdir, sample, jobId, label = "@local")

        fileInfos = getInputFiles(jobInfo, inputFileMap, maxEvents = maxEvents)
        if len(fileInfos) > 0:
            print("Submitting %s in %i part(s)" % (sample, len(fileInfos)))
        else:
            print("No local input files for %s found !!" % sample)

        for job, fileInfo in enumerate(fileInfos):
            print("job %i:" % (job + 1))

            sample_infos = copy.deepcopy(samples)
            sample_infos['RECO_SAMPLES'][sample].update({ 'maxEvents' : fileInfo['maxEvents'] })
            sample_infos['RECO_SAMPLES'][sample].update({ 'skipEvents' : fileInfo['skipEvents'] })

            input_files = [ fileInfo['fileName'] ]

            output_file = outputFileMap(channel, sample, jobId)
            if len(fileInfos) > 1:
                output_file = output_file.replace(".root", "_%i.root" % (job + 1))
            #print(" output_file = %s" % output_file)

            jobConfigFile = newConfigFile
            if len(fileInfos) > 1:
                jobConfigFile = jobConfigFile.replace("_cfg.py", "_%i_cfg.py" % (job + 1))

            #--------------------------------------------------------------------
            # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
            jobCustomizations = []
            jobCustomizations.append("if hasattr(process, 'ntupleOutputModule'):")
            jobCustomizations.append("    process.ntupleOutputModule.fileName = '%s'" % output_file)
            jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
            jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % output_file)
            HLTprocessName = 'HLT'
            if 'hlt' in sample_infos['RECO_SAMPLES'][sample]:
                HLTprocessName = sample_infos['RECO_SAMPLES'][sample]['hlt'].getProcessName()
            jobCustomizations.append("if hasattr(process, 'hltMu'):")
            jobCustomizations.append("    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')" % HLTprocessName)
            jobCustomizations.append("process.patTrigger.processName = '%s'" % HLTprocessName)
            jobCustomizations.append("process.patTriggerEvent.processName = '%s'" % HLTprocessName)
            if sample_infos['RECO_SAMPLES'][sample]['type'] == 'Data':
                jobCustomizations.append("if hasattr(process, 'prePatProductionSequence')"
                                         " and hasattr(process, 'prePatProductionSequenceGen'):")
                jobCustomizations.append("    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)")
                jobCustomizations.append("if hasattr(process, 'ntupleProducer'):")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'tauGenJets')")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genJets'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genJets')")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')")
            #jobCustomizations.append("print process.dumpPython()")
            #--------------------------------------------------------------------

            prepareConfigFile(
                configFile = configFile, jobInfo = jobInfo, newConfigFile = jobConfigFile,
                sample_infos = sample_infos,
                disableFactorization = disableFactorization,
                disableSysUncertainties = disableSysUncertainties,
                disableZrecoilCorrections = disableZrecoilCorrections,
                input_files = input_files, output_file = output_file,
                enableEventDumps = enableEventDumps, enableFakeRates = enableFakeRates,
                processName = processName,
                saveFinalEvents = saveFinalEvents,
                customizations = jobCustomizations)

            cfgFiles.append(jobConfigFile)

    # Build a script to run all jobs locally
    script = '''#!/usr/bin/env python

import os
import subprocess
import threading
import Queue

class Worker(threading.Thread):
    def __init__(self, work_queue):
        super(Worker, self).__init__()
        self.work_queue = work_queue

    def run(self):
        while True:
            try:
                cfgFileName = self.work_queue.get()
                self.process(cfgFileName)
            finally:
                self.work_queue.task_done()
'''
    script += "\n"
    script += "    def process(self, cfgFileName):\n"
    script += "        logFilePath = '%s'\n" % logFilePath
    script += "        logFileName = os.path.basename(cfgFileName).replace('_cfg.py', '.log')\n"
    script += "        commandLine = 'rm -f %s' % os.path.join(logFilePath, logFileName)\n"
    # the 'rm' command must actually be executed before commandLine is
    # reused for cmsRun (in the original it was built but never run)
    script += "        subprocess.call(commandLine, shell = True)\n"
    script += "        commandLine = 'nice cmsRun %s >& %s' % (cfgFileName, os.path.join(logFilePath, logFileName))\n"
    script += "        print('calling %s...' % commandLine)\n"
    script += "        subprocess.call(commandLine, shell = True)\n"
    script += "\n"
    script += "work_queue = Queue.Queue()\n"
    script += "\n"
    script += "for iThread in range(%i):\n" % maxJobsConcurrently
    script += "    worker = Worker(work_queue)\n"
    script += "    worker.daemon = True\n"
    script += "    worker.start()\n"
    script += "\n"
    for cfgFile in cfgFiles:
        script += "work_queue.put('%s')\n" % cfgFile
    script += "\n"
    script += "work_queue.join()\n"

    scriptFileName = 'local/runAnalysis_%s_%s.py' % (channel, jobId)
    scriptFile = open(scriptFileName, 'w')
    scriptFile.write(script)
    scriptFile.close()
    print("Built %s script" % scriptFileName)

    if submit:
        subprocess.call("python %s" % scriptFileName, shell = True)
def submitAnalysisToLXBatch(configFile = None, channel = None, samples = None,
                            samplesToAnalyze = None, samplesToSkip = None,
                            disableFactorization = False,
                            disableSysUncertainties = False,
                            disableZrecoilCorrections = False,
                            script_directory = None, cfgdir = 'lxbatch',
                            inputFileMap = None, outputFileMap = None,
                            outputDirectory = None, queue = '1nd',
                            enableEventDumps = False, enableFakeRates = False,
                            processName = None, changeTauId = None,
                            saveFinalEvents = False, jobExtention = ''):
    """
    Submit analysis jobs (event selection, filling of histograms) to the CERN LXBatch system
    """

    # check that the configFile, channel, samples and outputDirectory
    # parameters are defined and non-empty
    for param in ["configFile", "channel", "samples", "outputDirectory"]:
        if locals()[param] is None:
            raise ValueError("Undefined '%s' parameter !!" % param)

    jobId = reg.getJobId(channel)

    # If not specified, take the script directory from the user preferences.
    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Make sure the output directory for the scripts exists
    if not os.path.exists(script_directory):
        os.makedirs(script_directory)

    # Get all the files in our output directory that have non-zero size
    tmp_files = set(x['file'] for x in castor.nslsl(outputDirectory) if x['size'])

    # Keep track of the files we care about
    relevant_files = set([])

    submit_file_name = 'submit_lxbatch_analysis_' + jobId + '.sh'
    with open(submit_file_name, 'w') as submit_file:
        # Loop over the samples to be analyzed
        for sample in samples['SAMPLES_TO_ANALYZE']:
            write_comment_header(submit_file, " Sample: " + sample)
            # Skip submitting the batch job in case
            #  o a list of samples for which jobs are to be submitted has been
            #    explicitly specified
            #  o the sample has explicitly been requested to be skipped
            if samplesToAnalyze:
                if sample not in samplesToAnalyze:
                    print "Skipping", sample
                    continue
            if samplesToSkip:
                if sample in samplesToSkip:
                    print "Skipping", sample
                    continue

            sample_info = samples['RECO_SAMPLES'][sample]

            # Make job info
            jobInfo = {
                'channel' : channel,
                'sample' : sample,
                'id' : jobId
            }

            # Now build the scripts to feed to bsub

            # Find the input files
            input_files = list(inputFileMap(channel, sample, jobId))
            if len(input_files) > 0:
                print("Submitting %s in %i part(s)" % (sample, len(input_files)))
            else:
                print("No local input files for %s found !!" % sample)

            for job, input_file in enumerate(input_files):
                # One batch job per input file; use a separate name so the list
                # being iterated over is not shadowed
                job_input_files = [ input_file ]
                # The None in the tuple indicates this file has no dependencies in
                # the batch job.
                input_files_and_jobs = [ (None, file_name) for file_name in job_input_files ]

                # Need to prepend 'file:' and strip off the directory, since we
                # always have bsub rfcp the input files to the working directory.
                input_files_for_cfgOptions = [
                    'file:' + os.path.basename(file_name) for file_name in job_input_files ]

                output_file = outputFileMap(channel, sample, jobId)
                input_file_hash = jobtools.hash_files(job_input_files, add_time = False)
                # Add the hash of the input file so we know the provenance of all
                # files
                output_file = os.path.join(outputDirectory, output_file.replace(
                    '.root', '_' + str(job) + '_' + input_file_hash + '.root'))
                relevant_files.add(os.path.basename(output_file))

                # Uncomment to skip rerunning of old jobs
                #if os.path.basename(output_file) in tmp_files:
                #    print " done; skipping", output_file
                #    continue

                # First, prepare the configuration file
                newConfigFile = getNewConfigFileName(
                    configFile, cfgdir, sample, jobId, index = job, label = "@lxbatch")

                write_comment_header(submit_file, " cfg: " + newConfigFile)

                #--------------------------------------------------------------------
                # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
                jobCustomizations = []
                jobCustomizations.append("if hasattr(process, 'ntupleOutputModule'):")
                jobCustomizations.append("    process.ntupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'patTupleOutputModule'):")
                jobCustomizations.append("    process.patTupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
                jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % os.path.basename(output_file))
                HLTprocessName = 'HLT'
                if 'hlt' in sample_info:
                    HLTprocessName = sample_info['hlt'].getProcessName()
                jobCustomizations.append("if hasattr(process, 'hltMu'):")
                jobCustomizations.append("    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')" % HLTprocessName)
                jobCustomizations.append("if hasattr(process, 'patTrigger'):")
                jobCustomizations.append("    process.patTrigger.processName = '%s'" % HLTprocessName)
                jobCustomizations.append("if hasattr(process, 'patTriggerEvent'):")
                jobCustomizations.append("    process.patTriggerEvent.processName = '%s'" % HLTprocessName)
                if sample_info['type'] == 'Data':
                    jobCustomizations.append("if hasattr(process, 'prePatProductionSequence')"
                                             " and hasattr(process, 'prePatProductionSequenceGen'):")
                    jobCustomizations.append("    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)")
                    jobCustomizations.append("if hasattr(process, 'ntupleProducer'):")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'tauGenJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')")
                jobCustomizations.append("if hasattr(process, 'patTriggerEventSequence') and hasattr(process, 'patTriggerSequence'):")
                jobCustomizations.append("    process.patDefaultSequence.replace(process.patTriggerEventSequence,")
                jobCustomizations.append("        process.patTriggerSequence + process.patTriggerEventSequence)")
                #jobCustomizations.append("print process.dumpPython()")
                #--------------------------------------------------------------------

                prepareConfigFile(
                    configFile = configFile, jobInfo = jobInfo, newConfigFile = newConfigFile,
                    sample_infos = samples,
                    disableFactorization = disableFactorization,
                    disableSysUncertainties = disableSysUncertainties,
                    disableZrecoilCorrections = disableZrecoilCorrections,
                    # We always copy the input files to the local directory
                    # before running cmsRun, so just take the basename
                    input_files = input_files_for_cfgOptions,
                    output_file = os.path.basename(output_file),
                    enableEventDumps = enableEventDumps, enableFakeRates = enableFakeRates,
                    processName = processName,
                    saveFinalEvents = saveFinalEvents,
                    changeTauId = changeTauId,
                    customizations = jobCustomizations)

                # Build a function that constructs our log file name given the
                # job file hash.
                if not os.path.exists('lxbatch_log'):
                    os.makedirs('lxbatch_log')
                def log_file_maker(job_hash):
                    return os.path.join('lxbatch_log',
                        "_".join(['run', channel, sample, jobId, job_hash]) + '.log')

                # Build our batch job
                jobname, script = jobtools.make_bsub_script(
                    output_file, input_files_and_jobs, log_file_maker,
                    "cmsRun %s" % newConfigFile, pass_io_files = False)

                bsub_script_file = os.path.join(script_directory,
                    "_".join(['analyze' + jobExtention, sample, 'job',
                              str(job), input_file_hash]) + '.sh')
                with open(bsub_script_file, 'w') as bsub_script:
                    bsub_script.write(script)

                # Add this bsub to our submission script
                submit_file.write("bsub -q %s < %s\n" % (queue, bsub_script_file))

    print "Found %i files in the output directory" % len(tmp_files)
    garbage = tmp_files - relevant_files
    if garbage:
        print "Found %i files not generated by this job !!" % len(garbage)
        print "  You should really run:"
        print "    cat ana_garbage.txt | xargs -n 1 -P 10 rfrm"
        with open('ana_garbage.txt', 'w') as garbage_script:
            for file_name in garbage:
                garbage_script.write('%s\n' % os.path.join(outputDirectory, file_name))

    print "Run ./%s to submit jobs" % submit_file_name
    os.chmod(submit_file_name, 0755)

    return submit_file_name