configFileName = os.path.join(configFilePath, configFile.replace("_cfg.py", "_%s_%s_cfg.py" % (sampleToAnalyze, jobId)))
#print " configFileName = %s" % configFileName

customizeConfigFile(sampleToAnalyze, jobId, version, inputFileNames_chunk, configFile, configFileName)

outputFileName = output_mapper(sampleToAnalyze, jobId, version)

logFileName = os.path.basename(configFileName.replace('_cfg.py', '.log'))
def log_file_maker(job_hash):
    return os.path.join(logFilePath, logFileName)

histogramFileName = 'ZllRecoilCorrectionPUreweightHistograms_%s_%s_%s.root' % (sampleToAnalyze, version, jobId)

jobName, bsubScript = make_bsub_script(
    os.path.join(outputFileMachine_and_Path, outputFileName),
    input_files_and_jobs,
    log_file_maker,
    "cmsRun %s" % os.path.join(configFilePath, configFileName),
    add_output_files = [ histogramFileName ])

bsubFileNames[sampleToAnalyze][jobId] = [ outputFileName, histogramFileName ]

bsubScriptFileName = \
    os.path.join(configFilePath, logFileName.replace(".log", ".sh"))
bsubScriptFile = open(bsubScriptFileName, "w")
bsubScriptFile.write(bsubScript)
bsubScriptFile.close()
bsubScriptFileNames[sampleToAnalyze][jobId] = bsubScriptFileName

bsubJobName = "ZllRecoilPATtuple%s_%i" % (sampleToAnalyze, jobId)
bsubJobNames[sampleToAnalyze][jobId] = bsubJobName
#--------------------------------------------------------------------------------
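# ---------------------------------------------------------------------------
# Note: the log_file_maker callback handed to make_bsub_script is called with
# the hash of the job's input files and must return the path of the log file;
# the version above simply ignores the hash and reuses the name derived from
# the config file. A minimal sketch (hypothetical helper, assuming logFilePath
# is defined as above) of a variant that folds the hash into the log file name,
# as the lxbatch analysis/harvesting code further below does:

import os

def make_hashed_log_file_maker(logFilePath, baseName):
    # Returns a callback with the signature expected by make_bsub_script:
    # it receives the job-file hash and returns the log file path.
    def log_file_maker(job_hash):
        return os.path.join(logFilePath, "%s_%s.log" % (baseName, job_hash))
    return log_file_maker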
#print " configFileName = %s" % configFileName customizeConfigFile(sampleToAnalyze, jobNumber, version, inputFileNames_chunk, outputFileName, configFile_original, configFileName) logFileName = os.path.basename( configFileName.replace('_cfg.py', '.log')) #print " logFileName = %s" % logFileName def log_file_maker(job_hash): return os.path.join(logFilePath, logFileName) bsubId = "%s_%i" % (jobType, jobNumber) jobName, bsubScript = make_bsub_script( os.path.join(outputFilePath, outputFileName), input_files_and_jobs, log_file_maker, "cmsRun %s" % os.path.join(configFilePath, configFileName)) bsubFileNames[sampleToAnalyze][bsubId] = [outputFileName] bsubScriptFileName = os.path.join( configFilePath, logFileName.replace(".log", ".sh")) bsubScriptFile = open(bsubScriptFileName, "w") bsubScriptFile.write(bsubScript) bsubScriptFile.close() bsubScriptFileNames[sampleToAnalyze][bsubId] = bsubScriptFileName bsubJobName = "tauIdEffPATtuple_%s%s" % (sampleToAnalyze, bsubId) bsubJobNames[sampleToAnalyze][bsubId] = bsubJobName # create "master" shell script
# The None in the tuple indicates that batch job has no dependencies on other batch jobs
input_files_and_jobs = \
    [ (None, os.path.join(inputFilePath, version, channelToAnalyze, inputFileName)) \
      for inputFileName in retVal_SVfitEventHypothesisAnalyzer['inputFileNames'][i] ]

def log_file_maker(job_hash):
    log_fileName = os.path.join(logFilePath, retVal_SVfitEventHypothesisAnalyzer['logFileNames'][i])
    # CV: delete log-files from previous job submissions
    os.system("rm -f %s" % log_fileName)
    return log_fileName

# Build script for batch job submission
jobName, bsubScript = make_bsub_script(
    os.path.join(harvestingFilePath, retVal_SVfitEventHypothesisAnalyzer['outputFileNames'][i]),
    input_files_and_jobs,
    log_file_maker,
    "%s %s" % (executable_cmsRun,
               os.path.join(configFilePath_channel, retVal_SVfitEventHypothesisAnalyzer['configFileNames'][i])))

#print "configFilePath_channel = %s" % configFilePath_channel
#print "retVal_SVfitEventHypothesisAnalyzer['logFileNames'][i] = %s" % \
#  retVal_SVfitEventHypothesisAnalyzer['logFileNames'][i]

bsubScriptFileName = \
    os.path.join(configFilePath_channel, retVal_SVfitEventHypothesisAnalyzer['logFileNames'][i].replace(".log", ".sh"))
bsubScriptFile = open(bsubScriptFileName, "w")
bsubScriptFile.write(bsubScript)
bsubScriptFile.close()

time.sleep(0.100) # CV: wait for 100 milliseconds in order to avoid opening/closing too many files in too short time

fileNameEntry['bsubScriptFileNames'].append(bsubScriptFileName)
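# ---------------------------------------------------------------------------
# For reference: input_files_and_jobs is a list of (job_dependency, file_path)
# tuples. None means the file already exists and the batch job does not have to
# wait for another lxbatch job; a job name means "wait for that job to finish
# first", as in the (get_job_name(file), file) pattern used by the harvesting
# code further below. Illustrative sketch only -- the file and job names here
# are made up:

input_files_and_jobs_independent = [
    (None, "/castor/cern.ch/user/x/analysis/ntuple_1.root"),
    (None, "/castor/cern.ch/user/x/analysis/ntuple_2.root"),
]

input_files_and_jobs_dependent = [
    # this file is produced by the lxbatch job named 'harvest_layer0_job3',
    # so the downstream job must wait for it
    ("harvest_layer0_job3", "/castor/cern.ch/user/x/analysis/harvest_layer0_chunk3.root"),
]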
configFileName = os.path.join(
    configFilePath, configFile_original.replace("_cfg.py", "_%s%s_%s_cfg.py" % (sampleToAnalyze, jobType, jobNumber)))
#print " configFileName = %s" % configFileName

customizeConfigFile(sampleToAnalyze, jobNumber, version, inputFileNames_chunk, outputFileName, configFile_original, configFileName)

logFileName = os.path.basename(configFileName.replace('_cfg.py', '.log'))
#print " logFileName = %s" % logFileName
def log_file_maker(job_hash):
    return os.path.join(logFilePath, logFileName)

bsubId = "%s_%i" % (jobType, jobNumber)

jobName, bsubScript = make_bsub_script(
    os.path.join(outputFilePath, outputFileName),
    input_files_and_jobs,
    log_file_maker,
    "cmsRun %s" % os.path.join(configFilePath, configFileName))

bsubFileNames[sampleToAnalyze][bsubId] = [ outputFileName ]

bsubScriptFileName = os.path.join(configFilePath, logFileName.replace(".log", ".sh"))
bsubScriptFile = open(bsubScriptFileName, "w")
bsubScriptFile.write(bsubScript)
bsubScriptFile.close()
bsubScriptFileNames[sampleToAnalyze][bsubId] = bsubScriptFileName

bsubJobName = "tauIdEffPATtuple_%s%s" % (sampleToAnalyze, bsubId)
bsubJobNames[sampleToAnalyze][bsubId] = bsubJobName

# create "master" shell script
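# ---------------------------------------------------------------------------
# The "master" shell script referred to above is not shown in this excerpt.
# A minimal sketch of what it plausibly contains, based on the per-sample,
# per-job entries collected in bsubScriptFileNames (the helper name and the
# default queue are assumptions):

import os

def write_master_submit_script(masterFileName, bsubScriptFileNames, queue = "1nd"):
    # one "bsub ... < <script>.sh" line per sample and per job chunk
    with open(masterFileName, "w") as master:
        master.write("#!/bin/bash\n")
        for sampleToAnalyze in sorted(bsubScriptFileNames.keys()):
            for bsubId in sorted(bsubScriptFileNames[sampleToAnalyze].keys()):
                master.write("bsub -q %s < %s\n" % (queue, bsubScriptFileNames[sampleToAnalyze][bsubId]))
    os.chmod(masterFileName, 0o755)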
def submitAnalysisToLXBatch(configFile = None, channel = None, samples = None,
                            samplesToAnalyze = None, samplesToSkip = None,
                            disableFactorization = False,
                            disableSysUncertainties = False,
                            disableZrecoilCorrections = False,
                            script_directory = None,
                            cfgdir = 'lxbatch',
                            inputFileMap = None, outputFileMap = None,
                            outputDirectory = None,
                            queue = '1nd',
                            enableEventDumps = False,
                            enableFakeRates = False,
                            processName = None,
                            changeTauId = None,
                            saveFinalEvents = False,
                            jobExtention = ''):
    """
    Submit analysis jobs (event selection, filling of histograms)
    to the LXBatch batch system
    """

    # check that configFile, channel, samples and outputDirectory
    # parameters are defined and non-empty
    for param in ["configFile", "channel", "samples", "outputDirectory"]:
        if locals()[param] is None:
            raise ValueError("Undefined '%s' parameter!!" % param)

    jobId = reg.getJobId(channel)

    # If not specified take script directory from user preferences.
    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Make sure our output directory for the scripts is okay
    if not os.path.exists(script_directory):
        os.makedirs(script_directory)

    # Get all the files in our output directory that have non-zero size
    tmp_files = set(x['file'] for x in castor.nslsl(outputDirectory) if x['size'])

    # Keep track of the files we care about
    relevant_files = set([])

    submit_file_name = 'submit_lxbatch_analysis_' + jobId + '.sh'
    with open(submit_file_name, 'w') as submit_file:
        # Loop over the samples to be analyzed
        for sample in samples['SAMPLES_TO_ANALYZE']:
            write_comment_header(submit_file, " Sample: " + sample)
            # Skip submitting the job in case
            #  o a list of samples for which jobs are to be submitted has been
            #    explicitly specified
            #  o the sample has explicitly been requested to be skipped
            if samplesToAnalyze:
                if sample not in samplesToAnalyze:
                    print "Skipping", sample
                    continue
            if samplesToSkip:
                if sample in samplesToSkip:
                    print "Skipping", sample
                    continue

            sample_info = samples['RECO_SAMPLES'][sample]

            # Make job info
            jobInfo = {
                'channel' : channel,
                'sample'  : sample,
                'id'      : jobId
            }

            # Now build the scripts to feed to bsub
            # Find the input files
            input_files = list(inputFileMap(channel, sample, jobId))

            if len(input_files) > 0:
                print("Submitting %s in %i part(s)" % (sample, len(input_files)))
            else:
                print("No local input files for %s found !!" % sample)

            for job, file in enumerate(input_files):
                input_files = [ file ]

                # The None in the tuple indicates this file has no dependencies in
                # the batch job.
                input_files_and_jobs = [ (None, file) for file in input_files ]

                # Need to prepend file:, and strip off the directory since we
                # always have bsub rfcp the input files to the working
                # directory.
                input_files_for_cfgOptions = [ 'file:' + os.path.basename(file) for file in input_files ]

                output_file = outputFileMap(channel, sample, jobId)
                input_file_hash = jobtools.hash_files(input_files, add_time = False)
                # Add the hash of the input file so we know the provenance of all
                # files
                output_file = os.path.join(outputDirectory, output_file.replace(
                    '.root', '_' + str(job) + '_' + input_file_hash + '.root'))

                relevant_files.add(os.path.basename(output_file))

                # Uncomment to skip rerunning of old jobs
                #if os.path.basename(output_file) in tmp_files:
                #    print " done; skipping", output_file
                #    continue

                # First, prepare the configuration file
                newConfigFile = getNewConfigFileName(configFile, cfgdir, sample, jobId,
                                                     index = job, label = "@lxbatch")

                write_comment_header(submit_file, " cfg: " + newConfigFile)

                #--------------------------------------------------------------------
                # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
                jobCustomizations = []
                jobCustomizations.append("if hasattr(process, 'ntupleOutputModule'):")
                jobCustomizations.append("    process.ntupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'patTupleOutputModule'):")
                jobCustomizations.append("    process.patTupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
                jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % os.path.basename(output_file))
                HLTprocessName = 'HLT'
                if 'hlt' in samples['RECO_SAMPLES'][sample].keys():
                    HLTprocessName = samples['RECO_SAMPLES'][sample]['hlt'].getProcessName()
                jobCustomizations.append("if hasattr(process, 'hltMu'):")
                jobCustomizations.append("    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')" % HLTprocessName)
                jobCustomizations.append("if hasattr(process, 'patTrigger'):")
                jobCustomizations.append("    process.patTrigger.processName = '%s'" % HLTprocessName)
                jobCustomizations.append("if hasattr(process, 'patTriggerEvent'):")
                jobCustomizations.append("    process.patTriggerEvent.processName = '%s'" % HLTprocessName)
                if samples['RECO_SAMPLES'][sample]['type'] == 'Data':
                    jobCustomizations.append("if hasattr(process, 'prePatProductionSequence')"
                                             + " and hasattr(process, 'prePatProductionSequenceGen'):")
                    jobCustomizations.append("    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)")
                    jobCustomizations.append("if hasattr(process, 'ntupleProducer'):")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'tauGenJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')")
                jobCustomizations.append("if hasattr(process, 'patTriggerEventSequence') and hasattr(process, 'patTriggerSequence'):")
                jobCustomizations.append("    process.patDefaultSequence.replace(process.patTriggerEventSequence,")
                jobCustomizations.append("                                       process.patTriggerSequence + process.patTriggerEventSequence)")
                #jobCustomizations.append("print process.dumpPython()")
                #--------------------------------------------------------------------

                prepareConfigFile(
                    configFile = configFile, jobInfo = jobInfo, newConfigFile = newConfigFile,
                    sample_infos = samples,
                    disableFactorization = disableFactorization,
                    disableSysUncertainties = disableSysUncertainties,
                    disableZrecoilCorrections = disableZrecoilCorrections,
                    # We always copy the input files to the local directory
                    # before running cmsRun, so just take the basename
                    input_files = input_files_for_cfgOptions,
                    output_file = os.path.basename(output_file),
                    enableEventDumps = enableEventDumps,
                    enableFakeRates = enableFakeRates,
                    processName = processName,
                    saveFinalEvents = saveFinalEvents,
                    changeTauId = changeTauId,
                    customizations = jobCustomizations)

                # Build a function that constructs our log file name given the
                # job file hash.
                if not os.path.exists('lxbatch_log'):
                    os.makedirs('lxbatch_log')
                def log_file_maker(job_hash):
                    return os.path.join('lxbatch_log',
                                        "_".join(['run', channel, sample, jobId, job_hash]) + '.log')

                # Build our batch job
                jobname, script = jobtools.make_bsub_script(
                    output_file, input_files_and_jobs, log_file_maker,
                    "cmsRun %s" % newConfigFile, pass_io_files = False)

                bsub_script_file = os.path.join(
                    script_directory,
                    "_".join(['analyze' + jobExtention, sample, 'job', str(job), input_file_hash]) + '.sh')
                with open(bsub_script_file, 'w') as bsub_script:
                    bsub_script.write(script)

                # Add this bsub to our submission script
                submit_file.write("bsub -q %s < %s\n" % (queue, bsub_script_file))

    print len(tmp_files)
    garbage = tmp_files - relevant_files
    print len(garbage)
    if garbage:
        print "Found %i files not generated by this job!!" % len(garbage)
        print " You should really run:"
        print " cat ana_garbage.txt | xargs -n 1 -P 10 rfrm"
        with open('ana_garbage.txt', 'w') as garbage_script:
            for file in garbage:
                garbage_script.write('%s\n' % os.path.join(outputDirectory, file))

    print "Run ./%s to submit jobs" % submit_file_name
    os.chmod(submit_file_name, 0755)

    return submit_file_name
# The None in the tuple indicates that batch job has no dependencies on other batch jobs
input_files_and_jobs = \
    [ (None, os.path.join(inputFilePath, jobNameInRecoSampleDef, version, sampleToAnalyze,
                          retVal_FWLiteTauFakeRateAnalyzer['inputFileNames'][i])) ]

def log_file_maker(job_hash):
    log_fileName = os.path.join(logFilePath, retVal_FWLiteTauFakeRateAnalyzer['logFileNames'][i])
    # CV: delete log-files from previous job submissions
    os.system("rm -f %s" % log_fileName)
    return log_fileName

# Build script for batch job submission
jobName, bsubScript = make_bsub_script(
    os.path.join(harvestingFilePath, retVal_FWLiteTauFakeRateAnalyzer['outputFileNames'][i]),
    input_files_and_jobs,
    log_file_maker,
    "%s %s" % (executable_FWLiteTauFakeRateAnalyzer,
               os.path.join(configFilePath, retVal_FWLiteTauFakeRateAnalyzer['configFileNames'][i])))

#print "configFilePath = %s" % configFilePath
#print "retVal_FWLiteTauFakeRateAnalyzer['logFileNames'][i] = %s" % retVal_FWLiteTauFakeRateAnalyzer['logFileNames'][i]

bsubScriptFileName = \
    os.path.join(configFilePath, retVal_FWLiteTauFakeRateAnalyzer['logFileNames'][i].replace(".log", ".sh"))
bsubScriptFile = open(bsubScriptFileName, "w")
bsubScriptFile.write(bsubScript)
bsubScriptFile.close()

fileNames_FWLiteTauFakeRateAnalyzer[sampleToAnalyze][eventSelectionToAnalyze]['bsubScriptFileNames'].append(bsubScriptFileName)
def make_harvest_scripts(plot_regex, skim_regex,
                         channel = "",
                         sampleToAnalyze = "",
                         job_id = None,
                         # An iterable that gives the input files
                         input_source = None,
                         # Pass input_files_info explicitly in case the input files
                         # do not yet exist at the time when make_harvest_scripts is called
                         input_files_info = None,
                         # Allow switching between 'genericHarvester.py' and 'hadd'
                         harvester_command = _HARVESTER_CMD,
                         # Abort harvesting/merging scripts in case input files fail to get copied
                         abort_on_rfcp_error = True,
                         # Where to put the output
                         castor_output_directory = None,
                         script_directory = None,
                         merge_script_name = None,
                         local_copy_mapper = None,
                         chunk_size = 1e9, # 1 GB
                         max_input_files_per_chunk = 50,
                         run_harvesting = True,
                         run_merging = True,
                         check_old_files = True,
                         max_bsub_concurrent_file_access = 2000,
                         verbosity = 1):

    # Get the jobId from the user registry
    if job_id is None:
        job_id = reg.getJobId(channel)

    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Create the directory where we store the scripts if it doesn't exist
    if not os.path.exists(script_directory):
        os.mkdir(script_directory)

    if input_files_info is None:
        # Get all files with nonzero size in the input castor directory
        print "Getting files to harvest from input, input_source = %s" % input_source
        input_files_info = [ x for x in input_source if x['size'] ]
    #print "input_files_info = %s" % input_files_info

    print "Getting files from destination"
    # Get all the tmp files (that are non-zero)
    if castor_output_directory.find("/castor") != -1:
        tmp_files_info = [ x for x in castor_source(castor_output_directory) if x['size'] ]
    elif castor_output_directory.find("/store") != -1:
        tmp_files_info = [ x for x in eos_source(castor_output_directory) if x['size'] ]
    else:
        local_output_directory = None
        if castor_output_directory.find(":") != -1:
            local_output_directory = castor_output_directory[castor_output_directory.find(":") + 1:]
        else:
            local_output_directory = castor_output_directory
        tmp_files_info = []
        for x in os.listdir(local_output_directory):
            file_info = {
                'path'        : os.path.join(local_output_directory, os.path.basename(x)),
                'size'        : 1,               # dummy
                'time'        : time.localtime(),
                'file'        : os.path.basename(x),
                'permissions' : 'mrw-r--r--'     # "ordinary" file access permissions
            }
            tmp_files_info.append(file_info)
    tmp_files = set(x['file'] for x in tmp_files_info)
    #print "tmp_files_info = %s" % tmp_files_info

    # Make a repository of info about our files
    all_files_dict = {}
    for file_info in input_files_info + tmp_files_info:
        all_files_dict[file_info['path']] = file_info
    #print "all_files_dict = %s" % all_files_dict

    # Keep track of files that we put in tmp with these jobs and that we care
    # about. We can stop caring about old files if after adding new files (i.e.
    # crab jobs finishing), the merge block they were in contains a different
    # set of files.
    relevant_tmp_files = set([])

    # Keep track of the final harvested output
    final_harvest_files = []

    harvest_script_name = "_".join(['submit', channel, sampleToAnalyze, job_id, 'harvest']) + '.sh'
    harvest_log = open('_'.join(('harvest', channel, sampleToAnalyze, job_id, 'log')) + '.txt', 'a')

    # Keep track of the names of lxbatch jobs
    bsub_job_names = []

    if run_harvesting:
        # Select those that match our given regex for a sample
        plot_file_map = defaultdict(list)
        # Keep track of a hash of the input files for each sample
        plot_matcher = re.compile(plot_regex)
        plot_source_hashes = defaultdict(hashlib.md5)
        for file in input_files_info:
            match = plot_matcher.match(file['file'])
            if match:
                full_file = file['path']
                sample = None
                if sampleToAnalyze != "":
                    sample = sampleToAnalyze
                else:
                    sample = match.group('sample')
                plot_file_map[sample].append(full_file)
                plot_source_hashes[sample].update(full_file)

        harvest_log.write('=========== %s ================' % time.asctime())
        for sample in sorted(plot_source_hashes.keys()):
            # Write down the hash of the input files
            harvest_log.write(
                " ".join([channel, sample, job_id,
                          '<harvest>' 'input[%s]' % plot_source_hashes[sample].hexdigest()]) + '\n')

        submit_file = open(harvest_script_name, 'w')
        # Make the bsub scripts
        submit_file.write("#!/bin/bash\n")

        # Now build mergers for each of the samples
        # Count how many bsubs we have created, so we can put pauses in to
        # thwart the rate limit.
        bsub_file_access_counter = 0
        for sample in plot_file_map.keys():
            #if sample.find('NoPU') == -1:
            #    continue
            # Add helpful comments
            write_comment_header(submit_file, "Harvesting channel %s, sample %s" % (channel, sample))
            print " Building harvesting for channel %s, sample %s" % (channel, sample)
            print " -- Found %i files to harvest" % len(plot_file_map[sample])
            # Build merge tree. We add the source hash to the sample name.
            split = 6
            if 'PPmuX' in sample:
                print "High yield sample %s detected, setting split to 4" % sample
                split = 4
            merge_jobs = jobtools.make_merge_dependency_tree(
                "_".join([channel, sample, job_id]),
                plot_file_map[sample], local_output_directory, split = split)
            #print "merge_jobs = %s" % merge_jobs

            # Only do work that hasn't been done before. We can check and see
            # if the output of a given merge layer is already in the temp
            # directory. As the filenames contain a suffix with the hash of the
            # input file names, we can be sure that if a file is out of date we
            # will notice.
            merge_jobs_needed = []
            files_to_build = set([])
            print " --- Generated %i harvest layers:" % len(merge_jobs)
            for i, layer in enumerate(merge_jobs):
                # Figure out how many files we need to build
                layer_jobs_needed = []
                for layer_job in layer:
                    #print "layer_job = ", layer_job
                    # Check if we've already built this output file in the tmp
                    file_base_name = os.path.basename(layer_job[0])
                    needed = True
                    # Check if we are rebuilding a dependency
                    building_a_dependency = any(file in files_to_build for file in layer_job[1])
                    if not building_a_dependency and file_base_name in tmp_files:
                        output_m_time = all_files_dict[layer_job[0]]['time']
                        out_of_date = False
                        for input_file in layer_job[1]:
                            #print "input_file = ", input_file
                            if not input_file in all_files_dict.keys() or all_files_dict[input_file]['time'] > output_m_time:
                                print "File: %s is older than its dependency %s, rebuilding!" % (file_base_name, input_file)
                                # Check if it's out of date
                                out_of_date = True
                                break
                        if not out_of_date:
                            needed = False
                    if needed:
                        layer_jobs_needed.append(layer_job)
                    # Keep track of the relevant files, so we can delete old
                    # cruft
                    relevant_tmp_files.add(file_base_name)
                    # Check if this is the final output layer
                    if len(layer) == 1:
                        final_harvest_files.append((sample, layer[0][0]))
                print " ---- layer %i has %i jobs, of which %i not done" % (
                    i, len(layer), len(layer_jobs_needed))
                merge_jobs_needed.append(layer_jobs_needed)

            # Keep track of what jobId was used for a particular output file
            job_registry = {}

            # If a file is not produced by a job (already exists in CASTOR/EOS),
            # then the job ID returned is None.
            get_job_name = lambda x : x in job_registry and job_registry[x] or None

            for ilayer, layer in enumerate(merge_jobs_needed):
                write_comment_header(submit_file, "Layer %i" % ilayer)
                submit_file.write("echo Submitting layer %i of channel %s, sample %s\n" % (ilayer, channel, sample))
                for ijob, (output_file, input_files) in enumerate(layer):
                    # Get the job name (if it exists) and file name for the
                    # input files.
                    input_files_and_jobs = [ (get_job_name(file), file) for file in input_files ]

                    # Build a function that constructs our log file name given the
                    # job file hash.
                    if not os.path.exists('lxbatch_log'):
                        os.makedirs('lxbatch_log')
                    def log_file_maker(job_hash):
                        log_fileName = os.path.join(
                            'lxbatch_log', "_".join(['harvest', job_hash, 'layer_%i' % ilayer]) + '.log')
                        # CV: delete log-files from previous job submissions
                        os.system("rm -f %s" % log_fileName)
                        return log_fileName

                    # Build the script
                    job_name, script = jobtools.make_bsub_script(
                        output_file, input_files_and_jobs,
                        log_file_maker,
                        harvester_command,
                        abort_on_rfcp_error = abort_on_rfcp_error,
                        label = "_".join(["harvest", channel, sample, "layer", str(ilayer), "job", str(ijob)]))
                    #print "job_name = %s" % job_name
                    bsub_job_names.append(job_name)

                    # Register our job
                    #print "--> registering channel %s, sample %s, jobId %s" % (channel, sample, job_id)
                    #print " script_directory = %s" % script_directory
                    job_registry[output_file] = job_name
                    #print " job_registry[%s] = %s" % (output_file, job_registry[output_file])
                    script_file = os.path.join(
                        script_directory,
                        "_".join(["harvest", channel, sample, job_id, "layer", str(ilayer), "job", str(ijob)]) + ".sh")
                    submit_file.write("bsub < %s\n" % script_file)

                    # Keep track of how many files we access
                    bsub_file_access_counter += split
                    if bsub_file_access_counter > max_bsub_concurrent_file_access:
                        bsub_file_access_counter = 0
                        submit_file.write("# thwart rate limit\n")
                        submit_file.write("echo Sleeping for 500 seconds, it is now:\n")
                        submit_file.write("date\n")
                        submit_file.write("sleep 500\n")

                    with open(script_file, 'w') as script_file:
                        script_file.write(script)

        submit_file.close()

    if run_merging:
        # Now build skim merging file
        # Select those that match our given regex for a sample
        skim_file_map = defaultdict(list)
        skim_fileinhash_map = defaultdict(hashlib.md5)
        skim_fileouthash_map = defaultdict(hashlib.md5)
        skim_matcher = re.compile(skim_regex)
        for file in input_files_info:
            match = skim_matcher.match(file['file'])
            if match:
                full_file = file['path']
                # Parse the sample from the regex
                sample = None
                if sampleToAnalyze != "":
                    sample = sampleToAnalyze
                else:
                    sample = match.group('sample')
                # For the skims, keep track of the file size as well, since we use it
                # to group the jobs.
                skim_file_map[sample].append((file['time'], file['size'], full_file))
                # Keep track of the hash of all input files so we know what went
                # into our output
                skim_fileinhash_map[sample].update(full_file)

        def make_skim_name(sample, chunk, hash):
            " Generate a nice name for an output skim "
            return "_".join(["skim", sample, "chunk", str(chunk), hash]) + ".root"

        if merge_script_name is None:
            merge_script_name = "_".join(['submit', channel, sampleToAnalyze, job_id, 'merge']) + '.sh'
        with open(merge_script_name, 'w') as merge_script:
            merge_jobs_counter = 0
            bsub_file_access_counter = 0
            for sample in skim_file_map.keys():
                write_comment_header(merge_script, " Merging " + sample)
                print "Merging channel %s, sample %s" % (channel, sample)
                files = skim_file_map[sample]
                num_files = len(files)
                total_file_size = sum(map(lambda x: x[1], files)) / 1e6
                # Divide the job up into chunks that are about 1 GB each in size
                chunks = list(jobtools.split(files, chunk_size, max_input_files_per_chunk, lambda x: x[1]))
                print " Total sample size: %i files, %i MB - splitting into %i chunks" % (
                    num_files, total_file_size, len(chunks)),
                # Keep track of jobs we are actually running
                skim_merge_jobs = []
                for ichunk, input_files in enumerate(chunks):
                    # Figure out the name for our file. It contains a hash of its
                    # inputs. We don't add the time, as we never have any LXBatch
                    # job dependencies.
                    just_the_files = [ x[2] for x in input_files ]
                    output_file = make_skim_name(
                        "_".join([channel, sample, job_id]), ichunk,
                        jobtools.hash_files(just_the_files, add_time = False))
                    skim_fileouthash_map[sample].update(output_file)
                    relevant_tmp_files.add(output_file)
                    if output_file not in tmp_files:
                        output_file_full_path = os.path.join(castor_output_directory, output_file)
                        # Add "None" as the job id of the input files to indicate we
                        # don't care about any dependencies. The index on x takes
                        # out only the filename, not the size or the time.
                        skim_merge_jobs.append(
                            (output_file_full_path, map(lambda x: (None, x[2]), input_files)))
                print " -- %i chunks are already done, skipping" % (len(chunks) - len(skim_merge_jobs))

                for ijob, (output_file, input_files) in enumerate(skim_merge_jobs):
                    def merge_log_file_maker(job_hash):
                        log_fileName = os.path.join(
                            'lxbatch_log', "_".join(['merge', job_hash, 'job_%i' % ijob]) + '.log')
                        # CV: delete log-files from previous job submissions
                        os.system("rm -f %s" % log_fileName)
                        return log_fileName

                    # Generate script contents
                    job_name, script = jobtools.make_bsub_script(
                        output_file, input_files,
                        merge_log_file_maker,
                        _MERGER_CMD,
                        abort_on_rfcp_error = abort_on_rfcp_error,
                        label = "_".join(["merge", channel, sample, "chunk", str(ijob)]))

                    script_file = os.path.join(
                        script_directory,
                        "_".join(["merge", channel, sample, job_id, "chunk", str(ijob)]) + ".sh")
                    # Add our bsub command
                    merge_script.write("bsub < %s\n" % script_file)
                    merge_jobs_counter += 1

                    bsub_file_access_counter += len(input_files)
                    if bsub_file_access_counter > max_bsub_concurrent_file_access:
                        bsub_file_access_counter = 0
                        merge_script.write("# thwart rate limit\n")
                        merge_script.write("echo Sleeping for 500 seconds, it is now:\n")
                        merge_script.write('date\n')
                        merge_script.write("sleep 500\n")

                    with open(script_file, 'w') as script_file:
                        script_file.write(script)

        print " Built %i merge jobs in %s" % (merge_jobs_counter, merge_script_name)

        for sample in sorted(skim_fileinhash_map.keys()):
            # Write down the hash of the input files
            harvest_log.write(
                " ".join([job_id, sample,
                          '<merge>' 'input[%s]' % skim_fileinhash_map[sample].hexdigest(),
                          'output[%s]' % skim_fileouthash_map[sample].hexdigest(),
                          ]) + '\n')

    if check_old_files:
        # Compute all the extra tmp files
        extra_crap = tmp_files - relevant_tmp_files
        if extra_crap:
            print " Found %i extra files from previous harvest jobs" % len(extra_crap)
            print " Writing these files to file garbage.txt. To delete, please run: "
            print " cat garbage.txt | xargs -P 10 -n 1 rfrm "
            with open('garbage.txt', 'w') as garbage_file:
                for file in extra_crap:
                    full_path = os.path.join(castor_output_directory, file)
                    garbage_file.write(full_path + '\n')

    local_copy_script = 'copy_harvest_local_%s.sh' % job_id
    with open(local_copy_script, 'w') as copy_script:
        for sample, file in final_harvest_files:
            copy_script.write('rfcp %s %s &\n' % (file, local_copy_mapper(sample)))
        copy_script.write('wait\n')

    if verbosity > 0:
        if run_harvesting:
            print "To harvest plots, run %s" % harvest_script_name
        if run_merging:
            print "To merge skims, run %s" % merge_script_name
        print "After harvesting is done, run %s to copy files locally" % local_copy_script

    # Make all our stuff executable
    os.chmod(local_copy_script, 0755)
    if run_harvesting:
        os.chmod(harvest_script_name, 0755)
    if run_merging:
        os.chmod(merge_script_name, 0755)

    # Return name of scripts
    retVal = {}
    retVal['harvest_script_name'] = harvest_script_name
    retVal['bsub_job_names']      = bsub_job_names
    retVal['final_harvest_files'] = final_harvest_files
    retVal['merge_script_name']   = merge_script_name
    retVal['local_copy_script']   = local_copy_script
    return retVal
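# ---------------------------------------------------------------------------
# Hedged usage sketch of make_harvest_scripts: the regexes, CASTOR paths and the
# local_copy_mapper below are placeholders, and castor_source is assumed to yield
# dicts with 'file', 'path', 'size' and 'time' keys as in the function body.
# The returned dictionary exposes the names of the scripts written above.

harvest_info = make_harvest_scripts(
    plot_regex = r"plots_(?P<sample>\w+)_.*\.root",
    skim_regex = r"skim_(?P<sample>\w+)_.*\.root",
    channel = "ZtoMuTau",
    input_source = castor_source("/castor/cern.ch/user/x/analysis/plots"),
    castor_output_directory = "/castor/cern.ch/user/x/analysis/harvested",
    local_copy_mapper = lambda sample: "/tmp/harvested_%s.root" % sample)

print(harvest_info['harvest_script_name'])   # submit_<channel>_..._harvest.sh
print(harvest_info['merge_script_name'])     # submit_<channel>_..._merge.sh
print(harvest_info['local_copy_script'])     # copy_harvest_local_<job_id>.sh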