def generate_sbatch_lines(
    executable,
    command_line_parameters,
    input_file_names,
    output_file_names,
    script_file_names,
    log_file_names,
    working_dir,
    max_num_jobs,
    cvmfs_error_log=None,
    pool_id='',
    cmssw_base_dir=None,
    verbose=False,
    job_template_file='sbatch-node.sh.template',
    dry_run=False,
    validate_outputs=True,
    min_file_size=20000,
    use_home=True,
):
    if not pool_id:
        raise ValueError('pool_id is empty')
    lines_sbatch = [
        "from tthAnalysis.HiggsToTauTau.sbatchManager import sbatchManager",
        "",
        "m = sbatchManager('%s', verbose = %s, dry_run = %s, use_home = %s, min_file_size = %d)" % \
          (pool_id, verbose, dry_run, use_home, min_file_size),
        "m.setWorkingDir('%s')" % working_dir,
        "m.setcmssw_base_dir('%s')" % cmssw_base_dir,
        "m.log_completion = %s" % verbose,
    ]
    num_jobs = 0
    for key_file, command_line_parameter in command_line_parameters.items():
        log_file_name = None
        if log_file_names:
            log_file_name = log_file_names[key_file]
        if num_jobs <= max_num_jobs or max_num_jobs <= 0:
            sbatch_line = generate_sbatch_line(
                executable=executable,
                command_line_parameter=command_line_parameter,
                input_file_names=input_file_names[key_file],
                output_file_name=output_file_names[key_file],
                script_file_name=script_file_names[key_file],
                log_file_name=log_file_name,
                cvmfs_error_log=cvmfs_error_log,
                job_template_file=job_template_file,
                validate_outputs=validate_outputs,
                min_file_size=min_file_size,
            )
            if sbatch_line:
                lines_sbatch.append(sbatch_line)
                num_jobs += 1
    if max_num_jobs > 0 and num_jobs > max_num_jobs:
        logging.warning(
            "number of jobs = %i exceeds limit of %i --> skipping submission of %i jobs !!" % \
            (num_jobs, max_num_jobs, num_jobs - max_num_jobs)
        )
    lines_sbatch.append("m.waitForJobs()")
    return lines_sbatch, num_jobs
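# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how the lines returned by
# generate_sbatch_lines() might be turned into an executable submission
# driver.  All file names, the pool id and the single-job dictionaries below
# are hypothetical placeholders, not values used by the package.
#
#   lines, num_jobs = generate_sbatch_lines(
#       executable              = 'analyze_example',
#       command_line_parameters = { 'job1' : '--era 2017' },
#       input_file_names        = { 'job1' : ['/local/ntuples/tree_1.root'] },
#       output_file_names       = { 'job1' : '/local/histograms/out_1.root' },
#       script_file_names       = { 'job1' : '/local/cfgs/job_1.sh' },
#       log_file_names          = { 'job1' : '/local/logs/job_1.log' },
#       working_dir             = '/local/work',
#       max_num_jobs            = 100,
#       pool_id                 = 'example_pool',
#   )
#   with open('sbatch_driver.py', 'w') as driver:
#       driver.write('\n'.join(lines) + '\n')
#   # running 'python sbatch_driver.py' then submits the jobs and blocks in
#   # m.waitForJobs() until they finish
# ---------------------------------------------------------------------------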
def submit(self, cmd_str):
    nof_max_retries = 10
    current_retry = 0
    job_id = None
    while current_retry < nof_max_retries:
        # Run command
        cmd_outerr = run_cmd(cmd_str, return_stderr=True)
        try:
            job_id = cmd_outerr[0].split()[-1]
            break
        except IndexError:
            # Fails if the stdout returned by the last line is empty
            logging.warning("Caught an error: '%s'; resubmitting %i-th time" % (cmd_outerr[1], current_retry))
            current_retry += 1
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)  # wait for 60 seconds until the next resubmission
    if job_id is None:
        raise RuntimeError("Could not submit the job even after %i retries: %s" % (nof_max_retries, cmd_str))
    # The job ID must be a number, so we have to check whether it really is one
    try:
        int(job_id)
    except ValueError:
        raise ValueError("job_id = '%s' is not a number; sbatch stdout = '%s'; sbatch stderr = '%s'" % \
                         (job_id, cmd_outerr[0], cmd_outerr[1]))
    if job_id in self.submittedJobs:
        raise RuntimeError("Same job ID: %s" % job_id)
    # job_id is a valid job ID
    return job_id
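# ---------------------------------------------------------------------------
# Why submit() takes the last stdout token: Slurm's sbatch reports successful
# submissions as 'Submitted batch job <id>'.  A self-contained check of that
# parsing step (the sample string below is made up for illustration):

def _parse_sbatch_stdout(stdout):
    # Mirrors the extraction in submit(); raises IndexError on empty output,
    # which submit() treats as a transient failure worth retrying.
    return stdout.split()[-1]

assert _parse_sbatch_stdout('Submitted batch job 123456') == '123456'
# ---------------------------------------------------------------------------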
def check_submission_cmd(submission_out, submission_cmd, throw=False):
    earlier_submission_out = find_earlier_version(submission_out)
    current_submission = ' '.join(submission_cmd) if submission_cmd else str(submission_cmd)
    current_submission_stripped = current_submission.replace(' -A', '').replace(' -E', '')
    if earlier_submission_out:
        with open(earlier_submission_out, 'r') as earlier_submission_file:
            lines = []
            for line in earlier_submission_file:
                lines.append(line.rstrip('\n'))
        assert(len(lines) == 1)
        previous_submission = lines[0]
        previous_submission_stripped = previous_submission.replace(' -A', '').replace(' -E', '')
        if previous_submission_stripped != current_submission_stripped:
            logging.warning(
                "Current command ('{}') does not match the previously run command ('{}')".format(
                    current_submission, previous_submission
                )
            )
            if throw:
                sys.exit(1)
            do_run = query_yes_no("Sure you want to resubmit with a different command?", default="no")
            if not do_run:
                logging.info('Exiting')
                sys.exit(0)
    with open(submission_out, 'w') as submission_out_file:
        submission_out_file.write('{}\n'.format(current_submission))
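# ---------------------------------------------------------------------------
# query_yes_no() is defined elsewhere in the package; judging by its use in
# check_submission_cmd() it prompts until the user answers yes or no, with an
# empty answer selecting the default.  A minimal sketch under that assumption
# (not the package's actual implementation):

def query_yes_no_sketch(question, default="no"):
    valid = { "yes" : True, "y" : True, "no" : False, "n" : False }
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        choice = input(question + prompt).strip().lower()
        if not choice and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
# ---------------------------------------------------------------------------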
def get_dir_entries(self, path_obj):
    if not os.path.isdir(path_obj.name_fuse):
        raise hdfsException("No such path: %s" % path_obj.name_fuse)
    entries = []
    try:
        entries = [
            nohdfs.info(os.path.join(path_obj.name_fuse, entry))
            for entry in os.listdir(path_obj.name_fuse)
        ]
    except OSError as err:
        if err.errno == errno.EAGAIN:
            logging.warning('Could not access path %s because: %s' % (path_obj.name_fuse, err))
            entries = []
    except Exception as err:
        raise ValueError('Could not access path %s because: %s' % (path_obj.name_fuse, err))
    return entries
def get_ratios(wobtag_count, wbtag_count, wobtag_label, wbtag_label):
    ratios = []
    for bin_idx in range(len(wobtag_count)):
        wobtag_count_bin_idx = wobtag_count[bin_idx]
        wbtag_count_bin_idx = wbtag_count[bin_idx]
        ratio = 1.
        if wobtag_count_bin_idx == 0. and wbtag_count_bin_idx == 0.:
            pass
        elif wbtag_count_bin_idx == 0.:
            if abs(wobtag_count_bin_idx) > 1e-2:
                raise RuntimeError(
                    'Found bin idx %d in histogram %s with zero events but %.2f events in histogram %s' % \
                    (bin_idx, wbtag_label, wobtag_count_bin_idx, wobtag_label)
                )
            else:
                ratio = 1.
        elif wobtag_count_bin_idx == 0.:
            if abs(wbtag_count_bin_idx) > 1e-2:
                raise RuntimeError(
                    'Found bin idx %d in histogram %s with zero events but %.2f events in histogram %s' % \
                    (bin_idx, wobtag_label, wbtag_count_bin_idx, wbtag_label)
                )
            else:
                ratio = 1.
        else:
            ratio = wobtag_count_bin_idx / wbtag_count_bin_idx
            if ratio < 0.:
                logging.warning(
                    'Found event sums with opposite sign at bin {} in histograms {} and {}: {:.2f} and {:.2f} '
                    '-> setting ratio to 1 instead'.format(
                        bin_idx, wobtag_label, wbtag_label, wobtag_count_bin_idx, wbtag_count_bin_idx
                    )
                )
                ratio = 1.
        ratios.append(ratio)
    return ratios
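# ---------------------------------------------------------------------------
# Usage sketch with toy numbers: get_ratios() divides the per-bin counts of
# the histogram without b-tagging by those with b-tagging, falling back to a
# ratio of 1 for bins that are empty on both sides (the labels are only used
# in diagnostics):
#
#   >>> get_ratios([4., 0., 9.], [2., 0., 3.], 'without_btag', 'with_btag')
#   [2.0, 1.0, 3.0]
# ---------------------------------------------------------------------------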
def get_rles(input_paths, whitelist, blacklist, read_all_systematics):
    has_errors = False
    rles = collections.OrderedDict()
    valid_paths = get_paths(input_paths, whitelist, blacklist)
    for channel_name, channel_dir in valid_paths.items():
        rles[channel_name] = collections.OrderedDict()
        for region_dir in sorted(hdfs.listdir(channel_dir)):
            region_name = os.path.basename(region_dir)
            logging.debug('Found region {} in channel {}'.format(region_name, channel_name))
            rles[channel_name][region_name] = collections.OrderedDict()
            for sample_dir in sorted(hdfs.listdir(region_dir)):
                sample_name = os.path.basename(sample_dir)
                if sample_name in SAMPLES_EXCLUDE:
                    continue
                logging.debug('Found sample {} in region {} and channel {}'.format(sample_name, region_name, channel_name))
                rles[channel_name][region_name][sample_name] = collections.OrderedDict()
                for rle_dir in sorted(hdfs.listdir(sample_dir)):
                    central_or_shift = os.path.basename(rle_dir)
                    if central_or_shift in SYSTEMATICS_EXCLUDE:
                        continue
                    if not read_all_systematics and central_or_shift != SYSTEMATICS_CENTRAL:
                        continue
                    logging.debug(
                        'Found systematics {} for sample {} in region {} and channel {}'.format(
                            central_or_shift, sample_name, region_name, channel_name
                        )
                    )
                    rles[channel_name][region_name][sample_name][central_or_shift] = []
                    rle_filenames = sorted(hdfs.listdir(rle_dir))
                    if not rle_filenames:
                        logging.warning('Directory {} is empty'.format(rle_dir))
                        continue
                    rle_arr = []
                    for rle_filename in rle_filenames:
                        if not rle_filename.endswith('.txt'):
                            raise RuntimeError("Unexpected extension in file: %s" % rle_filename)
                        with open(rle_filename, 'r') as rle_file:
                            for line in rle_file:
                                line_stripped = line.rstrip('\n')
                                if not REGEX_RLE.match(line_stripped):
                                    raise RuntimeError("Unexpected line found in %s: %s" % (rle_filename, line_stripped))
                                rle = line_stripped
                                if rle in rle_arr:
                                    logging.error(
                                        "Duplicate event %s found in channel %s, region %s, sample %s, systematics %s" % \
                                        (rle, channel_name, region_name, sample_name, central_or_shift)
                                    )
                                    has_errors = True
                                    continue
                                rle_arr.append(rle)
                    logging.debug(
                        'Found {} events in sample {}, region {}, systematics {}, channel {}'.format(
                            len(rle_arr), sample_name, region_name, central_or_shift, channel_name
                        )
                    )
                    rles[channel_name][region_name][sample_name][central_or_shift].extend(rle_arr)
    return rles, has_errors
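# ---------------------------------------------------------------------------
# REGEX_RLE is defined elsewhere in the module.  The RLE lines validated by
# get_rles() follow the CMS 'run:lumi:event' convention, so the pattern is
# presumably along these lines (this exact regex is an assumption, not the
# module's definition):

import re

REGEX_RLE_SKETCH = re.compile(r'^\d+:\d+:\d+$')

assert REGEX_RLE_SKETCH.match('1:2345:67890123')    # well-formed RLE line
assert not REGEX_RLE_SKETCH.match('1:2345')         # missing the event field
# ---------------------------------------------------------------------------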
def create(self):
    """Creates all necessary config files and runs the reference genWeight production -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info['use_it']:
            continue
        process_name = sample_info["process_name_specific"]
        is_mc = (sample_info["type"] == "mc")
        if not is_mc:
            continue
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable, process_name))
        inputFileList_map = generateInputFileList(sample_info, 1)
        key_dir = getKey(process_name)
        key_file = getKey(process_name)
        self.inputFiles[key_file] = list(itertools.chain(*inputFileList_map.values()))
        if len(self.inputFiles[key_file]) == 0:
            logging.warning("'%s' = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]))
            continue
        outputFile = os.path.join(self.dirs[key_dir][DKEY_RESULTS], "%s.txt" % process_name)
        self.outputFiles[key_file] = outputFile
        if os.path.isfile(outputFile):
            logging.info('File {} already exists --> skipping job'.format(outputFile))
            continue
        self.cfgFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "refGenWeight_%s_cfg.txt" % process_name)
        self.logFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "refGenWeight_%s.log" % process_name)
        self.scriptFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "refGenWeight_%s_cfg.sh" % process_name)
        self.plotFiles[key_file] = ' '.join([
            os.path.join(self.dirs[key_dir][DKEY_PLOTS], "refGenWeight_%s.%s" % (process_name, extension))
            for extension in ['pdf', 'png']
        ])
        self.jobOptions_sbatch[key_file] = {
            'inputFiles': self.inputFiles[key_file],
            'cfgFile_path': self.cfgFiles[key_file],
            'cmdParams': "-i {} -o {} -p {} -v".format(
                self.cfgFiles[key_file],
                self.outputFiles[key_file],
                self.plotFiles[key_file],
            ),
            'outputFile': self.outputFiles[key_file],
            'logFile': self.logFiles[key_file],
            'scriptFile': self.scriptFiles[key_file],
        }
        self.createCfg(self.jobOptions_sbatch[key_file])
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable)
        self.num_jobs['refGenWeight'] += self.createScript_sbatch(self.executable, self.sbatchFile, self.jobOptions_sbatch)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile(lines_makefile)
    self.addToMakefile_final(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
    return self.num_jobs
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        sample_category = sample_info["sample_category"]
        is_mc = (sample_info["type"] == "mc")
        logging.info("Building dictionaries for sample %s..." % process_name)
        for charge_selection in self.charge_selections:
            central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
            central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts
            for central_or_shift_or_dummy in central_or_shifts_extended:
                process_name_extended = [ process_name, "hadd" ]
                for process_name_or_dummy in process_name_extended:
                    if central_or_shift_or_dummy in [ "hadd" ] and process_name_or_dummy in [ "hadd" ]:
                        continue
                    if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions:
                        if not is_mc:
                            continue
                        if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info):
                            continue
                    key_dir = getKey(process_name_or_dummy, charge_selection, central_or_shift_or_dummy)
                    for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]:
                        initDict(self.dirs, [ key_dir, dir_type ])
                        if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                            self.dirs[key_dir][dir_type] = os.path.join(
                                self.configDir, dir_type, self.channel,
                                "_".join([ charge_selection ]), process_name_or_dummy, central_or_shift_or_dummy)
                        else:
                            self.dirs[key_dir][dir_type] = os.path.join(
                                self.outputDir, dir_type, self.channel,
                                "_".join([ charge_selection ]), process_name_or_dummy)
    for subdirectory in [ "comp_jetToTauFakeRate", "makePlots" ]:
        key_dir = getKey(subdirectory)
        for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
            initDict(self.dirs, [ key_dir, dir_type ])
            if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
        initDict(self.dirs, [ dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    numDirectories = 0
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            numDirectories += len(self.dirs[key])
        else:
            numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
            numDirectories_created += len(self.dirs[key])
        else:
            create_if_not_exists(self.dirs[key])
            numDirectories_created += 1
        while 100 * numDirectories_created >= frac * numDirectories:
            logging.info(" %i%% completed" % frac)
            frac += 1
    logging.info("Done.")
    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        inputFileList = inputFileLists[sample_name]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        is_mc = (sample_info["type"] == "mc")
        sample_category = sample_info["sample_category"]
        for charge_selection in self.charge_selections:
            for central_or_shift in self.central_or_shifts:
                if central_or_shift != "central" and not is_mc:
                    continue
                if not self.accept_central_or_shift(central_or_shift, sample_info):
                    continue
                # build config files for executing analysis code
                key_analyze_dir = getKey(process_name, charge_selection, central_or_shift)
                for jobId in inputFileList.keys():
                    analyze_job_tuple = (process_name, charge_selection, central_or_shift, jobId)
                    key_analyze_job = getKey(*analyze_job_tuple)
                    ntupleFiles = inputFileList[jobId]
                    if len(ntupleFiles) == 0:
                        logging.warning("No input ntuples for %s --> skipping job !!" % key_analyze_job)
                        continue
                    cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
                    logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple)
                    histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple)
                    rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                         if self.select_rle_output else ""
                    self.jobOptions_analyze[key_analyze_job] = {
                        'ntupleFiles'                 : ntupleFiles,
                        'cfgFile_modified'            : cfgFile_modified_path,
                        'histogramFile'               : histogramFile_path,
                        'logFile'                     : logFile_path,
                        'chargeSelection'             : charge_selection,
                        'jet_minPt'                   : self.jet_minPt,
                        'jet_maxPt'                   : self.jet_maxPt,
                        'jet_minAbsEta'               : self.jet_minAbsEta,
                        'jet_maxAbsEta'               : self.jet_maxAbsEta,
                        'hadTau_selection_tight'      : self.hadTau_selection_tight,
                        'hadTauSelection_denominator' : self.hadTau_selection_denominator,
                        'hadTauSelections_numerator'  : self.hadTau_selections_numerator,
                        'trigMatchingOptions'         : self.trigMatchingOptions,
                        'selEventsFileName_output'    : rleOutputFile_path,
                        'absEtaBins'                  : self.absEtaBins,
                        'decayModes'                  : self.decayModes,
                        'central_or_shift'            : central_or_shift,
                        'central_or_shifts_local'     : [],
                        'apply_hlt_filter'            : self.hlt_filter,
                    }
                    self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)
                    # initialize input and output file names for hadd_stage1
                    key_hadd_stage1_dir = getKey(process_name, charge_selection)
                    hadd_stage1_job_tuple = (process_name, charge_selection)
                    key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
                    if not key_hadd_stage1_job in self.inputFiles_hadd_stage1:
                        self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                    self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                    self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(
                        self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple)
            # initialize input and output file names for hadd_stage2
            key_hadd_stage1_job = getKey(process_name, charge_selection)
            key_hadd_stage2_dir = getKey("hadd", charge_selection)
            key_hadd_stage2_job = getKey(charge_selection)
            if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
                self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
            self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(
                self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s.root" % charge_selection)
    logging.info("Creating configuration files for executing 'comp_jetToTauFakeRate'")
    for charge_selection in self.charge_selections:
        charge_key = "comp_%s" % charge_selection
        self.comp_input_files[charge_key] = []
        for trigMatchingOption in self.trigMatchingOptions:
            key_hadd_stage2_job = getKey(charge_selection)
            key_comp_jetToTauFakeRate_dir = getKey("comp_jetToTauFakeRate")
            key_comp_jetToTauFakeRate_job = getKey(charge_selection, trigMatchingOption)
            self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job] = {
                'inputFile'            : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                'cfgFile_modified'     : os.path.join(self.dirs[DKEY_CFGS], "comp_jetToTauFakeRate_%s_%s_cfg.py" % (charge_selection, trigMatchingOption)),
                'outputFile'           : os.path.join(self.dirs[DKEY_HIST], "comp_jetToTauFakeRate_%s_%s.root" % (charge_selection, trigMatchingOption)),
                'logFile'              : os.path.join(self.dirs[DKEY_LOGS], "comp_jetToTauFakeRate_%s_%s.log" % (charge_selection, trigMatchingOption)),
                'looseRegion'          : "jetToTauFakeRate_%s_%s/denominator/" % (charge_selection, trigMatchingOption),
                'tightRegion'          : "jetToTauFakeRate_%s_%s/numerator/" % (charge_selection, trigMatchingOption),
                'absEtaBins'           : self.absEtaBins,
                'ptBins'               : self.ptBins,
                'decayModes'           : self.decayModes,
                'hadTauSelections'     : self.hadTau_selections_numerator,
                'trigMatchingOption'   : trigMatchingOption,
                'plots_outputFileName' : os.path.join(self.dirs[key_comp_jetToTauFakeRate_dir][DKEY_PLOT], "comp_jetToTauFakeRate_%s.png" % trigMatchingOption)
            }
            self.createCfg_comp_jetToTauFakeRate(self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job])
            comp_output = self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job]['outputFile']
            self.targets.append(comp_output)
            self.comp_input_files[charge_key].append(comp_output)
        self.comp_output_files[charge_key] = os.path.join(self.dirs[DKEY_HIST], "comp_jetToTauFakeRate_%s.root" % charge_selection)
    logging.info("Creating configuration files to run 'makePlots'")
    for charge_selection in self.charge_selections:
        key_hadd_stage2_job = getKey(charge_selection)
        key_makePlots_dir = getKey("makePlots")
        key_makePlots_job = getKey(charge_selection)
        self.jobOptions_make_plots[key_makePlots_job] = {
            'executable'              : self.executable_make_plots,
            'inputFile'               : self.outputFile_hadd_stage2[key_hadd_stage2_job],
            'cfgFile_modified'        : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
            'outputFile'              : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
            'histogramDir'            : "jetToTauFakeRate_%s" % charge_selection,
            'label'                   : None,
            'make_plots_backgrounds'  : self.make_plots_backgrounds
        }
        self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
        for trigMatchingOption in self.trigMatchingOptions:
            self.cfgFile_make_plots = self.cfgFile_make_plots_denominator
            for absEtaBin in [ "absEtaLt1_5", "absEta1_5to9_9" ]:
                key_hadd_stage2_job = getKey(charge_selection)
                key_makePlots_job = getKey(charge_selection, trigMatchingOption, absEtaBin, "denominator")
                self.jobOptions_make_plots[key_makePlots_job] = {
                    'executable'             : self.executable_make_plots,
                    'inputFile'              : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                    'cfgFile_modified'       : os.path.join(
                        self.dirs[key_makePlots_dir][DKEY_CFGS],
                        "makePlots_%s_%s_%s_denominator_%s_cfg.py" % (self.channel, charge_selection, trigMatchingOption, absEtaBin)),
                    'outputFile'             : os.path.join(
                        self.dirs[key_makePlots_dir][DKEY_PLOT],
                        "makePlots_%s_%s_%s_denominator_%s.png" % (self.channel, charge_selection, trigMatchingOption, absEtaBin)),
                    'histogramDir'           : "jetToTauFakeRate_%s_%s/denominator/%s" % (charge_selection, trigMatchingOption, absEtaBin),
                    'label'                  : None,
                    'make_plots_backgrounds' : self.make_plots_backgrounds
                }
                self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
                for hadTau_selection_numerator in self.hadTau_selections_numerator:
                    key_hadd_stage2_job = getKey(charge_selection)
                    key_makePlots_job = getKey(charge_selection, trigMatchingOption, absEtaBin, "numerator", hadTau_selection_numerator)
                    self.jobOptions_make_plots[key_makePlots_job] = {
                        'executable'             : self.executable_make_plots,
                        'inputFile'              : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                        'cfgFile_modified'       : os.path.join(
                            self.dirs[key_makePlots_dir][DKEY_CFGS],
                            "makePlots_%s_%s_%s_numerator_%s_%s_cfg.py" % \
                            (self.channel, charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin)),
                        'outputFile'             : os.path.join(
                            self.dirs[key_makePlots_dir][DKEY_PLOT],
                            "makePlots_%s_%s_%s_numerator_%s_%s.png" % \
                            (self.channel, charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin)),
                        'histogramDir'           : "jetToTauFakeRate_%s_%s/numerator/%s/%s" % (charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin),
                        'label'                  : None,
                        'make_plots_backgrounds' : self.make_plots_backgrounds
                    }
                    self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
    self.sbatchFile_comp_jetToTauFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_comp_jetToTauFakeRate.py")
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_comp_jetToTauFakeRate)
        self.createScript_sbatch(self.executable_comp_jetToTauFakeRate, self.sbatchFile_comp_jetToTauFakeRate, self.jobOptions_comp_jetToTauFakeRate)
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile, make_dependency="phony_hadd_stage1", max_mem='4096M')
    self.addToMakefile_comp_jetToTauFakeRate(lines_makefile)
    self.addToMakefile_comp_hadd(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done.")
    return self.num_jobs
def validate(output_dir, verbose=False):
    '''Validates the job execution carried out by dump_rle_parallel()
    Args:
      output_dir: string, The directory where all RLE files are stored
      verbose:    bool, Enable verbose output

    Returns:
      None

    The validation is quite basic: the program loops over the subdirectories of output_dir,
    matches them against the dictionary entries specified by the samples variable, and counts
    the number of lines in each RLE file. If the number of lines doesn't match the number of
    entries in the corresponding ROOT file, the user is notified about such discrepancies.
    In principle, the script could also print the relevant commands to fix the issues
    (and dump them to an easily executable file), but let's leave that for another time.
    '''
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    root_file_regex = re.compile(r'^tree_(\d+)\.root$')
    file_dict = {k: [] for k in ['excess', 'missing', 'corrupted']}
    try:
        for s_key, s_value in samples.items():
            sample_name = s_value['process_name_specific']
            sample_dir = os.path.join(output_dir, sample_name)
            if os.path.isdir(sample_dir):
                logging.debug("Found sample directory {sample_dir}".format(sample_dir=sample_dir))
                # NB! assume that there are no secondary paths in the dictionary (hence index 0!)
                sample_path_dict = s_value['local_paths'][0]
                sample_path = sample_path_dict['path']
                blacklist = sample_path_dict['blacklist']
                for sample_subdir in os.listdir(sample_path):
                    sample_subpath_idx = -1
                    try:
                        sample_subpath_idx = int(sample_subdir)
                    except ValueError:
                        continue
                    if sample_subpath_idx < 0:
                        raise ValueError("Internal error")
                    sample_subpath = os.path.join(sample_path, sample_subdir)
                    logging.debug("Processing sample subdirectory {sample_subpath}".format(sample_subpath=sample_subpath))
                    for sample_file in os.listdir(sample_subpath):
                        sample_file_fullpath = os.path.join(sample_subpath, sample_file)
                        if not sample_file.endswith('.root') or not os.path.isfile(sample_file_fullpath):
                            continue
                        root_file_regex_match = root_file_regex.search(sample_file)
                        if not root_file_regex_match:
                            continue
                        root_file_idx = int(root_file_regex_match.group(1))
                        expected_rle_file_basename = '{root_file_idx}.txt'.format(root_file_idx=root_file_idx)
                        expected_rle_file = os.path.join(sample_dir, expected_rle_file_basename)
                        file_dict_entry = (expected_rle_file, sample_file_fullpath)
                        if root_file_idx in blacklist:
                            if os.path.isfile(expected_rle_file):
                                logging.warning(
                                    'Found RLE file {rle_file} (corresponding to blacklisted {root_file}) '
                                    'which you ought to delete'.format(
                                        rle_file=expected_rle_file,
                                        root_file=sample_file_fullpath,
                                    ))
                                file_dict['excess'].append(file_dict_entry)
                            continue
                        if not os.path.isfile(expected_rle_file):
                            logging.warning(
                                'Missing RLE file {rle_file} (corresponding to {root_file})'.format(
                                    rle_file=expected_rle_file,
                                    root_file=sample_file_fullpath,
                                ))
                            file_dict['missing'].append(file_dict_entry)
                            continue
                        nof_rle_events = raw_linecount(expected_rle_file)
                        if nof_rle_events == 1 and os.path.getsize(expected_rle_file) == 1:
                            # the RLE file contains only a newline, hence no events
                            nof_rle_events = 0
                        root_file = ROOT.TFile(sample_file_fullpath, 'read')
                        root_tree = root_file.Get('tree')
                        nof_entries = root_tree.GetEntries()
                        nof_events_diff = nof_rle_events - nof_entries
                        if nof_events_diff < 0:
                            logging.error(
                                'Missing {nof_events} events in {rle_filename} (corresponding to {sample_file}): '
                                'expected {expected}, got {actual}'.format(
                                    nof_events=abs(nof_events_diff),
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        elif nof_events_diff > 0:
                            logging.error(
                                'Got {nof_events} more events than expected in {rle_filename} (corresponding '
                                'to {sample_file}): expected {expected}, got {actual}'.format(
                                    nof_events=nof_events_diff,
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        else:
                            logging.debug(
                                'File {rle_filename} (corresponding to {sample_file}) looks OK'.format(
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                ))
    except KeyboardInterrupt:
        pass
    if any(map(bool, file_dict.values())):
        logging.info('Validation finished with errors')
        for key in file_dict.keys():
            if file_dict[key]:
                logging.info('Number of {key} RLE files: {nof_key}'.format(key=key, nof_key=len(file_dict[key])))
                for entry in file_dict[key]:
                    logging.info('{rle_file} <=> {sample_file}'.format(rle_file=entry[0], sample_file=entry[1]))
    else:
        logging.info('Validation finished successfully')
    return
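# ---------------------------------------------------------------------------
# raw_linecount() is used by validate() but defined elsewhere in the package;
# a buffered line counter matching the expected behaviour might look like the
# sketch below (an illustration, not the package's implementation):

def raw_linecount_sketch(path):
    # Count newline bytes in 1 MiB chunks; cheap even for large RLE files.
    lines = 0
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            lines += chunk.count(b'\n')
    return lines
# ---------------------------------------------------------------------------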
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        sample_category = sample_info["sample_category"]
        is_mc = (sample_info["type"] == "mc")
        logging.info("Building dictionaries for sample %s..." % process_name)
        central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
        central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts
        for central_or_shift_or_dummy in central_or_shifts_extended:
            process_name_extended = [ process_name, "hadd" ]
            for process_name_or_dummy in process_name_extended:
                if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]:
                    continue
                if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions:
                    if not is_mc:
                        continue
                    if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info):
                        continue
                key_dir = getKey(process_name_or_dummy, central_or_shift_or_dummy)
                for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]:
                    initDict(self.dirs, [ key_dir, dir_type ])
                    if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                        self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, process_name_or_dummy, central_or_shift_or_dummy)
                    else:
                        self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name_or_dummy)
    for subdirectory in [ "addBackgrounds", "prepareDatacards" ]:
        key_dir = getKey(subdirectory)
        for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
            initDict(self.dirs, [ key_dir, dir_type ])
            if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_HIST, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT ]:
        initDict(self.dirs, [ dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_HADD_RT, DKEY_PLOT, DKEY_COMBINE_OUTPUT ]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    numDirectories = 0
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            numDirectories += len(self.dirs[key])
        else:
            numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
            numDirectories_created += len(self.dirs[key])
        else:
            create_if_not_exists(self.dirs[key])
            numDirectories_created += 1
        while 100 * numDirectories_created >= frac * numDirectories:
            logging.info(" %i%% completed" % frac)
            frac += 1
    logging.info("Done.")
    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        inputFileList = inputFileLists[sample_name]
        is_mc = (sample_info["type"] == "mc")
        sample_category = sample_info["sample_category"]
        for central_or_shift in self.central_or_shifts:
            if central_or_shift != "central" and not is_mc:
                continue
            if not self.accept_central_or_shift(central_or_shift, sample_info):
                continue
            key_analyze_dir = getKey(process_name, central_or_shift)
            for jobId in inputFileList.keys():
                analyze_job_tuple = (process_name, central_or_shift, jobId)
                key_analyze_job = getKey(*analyze_job_tuple)
                ntupleFiles = inputFileList[jobId]
                if len(ntupleFiles) == 0:
                    logging.warning("No input ntuples for %s --> skipping job !!" % key_analyze_job)
                    continue
                rleOutputFile = os.path.join(
                    self.dirs[key_analyze_dir][DKEY_RLES],
                    "rle_{channel}_{process_name}_{central_or_shift}_{jobId}.txt".format(
                        channel          = self.channel,
                        process_name     = process_name,
                        central_or_shift = central_or_shift,
                        jobId            = jobId,
                    )) if self.select_rle_output else ""
                cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%i_cfg.py" % analyze_job_tuple)
                logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%i.log" % analyze_job_tuple)
                histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%i.root" % analyze_job_tuple)
                self.jobOptions_analyze[key_analyze_job] = {
                    'ntupleFiles'              : ntupleFiles,
                    'cfgFile_modified'         : cfgFile_modified_path,
                    'histogramFile'            : histogramFile_path,
                    'selEventsFileName_output' : rleOutputFile,
                    'logFile'                  : logFile_path,
                    'absEtaBins_e'             : self.absEtaBins_e,
                    'ptBins_e'                 : self.ptBins_e,
                    'absEtaBins_mu'            : self.absEtaBins_mu,
                    'ptBins_mu'                : self.ptBins_mu,
                    'central_or_shift'         : central_or_shift,
                    'fillGenEvtHistograms'     : self.fillGenEvtHistograms,
                    'triggers_mu_cfg'          : "leptonFR_triggers['{}']['{}']".format(self.era, 'mu'),
                    'triggers_e_cfg'           : "leptonFR_triggers['{}']['{}']".format(self.era, 'e'),
                    'lep_mva_cut_e'            : float(self.lep_mva_cut_e),
                    'lep_mva_cut_mu'           : float(self.lep_mva_cut_mu),
                }
                self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)
                # initialize input and output file names for hadd_stage1
                key_hadd_stage1_dir = getKey(process_name)
                key_hadd_stage1_job = getKey(process_name)
                if not key_hadd_stage1_job in self.inputFiles_hadd_stage1:
                    self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s.root" % process_name)
    # initialize input and output file names for hadd_stage1_5
    key_hadd_stage1_5_dir = getKey("hadd")
    key_hadd_stage1_5_job = getKey('')
    if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5:
        self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
    for key_hadd_stage1_job in self.outputFile_hadd_stage1.keys():
        self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
    self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5.root")
    # sum fake contributions for the total of all MC samples
    # input processes: TTj,...
    # output process:  fakes_mc
    key_hadd_stage1_5_job = getKey('')
    key_addBackgrounds_dir = getKey("addBackgrounds")
    key_addBackgrounds_job_sum = getKey("fakes_mc")
    sample_categories = []
    sample_categories.extend(self.nonfake_backgrounds)
    sample_categories.extend(self.ttHProcs)
    processes_input = []
    for sample_category in sample_categories:
        processes_input.append("%sj" % sample_category)
    self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_sum] = {
        'inputFile'        : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_cfg.py" % "fakes_mc"),
        'outputFile'       : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s.root" % "fakes_mc"),
        'logFile'          : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s.log" % "fakes_mc"),
        'categories'       : [
            "LeptonFakeRate/numerator/electrons_tight",
            "LeptonFakeRate/denominator/electrons_fakeable",
            "LeptonFakeRate/numerator/muons_tight",
            "LeptonFakeRate/denominator/muons_fakeable"
        ],
        'processes_input'  : processes_input,
        'process_output'   : "fakes_mc",
        'histogramsToCopy' : list(self.histograms_to_fit.keys()),
        'sysShifts'        : []
    }
    self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_sum])
    # create configuration files to run 'addBackgrounds_LeptonFakeRate'
    key_addBackgrounds_job_leptonFR = getKey('')
    self.jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job_leptonFR] = {
        'inputFile'        : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], os.path.basename(self.cfgFile_addBackgrounds_LeptonFakeRate)),
        'outputFile'       : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackground_LeptonFakeRate.root"),
        'logFile'          : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(self.cfgFile_addBackgrounds_LeptonFakeRate.replace("_cfg.py", ".log"))),
    }
    self.createCfg_addBackgrounds_LeptonFakeRate(self.jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job_leptonFR])
    # create configuration files to run 'addBackgrounds_Convs_LeptonFakeRate'
    key_addBackgrounds_job_conv = getKey('')
    self.jobOptions_addBackgrounds_Convs_LeptonFakeRate[key_addBackgrounds_job_conv] = {
        'inputFile'        : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], os.path.basename(self.cfgFile_addBackgrounds_Convs_LeptonFakeRate)),
        'outputFile'       : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackground_Convs_LeptonFakeRate.root"),
        'logFile'          : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(self.cfgFile_addBackgrounds_Convs_LeptonFakeRate.replace("_cfg.py", ".log"))),
    }
    self.createCfg_addBackgrounds_Convs_LeptonFakeRate(self.jobOptions_addBackgrounds_Convs_LeptonFakeRate[key_addBackgrounds_job_conv])
    # initialize input and output file names for hadd_stage2
    key_hadd_stage2_dir = getKey("hadd")
    key_hadd_stage2_job = getKey('')
    if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
        self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
    # CV: hadd_stage_1_5 output file does not need to be added as input for hadd_stage_2,
    #     as addBackgrounds_LeptonFakeRate output file contains all histograms except fakes_mc
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_sum]['outputFile'])
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job_leptonFR]['outputFile'])
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_Convs_LeptonFakeRate[key_addBackgrounds_job_conv]['outputFile'])
    self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2.root")
    # We need to generate the eta and pt bins for electrons and muons
    lepton_bins = {}
    categories = []
    for lepton in ['electron', 'muon']:
        if lepton not in lepton_bins:
            lepton_bins[lepton] = {}
        absEtaBins = None
        ptBins = None
        lepton_short = None
        if lepton == 'electron':
            absEtaBins = self.absEtaBins_e
            ptBins = self.ptBins_e
            lepton_short = 'e'
        elif lepton == 'muon':
            absEtaBins = self.absEtaBins_mu
            ptBins = self.ptBins_mu
            lepton_short = 'mu'
        else:
            raise ValueError('Invalid lepton type: %s' % lepton)
        for selection in ['tight', 'fakeable']:
            if selection not in lepton_bins[lepton]:
                lepton_bins[lepton][selection] = []
            num_or_den = None
            if selection == 'tight':
                num_or_den = 'numerator'
            elif selection == 'fakeable':
                num_or_den = 'denominator'
            else:
                raise ValueError('Invalid lepton selection: %s' % selection)
            for absEtaBin_idx in range(0, len(absEtaBins) - 1):
                absEtaBinLowerEdge = absEtaBins[absEtaBin_idx]
                absEtaBinUpperEdge = absEtaBins[absEtaBin_idx + 1]
                absEtaBinString = getEtaBin(absEtaBinLowerEdge, absEtaBinUpperEdge)
                for ptBin_idx in range(0, len(ptBins) - 1):
                    ptBinsLowerEdge = ptBins[ptBin_idx]
                    ptBinsUpperEdge = ptBins[ptBin_idx + 1]
                    ptBinString = getPtBin(ptBinsLowerEdge, ptBinsUpperEdge)
                    absEta_and_ptBinString = '%s_%s' % (absEtaBinString, ptBinString)
                    lepton_bins[lepton][selection].append(
                        construct_lepton_params(
                            lepton, lepton_short, selection, absEta_and_ptBinString,
                            error_msg = "No fit parameter range specified for abs(eta) range = (%.3f, %.3f) and "
                                        "pT range = (%.3f, %.3f) for lepton type '%s' !!" % \
                                        (absEtaBinLowerEdge, absEtaBinUpperEdge, ptBinsLowerEdge, ptBinsUpperEdge, lepton)
                        ) + (absEtaBinLowerEdge, absEtaBinUpperEdge, ptBinsLowerEdge, ptBinsUpperEdge, 0)
                    )
                    categories.append(
                        (
                            "LeptonFakeRate/%s/%ss_%s/%s/%s" % (num_or_den, lepton, selection, absEtaBinString, ptBinString),
                            "%ss_%s_%s_shapes" % (lepton, selection, absEta_and_ptBinString),
                        )
                    )
            # Let's also add the inclusive category
            lepton_bins[lepton][selection].append(
                construct_lepton_params(
                    lepton, lepton_short, selection, 'incl',
                    error_msg = "No fit parameter range specified for lepton type %s" % lepton
                ) + (-1., -1., -1., -1., 1)
            )
            categories.append(
                (
                    "LeptonFakeRate/%s/%ss_%s/incl" % (num_or_den, lepton, selection),
                    "%ss_%s_incl_shapes" % (lepton, selection),
                )
            )
    lepton_bins_merged = []
    for lepton_type in lepton_bins:
        for lepton_selection in lepton_bins[lepton_type]:
            lepton_bins_merged.extend(lepton_bins[lepton_type][lepton_selection])
    if self.prep_dcard:
        logging.info("Creating configuration files to run 'prepareDatacards_LeptonFakeRate'")
        datacards = []
        for histogramToFit in self.histograms_to_fit:
            key_prep_dcard_dir = getKey("prepareDatacards")
            key_prep_dcard_job = getKey(histogramToFit)
            datacard = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s.root" % histogramToFit)
            self.jobOptions_prep_dcard[key_prep_dcard_job] = {
                'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_LeptonFakeRate_%s_cfg.py" % histogramToFit),
                'datacardFile'     : datacard,
                'histogramDir'     : self.histogramDir_prep_dcard,
                'histogramToFit'   : histogramToFit,
                'label'            : None,
                'categories'       : categories,
            }
            datacards.append(datacard)
            self.createCfg_prep_dcard_LeptonFakeRate(self.jobOptions_prep_dcard[key_prep_dcard_job])
        # Create setupDatacards_LeptonFakeRate.py script from the template
        systematics_leptonFR = []
        for systematic in self.central_or_shifts:
            if systematic == 'central':
                continue
            systematic_name = systematic.replace('Up', '').replace('Down', '')
            if systematic_name not in systematics_leptonFR:
                systematics_leptonFR.append(systematic_name)
        setup_dcards_template_file = os.path.join(jinja_template_dir, 'setupDatacards_LeptonFakeRate.py.template')
        with open(setup_dcards_template_file, 'r') as setup_dcards_template_file_ptr:
            setup_dcards_template = setup_dcards_template_file_ptr.read()
        setup_dcards_script = jinja2.Template(setup_dcards_template).render(
            leptons           = lepton_bins_merged,
            central_or_shifts = systematics_leptonFR,
            signal_process    = "QCD" if self.use_QCD_fromMC else "data_fakes",
        )
        setup_dcards_script_path = os.path.join(self.dirs[DKEY_SCRIPTS], 'setupDatacards_LeptonFakeRate.py')
        logging.debug("writing setupDatacards_LeptonFakeRate script file = '%s'" % setup_dcards_script_path)
        with codecs.open(setup_dcards_script_path, "w", "utf-8") as setup_dcards_script_file:
            setup_dcards_script_file.write(setup_dcards_script)
            setup_dcards_script_file.flush()
            os.fsync(setup_dcards_script_file.fileno())
        add_chmodX(setup_dcards_script_path)
        if self.use_QCD_fromMC:
            postfit_plot_script_path = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/postFitPlot_fakes_from_mc.py')
            yieldtable_script_path = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/yieldTable_fakes_from_mc.py')
        else:
            postfit_plot_script_path = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/postFitPlot_fakes_from_data.py')
            yieldtable_script_path = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/yieldTable_fakes_from_data.py')
        # Create run_postFit.sh script from the template
        combine_output_dir = os.path.join(self.dirs[DKEY_COMBINE_OUTPUT], 'output')
        postfit_template_file = os.path.join(jinja_template_dir, 'run_postFit.sh.template')
        with open(postfit_template_file, 'r') as postfit_template_file_ptr:
            postfit_template = postfit_template_file_ptr.read()
        for lepton in ['electron', 'muon']:
            for selection in ['fakeable', 'tight']:
                is_num = selection == 'tight'
                for params in lepton_bins[lepton][selection]:
                    l_array, l_range, l_sub_dir, l_eta_low, l_eta_high, l_pt_low, l_pt_high, l_is_inclusive = params
                    postfit_script = jinja2.Template(postfit_template).render(
                        new_cmssw_base         = self.cmssw_base_dir_combine,
                        setup_dcards_script    = setup_dcards_script_path,
                        postfit_plot_script    = postfit_plot_script_path,
                        int_lumi_data          = self.lumi,
                        yieldtable_script      = yieldtable_script_path,
                        output_dir             = combine_output_dir,
                        numerator_plotLabel    = self.numerator_plotLabel,
                        denominator_plotLabel  = self.denominator_plotLabel,
                        l_array                = l_array,
                        l_range                = l_range,
                        l_sub_dir              = l_sub_dir,
                        l_eta_low              = l_eta_low,
                        l_eta_high             = l_eta_high,
                        l_pt_low               = l_pt_low,
                        l_pt_high              = l_pt_high,
                        l_is_inclusive         = l_is_inclusive,
                        is_num                 = is_num,
                        numerator_output_dir   = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.numerator_histogram),
                        denominator_output_dir = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.denominator_histogram),
                        selection              = selection,
                        lepton_letter          = 'e' if lepton == 'electron' else 'mu',
                        grep_value             = "QCD" if self.use_QCD_fromMC else "data_fakes",
                    )
                    postfit_script_path = os.path.join(
                        self.dirs[DKEY_SCRIPTS],
                        'mlfit_%s_%s.sh' % (self.numerator_histogram if is_num else self.denominator_histogram, l_array)
                    )
                    logging.debug("Writing run_postFit script file = '%s'" % postfit_script_path)
                    with codecs.open(postfit_script_path, "w", "utf-8") as postfit_script_file:
                        postfit_script_file.write(postfit_script)
                        postfit_script_file.flush()
                        os.fsync(postfit_script_file.fileno())
                    add_chmodX(postfit_script_path)
        key_prep_dcard_dir = getKey("prepareDatacards")
        fit_value_file = os.path.join(combine_output_dir, 'fit_values.txt')
        makefile_template_file = os.path.join(jinja_template_dir, 'Makefile_postFit.template')
        makefile_template = open(makefile_template_file, 'r').read()
        makefile_templatized = jinja2.Template(makefile_template).render(
            new_cmssw_base         = self.cmssw_base_dir_combine,
            setup_dcards_script    = setup_dcards_script_path,
            numerator_histogram    = self.numerator_histogram,
            denominator_histogram  = self.denominator_histogram,
            scripts_dir            = self.dirs[DKEY_SCRIPTS],
            numerator_datacard     = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s.root" % self.numerator_histogram),
            denominator_datacard   = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s.root" % self.denominator_histogram),
            output_dir             = combine_output_dir,
            numerator_output_dir   = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.numerator_histogram),
            denominator_output_dir = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.denominator_histogram),
            lepton_bins            = lepton_bins,
            fit_values             = fit_value_file,
        )
        makefile_path = os.path.join(self.dirs[DKEY_SCRIPTS], 'Makefile_postFit')
        logging.debug("Writing Makefile_postFit file = '%s'" % makefile_path)
        with codecs.open(makefile_path, "w", "utf-8") as makefile_path_file:
            makefile_path_file.write(makefile_templatized)
            makefile_path_file.flush()
            os.fsync(makefile_path_file.fileno())
        self.jobOptions_combine = {
            'inputFile'     : ' '.join(datacards),
            'outputFile'    : fit_value_file,
            'makefile_path' : makefile_path,
            'logFile'       : os.path.join(self.dirs[DKEY_LOGS], 'postFit.log'),
        }
        key_comp_LeptonFakeRate = getKey('')
        leptonFR_final_output = os.path.join(combine_output_dir, 'leptonFakeRates.root')
        self.jobOptions_comp_LeptonFakeRate[key_comp_LeptonFakeRate] = {
            'inputFile'            : [ fit_value_file, self.outputFile_hadd_stage2[key_hadd_stage2_job] ],
            'outputFile'           : leptonFR_final_output,
            'absEtaBins_e'         : self.absEtaBins_e,
            'ptBins_e'             : self.ptBins_e,
            'absEtaBins_mu'        : self.absEtaBins_mu,
            'ptBins_mu'            : self.ptBins_mu,
            'logFile'              : os.path.join(self.dirs[DKEY_LOGS], os.path.basename(self.cfgFile_comp_LeptonFakeRate).replace('_cfg.py', '.log')),
            'cfgFile_modified'     : os.path.join(self.dirs[DKEY_CFGS], os.path.basename(self.cfgFile_comp_LeptonFakeRate)),
            'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "comp_LeptonFakeRate.png")
        }
        self.createCfg_comp_LeptonFakeRate(self.jobOptions_comp_LeptonFakeRate[key_comp_LeptonFakeRate])
        self.targets.append(self.jobOptions_comp_LeptonFakeRate[key_comp_LeptonFakeRate]['outputFile'])
    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_LeptonFakeRate.py")
    self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_LeptonFakeRate.py")
    self.sbatchFile_addBackgrounds_LeptonFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_LeptonFakeRate.py")
    self.sbatchFile_addBackgrounds_Convs_LeptonFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_Convs_LeptonFakeRate.py")
    self.sbatchFile_comp_LeptonFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_comp_LeptonFakeRate.py")
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
        self.createScript_sbatch(self.executable_addBackgrounds_recursively, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
        self.createScript_sbatch(self.executable_addBackgrounds_LeptonFakeRate, self.sbatchFile_addBackgrounds_LeptonFakeRate, self.jobOptions_addBackgrounds_LeptonFakeRate)
        self.createScript_sbatch(self.executable_addBackgrounds_LeptonFakeRate, self.sbatchFile_addBackgrounds_Convs_LeptonFakeRate, self.jobOptions_addBackgrounds_Convs_LeptonFakeRate)
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_comp_LeptonFakeRate)
        self.createScript_sbatch(self.executable_comp_LeptonFakeRate, self.sbatchFile_comp_LeptonFakeRate, self.jobOptions_comp_LeptonFakeRate)
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    # this step now does both the e Conv, data_fakes and fakes_mc computation
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    # self.addToMakefile_backgrounds_from_MC(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile, make_dependency = " ".join([
        "phony_addBackgrounds_LeptonFakeRate",
        "phony_addBackgrounds_Convs_LeptonFakeRate",
        "phony_addBackgrounds_sum"
    ]))
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_combine(lines_makefile)
    self.addToMakefile_comp_LeptonFakeRate(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done.")
    return self.num_jobs
def create(self):
    """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info['use_it']:
            continue
        process_name = sample_info["process_name_specific"]
        is_mc = (sample_info["type"] == "mc")
        if not is_mc:
            continue
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable, process_name))
        inputFileList = generateInputFileList(sample_info, self.max_files_per_job)
        key_dir = getKey(process_name)
        outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO], "%s.root" % process_name)
        if os.path.isfile(outputFile) and tools_is_file_ok(outputFile, min_file_size=2000):
            logging.info('File {} already exists --> skipping job'.format(outputFile))
            continue
        self.outputFiles[process_name] = {
            'inputFiles': [],
            'outputFile': outputFile
        }
        for jobId in inputFileList.keys():
            key_file = getKey(sample_name, jobId)
            self.inputFiles[key_file] = inputFileList[jobId]
            if len(self.inputFiles[key_file]) == 0:
                logging.warning("ntupleFiles['%s'] = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]))
                continue
            self.cfgFiles_puProfile[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "puProfile_%s_%i_cfg.txt" % (process_name, jobId))
            self.outputFiles_tmp[key_file] = os.path.join(self.dirs[key_dir][DKEY_HISTO_TMP], "histogram_%i.root" % jobId)
            self.logFiles_puProfile[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "puProfile_%s_%i.log" % (process_name, jobId))
            self.scriptFiles_puProfile[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "puProfile_%s_%i_cfg.sh" % (process_name, jobId))
            self.jobOptions_sbatch[key_file] = {
                'histName': process_name,
                'inputFiles': self.inputFiles[key_file],
                'cfgFile_path': self.cfgFiles_puProfile[key_file],
                'outputFile': self.outputFiles_tmp[key_file],
                'logFile': self.logFiles_puProfile[key_file],
                'scriptFile': self.scriptFiles_puProfile[key_file],
            }
            self.createCfg_puProfile(self.jobOptions_sbatch[key_file])
            self.outputFiles[process_name]['inputFiles'].append(self.outputFiles_tmp[key_file])
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable)
        self.num_jobs['puProfile'] += self.createScript_sbatch(self.executable, self.sbatchFile_puProfile, self.jobOptions_sbatch)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_puProfile(lines_makefile)
    self.addToMakefile_hadd(lines_makefile)
    self.addToMakefile_plot(lines_makefile)
    self.addToMakefile_finalHadd(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
    return self.num_jobs
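# ---------------------------------------------------------------------------
# tools_is_file_ok() is imported from elsewhere; judging by the call above it
# decides whether an existing output can be reused.  A sketch of the assumed
# minimal behaviour (existence plus a size threshold; the real helper may
# also inspect the ROOT content):

import os

def is_file_ok_sketch(path, min_file_size=2000):
    # Reject missing files and files below min_file_size bytes, which
    # typically indicate a truncated or failed job output.
    return os.path.isfile(path) and os.path.getsize(path) >= min_file_size
# ---------------------------------------------------------------------------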
def create(self): """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] is_mc = (sample_info["type"] == "mc") logging.info("Building dictionaries for sample %s..." % process_name) for lepton_selection in self.lepton_selections: central_or_shift_extensions = ["", "hadd", "addBackgrounds"] central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [ process_name, "hadd" ] for process_name_or_dummy in process_name_extended: if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]: continue if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions: if not is_mc: continue if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info): continue key_dir = getKey(process_name_or_dummy, lepton_selection, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, "_".join([ lepton_selection ]), process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, "_".join([ lepton_selection ]), process_name_or_dummy) for subdirectory in [ "prepareDatacards" ]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT ]: initDict(self.dirs, [ dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0; frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100*numDirectories_created >= frac*numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job) for lepton_selection in self.lepton_selections: for sample_name, 
sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) is_mc = (sample_info["type"] == "mc") inputFileList = inputFileLists[sample_name] for central_or_shift in self.central_or_shifts: if central_or_shift != "central" and not is_mc: continue # build config files for executing analysis code key_analyze_dir = getKey(process_name, lepton_selection, central_or_shift) for jobId in inputFileList.keys(): analyze_job_tuple = (process_name, lepton_selection, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job)) continue cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple) rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \ if self.select_rle_output else "" histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple) self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, 'cfgFile_modified' : cfgFile_modified_path, 'histogramFile' : histogramFile_path, 'logFile' : logFile_path, 'selEventsFileName_output' : rleOutputFile_path, 'leptonSelection' : lepton_selection, 'applyFakeRateWeights' : "disabled", 'central_or_shift' : central_or_shift, } self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name, lepton_selection) hadd_stage1_job_tuple = (process_name, lepton_selection) key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1.keys(): self.inputFiles_hadd_stage1[key_hadd_stage1_job] = [] self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile']) self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple) # initialize input and output file names for hadd_stage2 key_hadd_stage1_job = getKey(process_name, lepton_selection) key_hadd_stage2_dir = getKey("hadd", lepton_selection) key_hadd_stage2_job = getKey(lepton_selection) if not key_hadd_stage2_job in self.inputFiles_hadd_stage2.keys(): self.inputFiles_hadd_stage2[key_hadd_stage2_job] = [] self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job]) self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s.root" % lepton_selection) logging.info("Creating configuration files to run 'prepareDatacards'") processesToCopy = [] for process in self.prep_dcard_processesToCopy: processesToCopy.append(process) self.prep_dcard_processesToCopy = processesToCopy processesToCopy = [] for process in self.prep_dcard_signals: processesToCopy.append(process) self.prep_dcard_signals = processesToCopy for histogramToFit in self.histograms_to_fit: key_hadd_stage2_job = getKey("Tight") key_prep_dcard_dir = getKey("prepareDatacards") prep_dcard_job_tuple = (self.channel, 
histogramToFit) key_prep_dcard_job = getKey(histogramToFit) datacardFile = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple) self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple), 'datacardFile' : datacardFile, 'histogramDir' : self.histogramDir_prep_dcard, 'histogramToFit' : histogramToFit, 'label' : None } self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job]) jobOptions_makefile = copy.deepcopy(self.jobOptions_postFit) jobOptions_makefile['fit_result'] = os.path.join( self.dirs[DKEY_COMBINE_OUTPUT], 'fit_{}'.format(histogramToFit), jobOptions_makefile['target'] ) jobOptions_makefile['hadd_stage2'] = self.outputFile_hadd_stage2[key_hadd_stage2_job] jobOptions_makefile['prepare_datacard'] = datacardFile jobOptions_makefile['data_datacard'] = os.path.join( self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_data_%s_%s.root" % prep_dcard_job_tuple ) jobOptions_makefile['pseudodata_datacard'] = os.path.join( self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_pseudodata_%s_%s.root" % prep_dcard_job_tuple ) jobOptions_makefile['makefile'] = os.path.join( self.dirs[DKEY_COMBINE_OUTPUT], 'Makefile_{}'.format(histogramToFit) ) jobOptions_makefile['stdout'] = os.path.join( self.dirs[DKEY_COMBINE_OUTPUT], 'stdout_{}.log'.format(histogramToFit) ) self.createCfg_postFit(jobOptions_makefile) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile, make_dependency = "phony_hadd_stage1") self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_postFit(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs
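# The hadd bookkeeping in create() above proceeds in two stages: per-job histograms
# are merged into one hadd_stage1 file per (process, selection), and the stage-1
# outputs are in turn merged into one hadd_stage2 file per selection. A condensed,
# runnable sketch of that grouping; keys and file names are illustrative stand-ins
# for the framework's actual getKey() values:
inputFiles_hadd_stage1_demo = {}
outputFile_hadd_stage1_demo = {}
inputFiles_hadd_stage2_demo = {}
for process_name in [ 'TTZ', 'TTW' ]:
    key_stage1 = (process_name, 'Tight')
    for jobId in [ 1, 2 ]:
        inputFiles_hadd_stage1_demo.setdefault(key_stage1, []).append('analyze_%s_Tight_central_%i.root' % (process_name, jobId))
    outputFile_hadd_stage1_demo[key_stage1] = 'hadd_stage1_%s_Tight.root' % process_name
    inputFiles_hadd_stage2_demo.setdefault(('Tight',), []).append(outputFile_hadd_stage1_demo[key_stage1])
print(inputFiles_hadd_stage1_demo)  # two analyze outputs per stage-1 merge
print(inputFiles_hadd_stage2_demo)  # both stage-1 outputs feed the single stage-2 merge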
def plot(input_files, output_files, title, expected_neff, mode): histogram_dict = {} for sample_name, sample_entry in input_files.items(): if not hdfs.isfile(sample_entry['input']): logging.error('Could not find file {}'.format(sample_entry['input'])) continue root_file = ROOT.TFile.Open(sample_entry['input'], 'read') logging.debug('Opened file {}'.format(sample_entry['input'])) root_directories = list(filter( lambda root_dir: root_dir != None, [ root_file.Get(os.path.join(key.GetName(), mode, 'genEvt')) \ for key in root_file.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile' ] )) if len(root_directories) != 1: raise RuntimeError('Expected single directory in %s' % sample_entry['input']) root_dir = root_directories[0] histogram_dirs = [ root_dir.Get(key.GetName()) \ for key in root_dir.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile' ] if len(histogram_dirs) != 1: raise RuntimeError( 'Expected single directory containing lumiScale histograms in %s' % sample_entry['input'] ) histogram_dir = histogram_dirs[0] histograms = [ key.GetName() for key in histogram_dir.GetListOfKeys() \ if key.GetClassName().startswith('TH1') and 'lumiScale' in key.GetName() ] for histogram_name_actual in histograms: histogram_name = histogram_name_actual.replace('_lumiScale', '').replace('CMS_ttHl_', '') \ if histogram_name_actual != 'lumiScale' else 'central' histogram = histogram_dir.Get(histogram_name_actual).Clone() histogram.SetDirectory(0) if histogram.GetEntries() != sample_entry['nentries'] and mode == 'unbiased': raise RuntimeError('Expected {} entries from {} in file {}, but got {} entries'.format( sample_entry['nentries'], histogram_name, sample_entry['input'], histogram.GetEntries(), )) if histogram_name not in histogram_dict: histogram_dict[histogram_name] = { 'histogram' : histogram, 'nentries' : histogram.GetEntries(), 'nfiles' : 1, } else: histogram_dict[histogram_name]['histogram'].Add(histogram) histogram_dict[histogram_name]['nentries'] += histogram.GetEntries() histogram_dict[histogram_name]['nfiles'] += 1 root_file.Close() if not histogram_dict: logging.error('Could not find histograms for samples {}'.format(', '.join(list(input_files.keys())))) return if len(set(histogram_dict[histogram_name]['nfiles'] for histogram_name in histogram_dict)) != 1: raise RuntimeError( 'Inconsistent number of files found for samples %s' % ', '.join(list(input_files.keys())) ) if len(set(histogram_dict[histogram_name]['nentries'] for histogram_name in histogram_dict)) != 1: raise RuntimeError( 'Inconsistent number of entries found in samples %s' % ', '.join(list(input_files.keys())) ) min_y = -1 max_y = -1 nentries = -1 for histograms in histogram_dict.values(): histogram = histograms['histogram'] y_content = histogram.GetBinContent(1) y_error = histogram.GetBinError(1) y_down = y_content - y_error y_up = y_content + y_error if min_y < 0: min_y = y_down if max_y < 0: max_y = y_up if y_down < min_y: min_y = y_down if y_up > max_y: max_y = y_up if nentries < 0: nentries = histograms['nentries'] else: assert(nentries == histograms['nentries']) if not (y_down < expected_neff < y_up) and mode == 'unbiased': logging.warning( "Effective event count {} not within {} +- {}".format(expected_neff, y_content, y_error) ) if mode == 'unbiased': min_y = min(min_y, expected_neff) max_y = max(max_y, expected_neff) diff = 0.2 * (max_y - min_y) min_y -= diff max_y += diff canvas = ROOT.TCanvas('c', 'c', 1200, 900) canvas.SetGrid() ROOT.gStyle.SetOptStat(0) legend = ROOT.TLegend(0.1, 0.7, 0.48, 0.9) 
legend.SetHeader('N_{eff} (%d entries)' % nentries) expected_histogram = None line_width = 3 marker_style = 20 fill_style = 4000 lines = [] for idx, histogram_name in enumerate(sorted(histogram_dict.keys())): histogram = histogram_dict[histogram_name]['histogram'] color = 2 + idx histogram.SetTitle(title) histogram.SetAxisRange(min_y, max_y, "Y") histogram.SetLineColor(color) histogram.SetMarkerColor(color) histogram.SetLineWidth(line_width) histogram.SetMarkerStyle(marker_style) histogram.SetFillStyle(fill_style) histogram.Draw("l e1%s" % (" same" if idx > 0 else "")) y_content = histogram.GetBinContent(1) y_error = histogram.GetBinError(1) y_up = y_content + y_error y_down = y_content - y_error bin_width = histogram.GetBinWidth(1) bin_center = histogram.GetBinCenter(1) line_min_x = bin_center - bin_width / 4 line_max_x = bin_center + bin_width / 4 line_down = ROOT.TLine(line_min_x, y_down, line_max_x, y_down) line_down.SetLineColor(color) line_down.SetLineWidth(line_width) line_down.Draw() lines.append(line_down) line_up = ROOT.TLine(line_min_x, y_up, line_max_x, y_up) line_up.SetLineColor(color) line_up.SetLineWidth(line_width) line_up.Draw() lines.append(line_up) sig_digits = max(8 - int(math.ceil(math.log10(y_content))), 1) if y_content > 0. else 1 leg_pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits) leg_name = leg_pattern % (histogram_name, y_content, y_error) legend.AddEntry(histogram, leg_name) logging.debug( 'Effective event count for the sys unc option {} is {} +- {}'.format( histogram_name, y_content, y_error ) ) if not expected_histogram and mode == 'unbiased': expected_histogram = histogram.Clone() expected_histogram.Reset() expected_histogram.SetBinContent(1, expected_neff) expected_histogram.SetBinError(1, 0) expected_histogram.SetLineColor(ROOT.kBlack) expected_histogram.SetMarkerColor(ROOT.kBlack) expected_histogram.SetLineWidth(line_width) expected_histogram.SetMarkerStyle(marker_style) expected_histogram.SetLineStyle(9) expected_histogram.SetFillStyle(fill_style) if expected_histogram: logging.debug('Expecting {} events'.format(expected_neff)) expected_histogram.Draw("e2 same") legend.AddEntry(expected_histogram, 'expected (%.1f)' % expected_neff) legend.Draw() for output_file in output_files: canvas.SaveAs(output_file) canvas.Close() legend.Delete() if expected_histogram: expected_histogram.Delete() for histogram_name in histogram_dict: histogram_dict[histogram_name]['histogram'].Delete() for line in lines: line.Delete()
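# The legend labels in plot() derive the number of printed decimals from the
# magnitude of the bin content, keeping roughly eight significant digits in total.
# A standalone check of that arithmetic (legend_label() is a name introduced here
# for illustration):
import math

def legend_label(name, value, error):
    sig_digits = max(8 - int(math.ceil(math.log10(value))), 1) if value > 0. else 1
    pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits)
    return pattern % (name, value, error)

print(legend_label('central', 12345.6789, 12.3))  # -> central (12345.679 #pm 12.300)
print(legend_label('empty', 0., 0.))              # -> empty (0.0 #pm 0.0)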
continue fp = 'root://cms-xrd-global.cern.ch/%s' % file_cand[0] try: f = ROOT.TFile.Open(fp, 'read') f.Close() except Exception: break logging.debug('Selected file {} ({} events)'.format( file_cand[0], file_cand[1])) selected_files.append(fp) if args.nof_files > 0 and len(selected_files) >= args.nof_files: break dy_samples[dy_sample] = selected_files for dy_sample, files in dy_samples.items(): if not files: logging.warning('Could not find a file for {}'.format(dy_sample)) continue logging.info('Plotting {}'.format(dy_sample)) output_file_basename = dy_sample.split('/')[1].replace('-', '_') if 'ext' in dy_sample: output_file_basename += '_ext' output_file_fullpath_wo_ext = os.path.join(args.output_dir, output_file_basename) output_files = map( lambda ext: '{}.{}'.format(output_file_fullpath_wo_ext, ext), args.extensions) title = ' + '.join( map(lambda f: os.path.basename(f).replace('.root', ''), files)) plot(files, output_files, title, output_file_basename)
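# The selection loop above keeps only candidate files that can actually be opened
# over xrootd and stops once the requested number has been collected. A minimal
# self-contained analogue, where probe() is a stand-in for the ROOT.TFile.Open()
# check; note that in the original, a failed open aborts the scan for that sample
# (break) rather than moving on to the next candidate:
def probe(path):
    # assumption: some cheap validity test; here, a naming convention
    return not path.endswith('_bad.root')

candidates = [ 'store/a.root', 'store/b_bad.root', 'store/c.root' ]
nof_files = 2
selected_files = []
for cand in candidates:
    if not probe(cand):
        break
    selected_files.append('root://cms-xrd-global.cern.ch/%s' % cand)
    if nof_files > 0 and len(selected_files) >= nof_files:
        break
print(selected_files)  # -> ['root://cms-xrd-global.cern.ch/store/a.root']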
logging.getLogger().setLevel(logging.DEBUG) rle_file = args.file sample_name = args.sample_name output_file = args.output grep_directory = args.directory grep_individually = args.all try: sample_name_re = re.compile(sample_name) except re.error: logging.error( "Argument {arg} not a valid regex".format(arg=sample_name)) sys.exit(1) if grep_individually and not grep_directory: logging.warning( 'Option -a/--all has no effect unless you specify -d/--directory') if not hdfs.isfile(rle_file): logging.error("No such file: '{rle_filename}'".format( rle_filename=rle_file, )) sys.exit(1) if output_file and not hdfs.isdir(os.path.dirname(output_file)): logging.error( "Parent directory of '{output_file}' doesn't exist".format( output_file=output_file, )) sys.exit(1) if grep_directory and not hdfs.isdir(grep_directory): logging.error("Grep directory '{grep_directory}' doesn't exist".format( grep_directory=grep_directory, ))
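# The sample-name argument above is a user-supplied regular expression; a
# standalone version of that validation step, catching the specific re.error
# rather than a bare except (compile_sample_filter() is a name introduced here
# for illustration):
import logging
import re
import sys

def compile_sample_filter(pattern):
    try:
        return re.compile(pattern)
    except re.error as err:
        logging.error("Argument {arg} not a valid regex: {err}".format(arg=pattern, err=err))
        sys.exit(1)

sample_name_re = compile_sample_filter(r'^/DYJetsToLL')
print(bool(sample_name_re.match('/DYJetsToLL_M-50')))  # -> True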
if era == "2016" and sample_name.startswith(("/DoubleMuon/", "/DoubleEG/", "/Tau/")) and \ sample_name.find("PromptReco-v3") == -1: sample_info["use_it"] = False if __name__ == '__main__': logging.info( "Running the jobs with the following systematic uncertainties enabled: %s" % \ ', '.join(central_or_shifts) ) if sample_filter: samples = filter_samples(samples, sample_filter) if tau_id_wp: logging.warning( "Changing hadTau_selection_denominator from {} to {}".format( hadTau_selection_denominator, tau_id_wp)) hadTau_selection_denominator = tau_id_wp analysis = analyzeConfig_jetToTauFakeRate( configDir=os.path.join("/home", getpass.getuser(), "ttHAnalysis", era, version), outputDir=os.path.join("/hdfs/local", getpass.getuser(), "ttHAnalysis", era, version), executable_analyze="analyze_jetToTauFakeRate", samples=samples, charge_selections=["OS"], jet_minPt=20., jet_maxPt=1.e+6, jet_minAbsEta=-1., jet_maxAbsEta=2.3,
def create(self): """Creates all necessary config files and runs the MEM -- either locally or on the batch system """ for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) # read the file in, sample-by-sample # build the dictionary recursively # add rle file also to generated cfg files # print integrations per job as well! # consider more than 1 file per jobs -- the jobs are splitted by MEM integration anyways rle_filters = self.get_filter() if self.rle_filter_file else {} statistics = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue if not os.path.exists(sample_info['local_paths'][0]['path']): logging.warning("Skipping sample {sample_name}".format(sample_name = sample_name)) continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_addMEM, process_name)) is_mc = (sample_info["type"] == "mc") if self.rle_filter_file: assert(process_name in rle_filters) inputFileList = generateInputFileList(sample_info, self.max_files_per_job) # typically, the analysis ends here and starts looping b/c the smallest unit of work processes # at least one file; we need, however, to split the file into event ranges in such a way that # each job performs mem_integrations_per_job MEM integrations # so what we are going to do is to open each set of files in inputFileList, read the variable # requestMEM_*l_*tau and try to gather the event ranges such that each event range # performs up to mem_integrations_per_job integrations per job memEvtRangeDict = self.memJobList(inputFileList, rle_filters[process_name] if self.rle_filter_file else []) for jobId in memEvtRangeDict.keys(): key_dir = getKey(sample_name) key_file = getKey(sample_name, jobId) self.inputFiles[key_file] = memEvtRangeDict[jobId]['input_fileset'] # there should always be a job assert(self.inputFiles[key_file] > 0), "More than one input file: %s ?? !!" % \ ', '.join(self.inputFiles[key_file]) #assert(len(self.inputFiles[key_file]) == 1), "There is more than one input file!" self.cfgFiles_addMEM_modified[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i_cfg.py" % (self.channel, process_name, jobId) ) self.shFiles_addMEM_modified[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i.sh" % (self.channel, process_name, jobId) ) self.outputFiles[key_file] = os.path.join( self.dirs[key_dir][DKEY_NTUPLES], "%s_%i.root" % (process_name, jobId) ) self.logFiles_addMEM[key_file] = os.path.join( self.dirs[key_dir][DKEY_LOGS], "addMEM_%s_%s_%i.log" % (self.channel, process_name, jobId) ) self.logFiles_addMEM[key_file] = get_log_version((self.logFiles_addMEM[key_file],))[0] self.createCfg_addMEM( self.inputFiles[key_file], memEvtRangeDict[jobId]['event_range'][0], memEvtRangeDict[jobId]['event_range'][1], self.outputFiles[key_file], self.era, sample_info["sample_category"], is_mc, self.cfgFiles_addMEM_modified[key_file], memEvtRangeDict[jobId]['whitelist'], ) # associate the output file with the fileset_id #UDPATE: ONE OUTPUT FILE PER SAMPLE! 
fileset_id = memEvtRangeDict[jobId]['fileset_id'] hadd_output_dir = os.path.join( self.dirs[key_dir][DKEY_FINAL_NTUPLES], '%04d' % (fileset_id // 1000) ) if not os.path.exists(hadd_output_dir): os.makedirs(hadd_output_dir) hadd_output = os.path.join( hadd_output_dir, '%s_%i.root' % ('tree', fileset_id) # UDPATE: ADDED #hadd_output_dir, "tree.root" # UDPATE: REMOVED ) if hadd_output not in self.hadd_records: self.hadd_records[hadd_output] = {} self.hadd_records[hadd_output]['output_files'] = [] self.hadd_records[hadd_output]['fileset_id'] = fileset_id self.hadd_records[hadd_output]['output_files'].append(self.outputFiles[key_file]) self.hadd_records[hadd_output]['process_name'] = process_name #self.filesToClean.append(self.outputFiles[key_file]) # let's sum the number of integration per sample nofEntriesMap = {} for v in memEvtRangeDict.values(): if v['fileset_id'] not in nofEntriesMap: nofEntriesMap[v['fileset_id']] = { 'nof_entries' : v['nof_entries'], } statistics[process_name] = { 'nof_int' : sum([entry['nof_int'] for entry in memEvtRangeDict.values()]), 'nof_entries' : sum([entry['nof_entries'] for entry in nofEntriesMap.values()]), 'nof_events_pass' : sum([entry['nof_events_pass'] for entry in memEvtRangeDict.values()]), 'nof_int_pass' : sum([entry['nof_int_pass'] for entry in memEvtRangeDict.values()]), 'nof_zero' : sum([entry['nof_zero'] for entry in memEvtRangeDict.values()]), 'nof_jobs' : len(memEvtRangeDict), } if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addMEM) self.createScript_sbatch() logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_addMEM(lines_makefile) self.addToMakefile_hadd(lines_makefile) self.createMakefile(lines_makefile) ws_len = max([len(kk) + 1 for kk in statistics.keys()]) total_nof_integrations_sum = sum(x['nof_int'] for x in statistics.values()) total_nof_entires = sum(x['nof_entries'] for x in statistics.values()) total_nof_zero_int = sum(x['nof_zero'] for x in statistics.values()) total_nof_jobs = sum(x['nof_jobs'] for x in statistics.values()) total_nof_pass = sum(x['nof_events_pass'] for x in statistics.values()) total_nof_int_pass_avg = float(sum(x['nof_int_pass'] for x in statistics.values())) / total_nof_pass total_nof_integrations_avg = float(total_nof_integrations_sum) / total_nof_entires total_nof_int_per_job = float(total_nof_integrations_sum) / total_nof_jobs for k, v in statistics.iteritems(): if v['nof_entries'] == 0: int_per_event = 0. evt_pass = 0. else: int_per_event = float(v['nof_int']) / v['nof_entries'] evt_pass = (100 * float(v['nof_events_pass']) / v['nof_entries']) if v['nof_events_pass'] == 0: nof_int_pass = 0. 
else: nof_int_pass = float(v['nof_int_pass']) / v['nof_events_pass'] print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d (%.2f%%) evt pass; %.2f int/evt pass; %d evt 0int)' % (k, ' ' * (ws_len - len(k)), v['nof_int'], v['nof_entries'], v['nof_jobs'], int_per_event, v['nof_events_pass'], evt_pass, nof_int_pass, v['nof_zero'], ) ) print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d evt pass; %.2f int/evt pass; ' '%.2f int/job pass; %d evt 0int)' % ('total', ' ' * (ws_len - len('total')), total_nof_integrations_sum, total_nof_entires, total_nof_jobs, total_nof_integrations_avg, total_nof_pass, total_nof_int_pass_avg, total_nof_int_per_job, total_nof_zero_int, ) ) if self.max_mem_integrations > 0 and total_nof_integrations_sum > self.max_mem_integrations: logging.error("Will not start the jobs (max nof integrations exceeded)!") return False else: logging.info("Done") return True
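# The comment block in create() above describes splitting input files into event
# ranges such that each job performs at most mem_integrations_per_job MEM
# integrations. A simplified, self-contained version of that packing logic; the
# real memJobList() additionally tracks filesets and RLE whitelists, so this is
# a sketch of the idea, not the framework's implementation:
def split_by_budget(integrations_per_event, budget):
    # pack consecutive events into jobs whose total integration count stays <= budget
    jobs, current, cost = [], [], 0
    for evt, n_int in enumerate(integrations_per_event):
        if current and cost + n_int > budget:
            jobs.append((current[0], current[-1]))
            current, cost = [], 0
        current.append(evt)
        cost += n_int
    if current:
        jobs.append((current[0], current[-1]))
    return jobs  # inclusive (first_event, last_event) ranges

print(split_by_budget([4, 4, 4, 1, 9, 2], budget=8))  # -> [(0, 1), (2, 3), (4, 4), (5, 5)]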
def create(self): """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") logging.info("Building dictionaries for sample %s..." % process_name) for lepton_selection in self.lepton_selections: for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight", "forBDTtraining" ]: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) for chargeSumSelection in self.chargeSumSelections: central_or_shift_extensions = ["", "hadd", "addBackgrounds"] central_or_shift_dedicated = self.central_or_shifts if self.runTHweights(sample_info) else self.central_or_shifts_external central_or_shifts_extended = central_or_shift_extensions + central_or_shift_dedicated for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [ process_name, "hadd" ] for process_name_or_dummy in process_name_extended: if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]: continue if central_or_shift_or_dummy not in central_or_shift_extensions and not self.accept_systematics( central_or_shift_or_dummy, is_mc, lepton_selection, chargeSumSelection, sample_info ): continue key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, chargeSumSelection, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC ]: if dir_type == DKEY_SYNC and not self.do_sync: continue initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight, chargeSumSelection ]), process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight, chargeSumSelection ]), process_name_or_dummy) for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]: if dir_type == DKEY_SYNC and not self.do_sync: continue initDict(self.dirs, [ dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) 
numDirectories_created = 0; frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100*numDirectories_created >= frac*numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job) mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights') for lepton_selection in self.lepton_selections: electron_selection = lepton_selection muon_selection = lepton_selection hadTauVeto_selection = "Tight" hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ]) if lepton_selection == "forBDTtraining": electron_selection = "Loose" muon_selection = "Loose" elif lepton_selection == "Fakeable_mcClosure_e": electron_selection = "Fakeable" muon_selection = "Tight" elif lepton_selection == "Fakeable_mcClosure_m": electron_selection = "Tight" muon_selection = "Fakeable" for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight", "forBDTtraining" ]: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) for chargeSumSelection in self.chargeSumSelections: for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) inputFileList = inputFileLists[sample_name] sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") use_th_weights = self.runTHweights(sample_info) central_or_shift_dedicated = self.central_or_shifts if use_th_weights else self.central_or_shifts_external for central_or_shift in central_or_shift_dedicated: if not self.accept_systematics( central_or_shift, is_mc, lepton_selection, chargeSumSelection, sample_info ): continue central_or_shifts_local = [] if central_or_shift == "central" and not use_th_weights: for central_or_shift_local in self.central_or_shifts_internal: if self.accept_systematics( central_or_shift_local, is_mc, lepton_selection, chargeSumSelection, sample_info ): central_or_shifts_local.append(central_or_shift_local) logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift)) # build config files for executing analysis code key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, chargeSumSelection, central_or_shift) for jobId in inputFileList.keys(): analyze_job_tuple = (process_name, lepton_selection_and_frWeight, chargeSumSelection, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning("No input ntuples for %s --> skipping job !!" 
% (key_analyze_job)) continue syncOutput = '' syncTree = '' if self.do_sync: if chargeSumSelection != 'OS': continue mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight) if lepton_selection_and_frWeight == 'Tight': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_SR' % self.channel.replace('_', '') elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_Fake' % self.channel.replace('_', '') elif mcClosure_match: mcClosure_type = mcClosure_match.group('type') syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type)) syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel.replace('_', ''), mcClosure_type) else: continue if syncTree and central_or_shift != "central": syncTree = os.path.join(central_or_shift, syncTree) syncRLE = '' if self.do_sync and self.rle_select: syncRLE = self.rle_select % syncTree if not os.path.isfile(syncRLE): logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE) continue if syncOutput: self.inputFiles_sync['sync'].append(syncOutput) cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % analyze_job_tuple) rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % analyze_job_tuple) \ if self.select_rle_output else "" histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%s_%i.root" % analyze_job_tuple) branchName_memOutput = '%s_%s' % (self.MEMbranch, self.get_addMEM_systematics(central_or_shift)) \ if self.MEMbranch else '' self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, 'cfgFile_modified' : cfgFile_modified_path, 'histogramFile' : histogramFile_path, 'logFile' : logFile_path, 'selEventsFileName_output' : rleOutputFile_path, 'electronSelection' : electron_selection, 'muonSelection' : muon_selection, 'apply_leptonGenMatching' : self.apply_leptonGenMatching, 'hadTauSelection' : hadTauVeto_selection, 'chargeSumSelection' : chargeSumSelection, 'applyFakeRateWeights' : self.applyFakeRateWeights if not lepton_selection == "Tight" else "disabled", 'central_or_shift' : central_or_shift, 'central_or_shifts_local' : central_or_shifts_local, 'selectBDT' : self.isBDTtraining, 'branchName_memOutput' : branchName_memOutput, 'syncOutput' : syncOutput, 'syncTree' : syncTree, 'syncRLE' : syncRLE, 'apply_hlt_filter' : self.hlt_filter, 'useNonNominal' : self.use_nonnominal, 'fillGenEvtHistograms' : True, 'isControlRegion' : self.isControlRegion, } self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight, chargeSumSelection) hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight, chargeSumSelection) key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[key_hadd_stage1_job] = [] 
self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile']) self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s_%s.root" % hadd_stage1_job_tuple) if self.isBDTtraining or self.do_sync: continue # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5 key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, chargeSumSelection) key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight, chargeSumSelection) hadd_stage1_5_job_tuple = (lepton_selection_and_frWeight, chargeSumSelection) key_hadd_stage1_5_job = getKey(*hadd_stage1_5_job_tuple) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job]) self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5_%s_%s.root" % hadd_stage1_5_job_tuple) if self.isBDTtraining or self.do_sync: continue ## build the list of processes for which the _Convs and _fake contributions are hadded ## the tH samples with alternative couplings could be removed here sample_categories = [] sample_categories.extend(self.nonfake_backgrounds) sample_categories.extend(self.ttHProcs) processes_input_base = self.get_processes_input_base(sample_categories) # sum fake background contributions for the total of all MC samples # input processes: TT_fake, TTW_fake, TTWW_fake, ... # output process: fakes_mc key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, chargeSumSelection) key_addBackgrounds_dir = getKey("addBackgrounds") addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight, chargeSumSelection) key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple) processes_input = [] for process_input_base in processes_input_base: if "HH" in process_input_base: continue processes_input.append("%s_fake" % process_input_base) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_fakes_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_fakes_tuple), 'categories' : [ getHistogramDir(self.channel, lepton_selection, lepton_frWeight, chargeSumSelection) ], 'processes_input' : processes_input, 'process_output' : "fakes_mc" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]) # sum conversion background contributions for the total of all MC samples # input processes: TT_Convs, TTW_Convs, TTWW_Convs, ... 
# output process: Convs addBackgrounds_job_Convs_tuple = ("Convs", lepton_selection_and_frWeight, chargeSumSelection) key_addBackgrounds_job_Convs = getKey(*addBackgrounds_job_Convs_tuple) processes_input = [] for process_input_base in self.convs_backgrounds: if "HH" in process_input_base: continue processes_input.append("%s_Convs" % process_input_base) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_Convs_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_Convs_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_Convs_tuple), 'categories' : [ getHistogramDir(self.channel, lepton_selection, lepton_frWeight, chargeSumSelection) ], 'processes_input' : processes_input, 'process_output' : "Convs" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]) # initialize input and output file names for hadd_stage2 key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, chargeSumSelection) key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight, chargeSumSelection) hadd_stage2_job_tuple = (lepton_selection_and_frWeight, chargeSumSelection) key_hadd_stage2_job = getKey(*hadd_stage2_job_tuple) if not key_hadd_stage2_job in self.inputFiles_hadd_stage2: self.inputFiles_hadd_stage2[key_hadd_stage2_job] = [] if lepton_selection == "Tight": self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job]) self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s_%s.root" % hadd_stage2_job_tuple) if self.isBDTtraining or self.do_sync: if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) if self.isBDTtraining: self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) elif self.do_sync: self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating Makefile") lines_makefile = [] if self.isBDTtraining: self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) elif self.do_sync: self.addToMakefile_syncNtuple(lines_makefile) outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel) self.outputFile_sync['sync'] = outputFile_sync_path self.addToMakefile_hadd_sync(lines_makefile) else: raise ValueError("Internal logic error") self.addToMakefile_validate(lines_makefile) self.targets.extend(self.phoniesToAdd) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs logging.info("Creating configuration files to run 'addBackgroundFakes'") for chargeSumSelection in self.chargeSumSelections: key_hadd_stage1_5_job = 
getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"), chargeSumSelection) key_addFakes_dir = getKey("addBackgroundLeptonFakes") key_addFakes_job = getKey("data_fakes", chargeSumSelection) category_sideband = "{}_{}_Fakeable_wFakeRateWeights".format(self.channel, chargeSumSelection) self.jobOptions_addFakes[key_addFakes_job] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addFakes_dir][DKEY_CFGS], "addBackgroundLeptonFakes_%s_cfg.py" % chargeSumSelection), 'outputFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_HIST], "addBackgroundLeptonFakes_%s.root" % chargeSumSelection), 'logFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_LOGS], "addBackgroundLeptonFakes_%s.log" % chargeSumSelection), 'category_signal' : "{}_{}_Tight".format(self.channel, chargeSumSelection), 'category_sideband' : category_sideband } self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job]) key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), chargeSumSelection) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile']) logging.info("Creating configuration files to run 'prepareDatacards'") for histogramToFit in self.histograms_to_fit: key_prep_dcard_dir = getKey("prepareDatacards") if "OS" in self.chargeSumSelections: key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS") prep_dcard_job_tuple = (self.channel, "OS", histogramToFit) key_prep_dcard_job = getKey("OS", histogramToFit) self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % prep_dcard_job_tuple), 'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % prep_dcard_job_tuple), 'histogramDir' : self.histogramDir_prep_dcard, 'histogramToFit' : histogramToFit, 'label' : None } self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job]) if "SS" in self.chargeSumSelections: key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS") prep_dcard_job_tuple = (self.channel, "SS", histogramToFit) key_prep_dcard_job = getKey("SS", histogramToFit) self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % prep_dcard_job_tuple), 'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % prep_dcard_job_tuple), 'histogramDir' : self.histogramDir_prep_dcard_SS, 'histogramToFit' : histogramToFit, 'label' : 'SS' } self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job]) # add shape templates for the following systematic uncertainties: # - 'CMS_ttHl_Clos_norm_e' # - 'CMS_ttHl_Clos_shape_e' # - 'CMS_ttHl_Clos_norm_m' # - 'CMS_ttHl_Clos_shape_m' for chargeSumSelection in self.chargeSumSelections: key_prep_dcard_job = getKey(chargeSumSelection, histogramToFit) key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), chargeSumSelection) key_add_syst_fakerate_dir = getKey("addSystFakeRates") add_syst_fakerate_job_tuple = (self.channel, chargeSumSelection, histogramToFit) key_add_syst_fakerate_job = getKey(chargeSumSelection, histogramToFit) 
self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = { 'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'], 'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_%s_cfg.py" % add_syst_fakerate_job_tuple), 'outputFile' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s_%s.root" % add_syst_fakerate_job_tuple), 'category' : self.channel, 'histogramToFit' : histogramToFit, 'plots_outputFileName' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_PLOT], "addSystFakeRates.png") } histogramDir_nominal = None if chargeSumSelection == "OS": histogramDir_nominal = self.histogramDir_prep_dcard elif chargeSumSelection == "SS": histogramDir_nominal = self.histogramDir_prep_dcard_SS else: raise ValueError("Invalid parameter 'chargeSumSelection' = %s !!" % chargeSumSelection) for lepton_type in [ 'e', 'm' ]: lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type if lepton_mcClosure not in self.lepton_selections: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled") key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight, chargeSumSelection) histogramDir_mcClosure = self.mcClosure_dir['%s_%s' % (lepton_mcClosure, chargeSumSelection)] self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({ 'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections, 'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit), 'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'], 'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit) }) self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job]) logging.info("Creating configuration files to run 'makePlots'") key_makePlots_dir = getKey("makePlots") if "OS" in self.chargeSumSelections: key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS") key_makePlots_job = getKey("OS") self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel), 'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel), 'histogramDir' : self.histogramDir_prep_dcard, 'label' : self.channel, 'make_plots_backgrounds' : self.make_plots_backgrounds } self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) if "SS" in self.chargeSumSelections: key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS") key_makePlots_job = getKey("SS") self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_SS_cfg.py" % self.channel), 'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s_SS.png" % self.channel), 'histogramDir' : self.histogramDir_prep_dcard_SS, 'label' : "{} SS".format(self.channel), 'make_plots_backgrounds' : 
self.make_plots_backgrounds } self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) if "Fakeable_mcClosure" in self.lepton_selections: #TODO key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS") key_makePlots_job = getKey("OS") self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots_mcClosure, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_mcClosure_%s_cfg.py" % self.channel), 'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_mcClosure_%s.png" % self.channel) } self.createCfg_makePlots_mcClosure(self.jobOptions_make_plots[key_makePlots_job]) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel) self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel) self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds) self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds) self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes) self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_backgrounds_from_data(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_add_syst_fakerate(lines_makefile) self.addToMakefile_make_plots(lines_makefile) self.addToMakefile_validate(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs
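# The sync branch of create() above routes each job by matching the lepton
# selection string against a named-group regex (mcClosure_regex). A standalone
# check of that dispatch:
import re

mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights')
for selection in [ 'Tight', 'Fakeable_wFakeRateWeights', 'Fakeable_mcClosure_e_wFakeRateWeights' ]:
    mcClosure_match = mcClosure_regex.match(selection)
    print('%s -> %s' % (selection, mcClosure_match.group('type') if mcClosure_match else 'no match'))
# -> Tight and Fakeable_wFakeRateWeights do not match; the mcClosure string yields type 'e'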
def submitJob( self, inputFiles, executable, command_line_parameter, outputFilePath, outputFiles, scriptFile, logFile=None, skipIfOutputFileExists=False, job_template_file='sbatch-node.sh.template', copy_output_file=True, nof_submissions=0, ): """Renders the sbatch script for a single job and adds the job to the submission queue """ logging.debug("<sbatchManager::submitJob>: job_template_file = '%s'" % job_template_file) job_template_file = os.path.join(jinja_template_dir, job_template_file) job_template = open(job_template_file, 'r').read() # raise if logfile missing if not logFile: if not self.logFileDir: raise ValueError( "Please call 'setLogFileDir' before calling 'submitJob' !!" ) logFile = os.path.join( self.logFileDir, os.path.basename(scriptFile).replace(".sh", ".log")) # skip only if none of the output files are missing in the file system outputFiles_fullpath = map( lambda outputFile: os.path.join(outputFilePath, outputFile), outputFiles) if skipIfOutputFileExists: outputFiles_missing = [ outputFile for outputFile in outputFiles_fullpath \ if not is_file_ok(outputFile, validate_outputs = True, min_file_size = self.min_file_size) ] if not outputFiles_missing: logging.debug( "output file(s) = %s exist(s) --> skipping !!" % \ '; '.join(map(lambda x: "'%s'" % x, outputFiles_fullpath)) ) return if not self.workingDir: raise ValueError( "Please call 'setWorkingDir' before calling 'submitJob' !!") if not self.cmssw_base_dir: logging.warning("cmssw_base_dir not set, setting it to '%s'" % os.environ.get('CMSSW_BASE')) self.cmssw_base_dir = os.environ.get('CMSSW_BASE') job_dir = self.get_job_dir() # create script for executing jobs wrapper_log_file = logFile.replace('.log', '_wrapper.log') executable_log_file = logFile.replace('.log', '_executable.log') wrapper_log_file, executable_log_file = get_log_version( (wrapper_log_file, executable_log_file)) sbatch_command = "sbatch --partition={partition} --output={output} --comment='{comment}' " \ "{max_mem} {args} {cmd}".format( partition = self.queue, output = wrapper_log_file, comment = self.pool_id, args = self.sbatchArgs, cmd = scriptFile, max_mem = '--mem={}'.format(self.max_mem) if self.max_mem else '', ) two_pow_sixteen = 65536 random.seed((abs(hash(command_line_parameter))) % two_pow_sixteen) max_delay = 60 random_delay = random.randint(0, max_delay) script = jinja2.Template(job_template).render( working_dir=self.workingDir, cmssw_base_dir=self.cmssw_base_dir, job_dir=job_dir, job_template_file=job_template_file, exec_name=executable, command_line_parameter=command_line_parameter, inputFiles=" ".join(inputFiles), outputDir=outputFilePath, outputFiles=" ".join(outputFiles), wrapper_log_file=wrapper_log_file, executable_log_file=executable_log_file, script_file=scriptFile, RUNNING_COMMAND=sbatch_command, random_sleep=random_delay, copy_output_file=copy_output_file, ) logging.debug("writing sbatch script file = '%s'" % scriptFile) with codecs.open(scriptFile, "w", "utf-8") as f: f.write(script) f.flush() os.fsync(f.fileno()) if self.dry_run: return nof_submissions += 1 job = { 'sbatch_command': sbatch_command, 'status': Status.in_queue, 'log_wrap': wrapper_log_file, 'log_exec': executable_log_file, 'args': ( inputFiles, executable, command_line_parameter, outputFilePath, outputFiles, scriptFile, logFile, skipIfOutputFileExists, job_template_file, nof_submissions, ), 'nof_submissions': nof_submissions, 'outputFiles': outputFiles_fullpath, } self.queuedJobs.append(job)
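# submitJob() renders a shell-script template with jinja2 and staggers the job
# start with a random delay seeded from the hash of the command line, so that a
# burst of submissions does not hammer the storage system all at once. A condensed,
# runnable illustration; the template string below is a stand-in for
# sbatch-node.sh.template:
import random

import jinja2

job_template_demo = '#!/bin/bash\nsleep {{ random_sleep }}\n{{ exec_name }} {{ command_line_parameter }}\n'
command_line_parameter = 'analyze_demo_cfg.py'
random.seed(abs(hash(command_line_parameter)) % 65536)
script = jinja2.Template(job_template_demo).render(
    exec_name = 'analyze_demo',                        # assumed executable name
    command_line_parameter = command_line_parameter,
    random_sleep = random.randint(0, 60),              # up to max_delay seconds
)
print(script)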
def __init__(self, configDir, outputDir, executable_analyze, cfgFile_analyze, samples, hadTauVeto_selection, applyFakeRateWeights, central_or_shifts, max_files_per_job, era, use_lumi, lumi, check_output_files, running_method, num_parallel_jobs, executable_addBackgrounds, executable_addBackgroundJetToTauFakes, histograms_to_fit, select_rle_output = False, executable_prep_dcard = "prepareDatacards", executable_add_syst_dcard = "addSystDatacards", verbose = False, hlt_filter = False, dry_run = False, isDebug = False, use_home = True, do_sync = False, rle_select = '', use_nonnominal = False, ): analyzeConfig.__init__(self, configDir = configDir, outputDir = outputDir, executable_analyze = executable_analyze, channel = "ttZctrl", samples = samples, central_or_shifts = central_or_shifts, max_files_per_job = max_files_per_job, era = era, use_lumi = use_lumi, lumi = lumi, check_output_files = check_output_files, running_method = running_method, num_parallel_jobs = num_parallel_jobs, histograms_to_fit = histograms_to_fit, triggers = [ '1e', '1mu', '2e', '2mu', '1e1mu' ], executable_prep_dcard = executable_prep_dcard, executable_add_syst_dcard = executable_add_syst_dcard, verbose = verbose, dry_run = dry_run, isDebug = isDebug, use_home = use_home, do_sync = do_sync, ) self.lepton_selections = [ "Tight", "Fakeable" ] self.lepton_frWeights = [ "enabled", "disabled" ] self.hadTauVeto_selection_part2 = hadTauVeto_selection self.applyFakeRateWeights = applyFakeRateWeights run_mcClosure = 'central' not in self.central_or_shifts or len(central_or_shifts) > 1 or self.do_sync if self.era != '2017': logging.warning('mcClosure for lepton FR not possible for era %s' % self.era) run_mcClosure = False if run_mcClosure: # Run MC closure jobs only if the analysis is run w/ (at least some) systematic uncertainties # self.lepton_and_hadTau_selections.extend([ "Fakeable_mcClosure_all" ]) #TODO pass self.lepton_genMatches = [ "3l0g0j", "2l1g0j", "2l0g1j", "1l2g0j", "1l1g1j", "1l0g2j", "0l3g0j", "0l2g1j", "0l1g2j", "0l0g3j" ] self.apply_leptonGenMatching = None self.lepton_genMatches_nonfakes = [] self.lepton_genMatches_conversions = [] self.lepton_genMatches_fakes = [] if applyFakeRateWeights == "3lepton": self.apply_leptonGenMatching = True for lepton_genMatch in self.lepton_genMatches: if lepton_genMatch.endswith("0g0j"): self.lepton_genMatches_nonfakes.append(lepton_genMatch) elif lepton_genMatch.endswith("0j"): self.lepton_genMatches_conversions.append(lepton_genMatch) else: self.lepton_genMatches_fakes.append(lepton_genMatch) if run_mcClosure: self.lepton_selections.extend([ "Fakeable_mcClosure_e", "Fakeable_mcClosure_m" ]) else: raise ValueError("Invalid Configuration parameter 'applyFakeRateWeights' = %s !!" 
% applyFakeRateWeights) self.executable_addBackgrounds = executable_addBackgrounds self.executable_addFakes = executable_addBackgroundJetToTauFakes self.nonfake_backgrounds = [ "TT", "TTW", "TTZ", "TTWW", "EWK", "Rares", "tHq", "tHW", "VH" ] self.cfgFile_analyze = os.path.join(self.template_dir, cfgFile_analyze) self.prep_dcard_processesToCopy = [ "data_obs" ] + self.nonfake_backgrounds + [ "conversions", "fakes_data", "fakes_mc" ] self.histogramDir_prep_dcard = "ttZctrl_Tight" self.make_plots_backgrounds = [ "TTW", "TTZ", "TTWW", "EWK", "Rares", "tHq", "tHW" ] + [ "conversions", "fakes_data" ] self.cfgFile_make_plots = os.path.join(self.template_dir, "makePlots_ttZctrl_cfg.py") self.make_plots_signal = "TTZ" self.select_rle_output = select_rle_output self.rle_select = rle_select self.use_nonnominal = use_nonnominal self.hlt_filter = hlt_filter
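# The "3lepton" branch of __init__ above buckets the gen-matching strings by
# suffix: '...0g0j' (read here as zero gen-photons and zero gen-jets) counts as
# nonfake, any other '...0j' as conversion, and the rest as fakes. A standalone
# check of that classification; the interpretation of the suffixes is an
# assumption, the string handling mirrors the code above:
lepton_genMatches = [ "3l0g0j", "2l1g0j", "2l0g1j", "1l2g0j", "0l0g3j" ]
nonfakes, conversions, fakes = [], [], []
for lepton_genMatch in lepton_genMatches:
    if lepton_genMatch.endswith("0g0j"):
        nonfakes.append(lepton_genMatch)
    elif lepton_genMatch.endswith("0j"):
        conversions.append(lepton_genMatch)
    else:
        fakes.append(lepton_genMatch)
print(nonfakes)     # -> ['3l0g0j']
print(conversions)  # -> ['2l1g0j', '1l2g0j']
print(fakes)        # -> ['2l0g1j', '0l0g3j']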
def poll(self, nonBlocking): """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing """ text_line = '-' * 120 # Set a delimiter, which distinguishes entries b/w different jobs delimiter = ',' # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length): # 1) squeue -h -u {{user}} -o '%i %256k' # Collects the list of running jobs # a) -h omits header # b) -u {{user}} looks only for jobs submitted by {{user}} # c) -o '%i %256k' specifies the output format # i) %i -- job ID (1st column) # ii) %256k -- comment with width of 256 characters (2nd column) # If the job has no comments, the entry simply reads (null) # 2) grep {{comment}} # Filter the jobs by the comment which must be unique per sbatchManager instance at all times # 3) awk '{print $1}' # Filter only the jobIds out # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g' # Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read) command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \ "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'" command = jinja2.Template(command_template).render( user=self.user, pool_id_length=self.max_pool_id_length, comment=self.pool_id, delimiter=delimiter) # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes # even if some of them have already finished jobIds_set = set([ job_id for job_id in self.submittedJobs if self.submittedJobs[job_id]['status'] == Status.submitted ]) nofJobs_left = len(jobIds_set) + len(self.queuedJobs) while nofJobs_left > 0: # Get the list of jobs submitted to batch system and convert their jobIds to a set poll_result, poll_result_err = '', '' while True: poll_result, poll_result_err = run_cmd(command, do_not_log=False, return_stderr=True) if not poll_result and poll_result_err: logging.warning( 'squeue caught an error: {squeue_error}'.format( squeue_error=poll_result_err)) else: break # sleep a minute and then try again # in principle we could limit the number of retries, but hopefully that's not necessary logging.debug("sleeping for %i seconds." % 60) time.sleep(60) polled_ids = set() if poll_result != '': polled_ids = set(poll_result.split(delimiter)) # Check if number of jobs submitted to batch system is below maxSubmittedJobs; # if it is, take jobs from queuedJobs list and submit them, # until a total of maxSubmittedJobs is submitted to batch system nofJobs_toSubmit = min(len(self.queuedJobs), self.maxSubmittedJobs - len(polled_ids)) if nofJobs_toSubmit > 0: logging.debug( "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs." .format(len(polled_ids), len(self.queuedJobs), nofJobs_toSubmit)) else: logging.debug( "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing." 
.format(len(polled_ids), len(self.queuedJobs))) for i in range(0, nofJobs_toSubmit): # randomly submit a job from the queue two_pow_sixteen = 65536 random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen) max_idx = len(self.queuedJobs) - 1 random_idx = random.randint(0, max_idx) job = self.queuedJobs.pop(random_idx) job['status'] = Status.submitted job_id = self.submit(job['sbatch_command']) self.submittedJobs[job_id] = job # Now check status of jobs submitted to batch system: # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of # jobs that have finished already finished_ids = list(jobIds_set - polled_ids) # Do not poll anything if currently there are no finished jobs if finished_ids: # Based on the job's exit code, check whether the job has failed or completed successfully # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here # Therefore, we want to restrict the output by grepping specific job IDs # There's another problem with that: the length of a bash command is limited by the ARG_MAX kernel variable, # which is of order 2e6 # This means that we have to split the job IDs into chunks each of which we have to check separately finished_ids_chunks = [ finished_ids[i:i + self.max_nof_greps] for i in range(0, len(finished_ids), self.max_nof_greps) ] for finished_ids_chunk in finished_ids_chunks: completion = self.check_job_completion(finished_ids_chunk) completed_jobs, running_jobs, failed_jobs = [], [], [] for job_id, details in completion.iteritems(): if details.status == Status.completed: completed_jobs.append(job_id) elif details.status == Status.running: running_jobs.append(job_id) else: failed_jobs.append(job_id) # If there are any failed jobs, throw if failed_jobs: failed_jobs_str = ','.join(failed_jobs) errors = [ completion[job_id].status for job_id in failed_jobs ] logging.error( "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}" .format( jobIds=failed_jobs_str, reasons=', '.join(map(Status.toString, errors)), )) # Let's print a table where the first column corresponds to the job ID # and the second column lists the exit code, the derived exit code, the status # and the classification of the failed job logging.error("Error table:") for job_id in failed_jobs: sys.stderr.write( "{jobId} {exitCode} {derivedExitCode} {state} {status}\n" .format( jobId=job_id, exitCode=completion[job_id].exit_code, derivedExitCode=completion[job_id].derived_exit_code, state=completion[job_id].state, status=Status.toString( completion[job_id].status), )) sys.stderr.write('%s\n' % text_line) for failed_job in failed_jobs: for log in zip(['wrapper', 'executable'], ['log_wrap', 'log_exec']): logfile = self.submittedJobs[failed_job][log[1]] if os.path.isfile(logfile): logfile_contents = open(logfile, 'r').read() else: logfile_contents = '<file is missing>' sys.stderr.write( 'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n' .format( id=failed_job, description=log[0], path=logfile, log=logfile_contents, line=text_line, )) if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \ completion[failed_job].status == Status.io_error: # The job is eligible for resubmission if the job hasn't been resubmitted more # than a preset limit of resubmissions AND if the job failed due to I/O errors logging.warning( "Job w/ ID {id} and arguments {args} FAILED because: {reason} " "-> resubmission attempt #{attempt}".
format( id=failed_job, args=self.submittedJobs[failed_job]['args'], reason=Status.toString( completion[failed_job].status), attempt=self.submittedJobs[failed_job]['nof_submissions'], )) self.submitJob( *self.submittedJobs[failed_job]['args']) # The old ID must be deleted, b/c otherwise it would be used to compare against # squeue output and we would resubmit the failed job ad infinitum del self.submittedJobs[failed_job] else: # We've exceeded the maximum number of resubmissions -> fail the workflow raise Status.raiseError( completion[failed_job].status) else: logging.debug( "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}" .format( completedIds=','.join(completed_jobs), runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '', )) # Mark successfully finished jobs as completed so that we won't request their status codes again # Otherwise they would remain in the 'submitted' state for job_id in completed_jobs: if not all( map( lambda outputFile: is_file_ok( outputFile, validate_outputs=True, min_file_size=self.min_file_size), self.submittedJobs[job_id]['outputFiles'])): if self.submittedJobs[job_id]['nof_submissions'] < self.max_resubmissions: logging.warning( "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file " "-> resubmission attempt #{attempt}".format( id=job_id, args=self.submittedJobs[job_id]['args'], attempt=self.submittedJobs[job_id]['nof_submissions'], )) self.submitJob( *self.submittedJobs[job_id]['args']) del self.submittedJobs[job_id] else: raise ValueError( "Job w/ ID {id} FAILED because it repeatedly produces bogus output " "file {output} yet the job still exits w/o any errors" .format( id=job_id, output=', '.join( self.submittedJobs[job_id]['outputFiles']), )) else: # Job completed just fine self.submittedJobs[job_id]['status'] = Status.completed jobIds_set = set([ job_id for job_id in self.submittedJobs if self.submittedJobs[job_id]['status'] == Status.submitted ]) nofJobs_left = len(jobIds_set) + len(self.queuedJobs) logging.info( "Waiting for sbatch to finish (%d job(s) still left) ..." % nofJobs_left) if nofJobs_left > 0: if nonBlocking: return False two_pow_sixteen = 65536 random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen) max_delay = 300 random_delay = random.randint(0, max_delay) logging.debug("sleeping for %i seconds." % random_delay) time.sleep(self.poll_interval + random_delay) else: break return True
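# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): shows how the jinja2
# command template used by poll() above renders into a concrete shell pipeline.
# The user name and pool ID below are hypothetical placeholders.
def _example_render_poll_command():
    import jinja2
    command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
    command = jinja2.Template(command_template).render(
        user = 'jdoe', pool_id_length = 256, comment = 'pool_0123abcd', delimiter = ',')
    # renders to:
    #   squeue -h -u jdoe -o '%i %256k' | grep pool_0123abcd | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'
    return command
# --------------------------------------------------------------------------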
def create(self): """Creates all necessary config files and runs the Ntuple production -- either locally or on the batch system """ for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] is_mc = (sample_info["type"] == "mc") if is_mc and process_name not in self.pileup_histograms: raise ValueError("Missing PU distribution for %s in file %s" % (process_name, self.pileup)) logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable, process_name)) inputFileList = generateInputFileList(sample_info, self.max_files_per_job) key_dir = getKey(sample_name) subDirs = list( map( lambda y: os.path.join(self.dirs[key_dir][DKEY_NTUPLES], '%04d' % y), set(map(lambda x: x // 1000, inputFileList.keys())))) for subDir in subDirs: create_if_not_exists(subDir) for jobId in inputFileList.keys(): key_file = getKey(sample_name, jobId) self.inputFiles[key_file] = inputFileList[jobId] if len(self.inputFiles[key_file]) == 0: logging.warning( "ntupleFiles['%s'] = %s --> skipping job !!" % (key_file, self.inputFiles[key_file])) continue self.cfgFiles_prodNtuple_modified[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "produceNtuple_%s_%i_cfg.py" % (process_name, jobId)) self.outputFiles[key_file] = os.path.join( self.dirs[key_dir][DKEY_NTUPLES], "%04d" % (jobId // 1000), "tree_%i.root" % jobId) self.logFiles_prodNtuple[key_file] = os.path.join( self.dirs[key_dir][DKEY_LOGS], "produceNtuple_%s_%i.log" % (process_name, jobId)) hlt_paths = sample_info["hlt_paths"] if not is_mc else [] hlt_cuts = list( Triggers(self.era).triggers_flat ) if self.preselection_cuts["applyHLTcut"] else [] jobOptions = { 'inputFiles': self.inputFiles[key_file], 'cfgFile_modified': self.cfgFiles_prodNtuple_modified[key_file], 'outputFile': self.outputFiles[key_file], 'is_mc': is_mc, 'random_seed': jobId, 'process_name': process_name, 'category_name': sample_info["sample_category"], 'triggers': hlt_paths, 'HLTcuts': hlt_cuts, } self.createCfg_prodNtuple(jobOptions) num_jobs = 0 if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable) num_jobs = self.createScript_sbatch() logging.info("Generated %i job(s)" % num_jobs) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_prodNtuple(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done") return num_jobs
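# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original workflow): mimics how create()
# above buckets Ntuple outputs into '%04d' subdirectories, grouping job IDs by
# thousands (jobId // 1000) and naming the outputs 'tree_<jobId>.root'.
def _example_ntuple_output_layout(job_ids):
    import os
    layout = {}
    for job_id in job_ids:
        subdir = '%04d' % (job_id // 1000)  # e.g. job 2345 ends up in subdirectory '0002'
        layout.setdefault(subdir, []).append(os.path.join(subdir, 'tree_%i.root' % job_id))
    return layout
# e.g. _example_ntuple_output_layout([1, 999, 1000]) ->
#   {'0000': ['0000/tree_1.root', '0000/tree_999.root'], '0001': ['0001/tree_1000.root']}
# --------------------------------------------------------------------------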
sample_name=sample_name, )) output_dir_parent = os.path.join(output_dir, sample_name) if not os.path.isdir(output_dir_parent): os.makedirs(output_dir_parent) for sample_subdir_basename in os.listdir(sample_path): sample_subdir = os.path.join(sample_path, sample_subdir_basename) for rootfile_basename in os.listdir(sample_subdir): rootfile = os.path.join(sample_subdir, rootfile_basename) if not os.path.isfile(rootfile): continue logging.debug( "Dumping RLE numbers for file '{rootfile_name}'".format(rootfile_name=rootfile)) rootfile_idx = idx(rootfile_basename) outfile_idx = os.path.join( output_dir_parent, "{i}.txt".format(i=rootfile_idx)) if os.path.isfile(outfile_idx): logging.warning( "Output file '{outfile_name}' already exists; skipping".format(outfile_name=outfile_idx)) continue dump_rle(rootfile, outfile_idx, args.tree, args.run, args.lumi, args.event) logging.debug("Done!")
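# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): one plausible
# implementation of the idx() helper used above to derive the per-file output
# index. It assumes the 'tree_<N>.root' naming convention of the Ntuple
# production step; the actual helper may differ.
def _example_idx(rootfile_basename):
    import re
    match = re.match(r'^tree_(\d+)\.root$', rootfile_basename)
    if not match:
        raise ValueError("Unexpected ROOT file name: %s" % rootfile_basename)
    return int(match.group(1))
# e.g. _example_idx('tree_42.root') -> 42
# --------------------------------------------------------------------------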
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") logging.info("Building dictionaries for sample %s..." % process_name) for lepton_selection in self.lepton_selections: for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue if lepton_frWeight == "disabled" and not lepton_selection in ["Tight"]: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) for lepton_charge_selection in self.lepton_charge_selections: if 'mcClosure' in lepton_selection and lepton_charge_selection != 'SS': # Run MC closure only for the region that complements the SR continue central_or_shift_extensions = ["", "hadd", "addBackgrounds"] central_or_shift_dedicated = self.central_or_shifts if self.runTHweights(sample_info) else self.central_or_shifts_external central_or_shifts_extended = central_or_shift_extensions + central_or_shift_dedicated for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [ process_name, "hadd" ] for process_name_or_dummy in process_name_extended: if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]: continue if central_or_shift_or_dummy not in central_or_shift_extensions and not self.accept_systematics( central_or_shift_or_dummy, is_mc, lepton_selection, lepton_charge_selection, sample_info ): continue key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, lepton_charge_selection, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC ]: if dir_type == DKEY_SYNC and not self.do_sync: continue initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight, lepton_charge_selection ]), process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight, lepton_charge_selection ]), process_name_or_dummy) for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "addBackgroundLeptonFlips", "prepareDatacards", "addSystFakeRates", "makePlots" ]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]: if dir_type == DKEY_SYNC and not self.do_sync: continue initDict(self.dirs, [ dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if
type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0; frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100*numDirectories_created >= frac*numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job) mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights') for lepton_selection in self.lepton_selections: electron_selection = lepton_selection muon_selection = lepton_selection hadTauVeto_selection = "Tight" hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ]) if lepton_selection == "Fakeable_mcClosure_e": electron_selection = "Fakeable" muon_selection = "Tight" elif lepton_selection == "Fakeable_mcClosure_m": electron_selection = "Tight" muon_selection = "Fakeable" for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight" ]: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) for lepton_charge_selection in self.lepton_charge_selections: if 'mcClosure' in lepton_selection and lepton_charge_selection != 'SS': # Run MC closure only for the region that complements the SR continue for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) inputFileList = inputFileLists[sample_name] sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") use_th_weights = self.runTHweights(sample_info) central_or_shift_dedicated = self.central_or_shifts if use_th_weights else self.central_or_shifts_external for central_or_shift in central_or_shift_dedicated: if not self.accept_systematics( central_or_shift, is_mc, lepton_selection, lepton_charge_selection, sample_info ): continue central_or_shifts_local = [] if central_or_shift == "central" and not use_th_weights: for central_or_shift_local in self.central_or_shifts_internal: if self.accept_systematics( central_or_shift_local, is_mc, lepton_selection, lepton_charge_selection, sample_info ): central_or_shifts_local.append(central_or_shift_local) logging.info(" ... 
for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift)) # build config files for executing analysis code key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection, central_or_shift) for jobId in inputFileList.keys(): analyze_job_tuple = (process_name, lepton_selection_and_frWeight, lepton_charge_selection, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job)) continue syncOutput = '' syncTree = '' syncGenMatch = self.lepton_genMatches_nonfakes if self.do_sync: mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight) if lepton_selection_and_frWeight == 'Tight': if lepton_charge_selection == 'SS': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_SR' % self.channel elif lepton_charge_selection == 'OS': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Flip.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_Flip' % self.channel else: continue elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights' and lepton_charge_selection == 'SS': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_Fake' % self.channel elif mcClosure_match and lepton_charge_selection == 'SS': mcClosure_type = mcClosure_match.group('type') syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type)) syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel, mcClosure_type) else: continue if syncTree and central_or_shift != "central": syncTree = os.path.join(central_or_shift, syncTree) syncRLE = '' if self.do_sync and self.rle_select: syncRLE = self.rle_select % syncTree if not os.path.isfile(syncRLE): logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE) continue if syncOutput: self.inputFiles_sync['sync'].append(syncOutput) cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % analyze_job_tuple) rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % analyze_job_tuple) \ if self.select_rle_output else "" histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%s_%i.root" % analyze_job_tuple) self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, 'cfgFile_modified' : cfgFile_modified_path, 'histogramFile' : histogramFile_path, 'logFile' : logFile_path, 'selEventsFileName_output' : rleOutputFile_path, 'electronSelection' : electron_selection, 'muonSelection' : muon_selection, 'apply_leptonGenMatching' : self.apply_leptonGenMatching, 'leptonChargeSelection' : lepton_charge_selection, 'hadTauSelection_veto' : hadTauVeto_selection, 'applyFakeRateWeights' : self.applyFakeRateWeights if not lepton_selection == "Tight" else "disabled", 'central_or_shift' : central_or_shift, 'central_or_shifts_local' : central_or_shifts_local, 'syncOutput' : syncOutput, 'syncTree' : syncTree, 'syncRLE' : syncRLE, 'useNonNominal' : self.use_nonnominal, 'apply_hlt_filter' : self.hlt_filter, 
'syncGenMatch' : syncGenMatch, } self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection) hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight, lepton_charge_selection) key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[key_hadd_stage1_job] = [] self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile']) self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s_%s.root" % hadd_stage1_job_tuple) if self.do_sync: continue if is_mc: logging.info("Creating configuration files to run 'addBackgrounds' for sample %s" % process_name) sample_categories = [ sample_category ] for sample_category in sample_categories: # sum non-fake and fake contributions for each MC sample separately genMatch_categories = [ "nonfake", "Convs", "fake", "flip" ] for genMatch_category in genMatch_categories: key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection) key_addBackgrounds_dir = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection, "addBackgrounds") addBackgrounds_job_tuple = None processes_input = None process_output = None if genMatch_category == "nonfake": # sum non-fake contributions for each MC sample separately # input processes: TT2l0g0j; ... # output processes: TT; ... if sample_category in self.ttHProcs: lepton_genMatches = [] lepton_genMatches.extend(self.lepton_genMatches_nonfakes) lepton_genMatches.extend(self.lepton_genMatches_Convs) processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in lepton_genMatches ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_nonfakes ] process_output = sample_category addBackgrounds_job_tuple = (process_name, sample_category, lepton_selection_and_frWeight, lepton_charge_selection) elif genMatch_category == "Convs": # sum conversion background contributions for each MC sample separately # input processes: TT1l1g0j, TT0l2g0j; ... # output processes: TT_Convs; ... 
if sample_category in self.ttHProcs: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_Convs ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_Convs ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_Convs ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_Convs ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_Convs ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_Convs ] process_output = "%s_Convs" % sample_category addBackgrounds_job_tuple = (process_name, "%s_Convs" % sample_category, lepton_selection_and_frWeight, lepton_charge_selection) elif genMatch_category == "fake": # sum fake contributions for each MC sample separately # input processes: TT1l0g1j, TT0l1g1j, TT0l0g2j; ... # output processes: TT_fake; ... if sample_category in self.ttHProcs: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_fakes ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ] process_output = "%s_fake" % sample_category addBackgrounds_job_tuple = (process_name, "%s_fake" % sample_category, lepton_selection_and_frWeight, lepton_charge_selection) elif genMatch_category == "flip": # sum flip contributions for each MC sample separately # input processes: TT2l2f0g0j&2t0e0m0j, TT2l1f0g0j&2t0e0m0j; ... # output processes: TT_flip; ... 
if sample_category in self.ttHProcs: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_flips ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_flips ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_flips ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_flips ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_flips ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_flips ] process_output = "%s_flip" % sample_category addBackgrounds_job_tuple = (process_name, "%s_flip" % sample_category, lepton_selection_and_frWeight, lepton_charge_selection) if processes_input: logging.info(" ...for genMatch option = '%s'" % genMatch_category) key_addBackgrounds_job = getKey(*addBackgrounds_job_tuple) cfgFile_modified = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_%s_cfg.py" % addBackgrounds_job_tuple) outputFile = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s_%s.root" % addBackgrounds_job_tuple) self.jobOptions_addBackgrounds[key_addBackgrounds_job] = { 'inputFile' : self.outputFile_hadd_stage1[key_hadd_stage1_job], 'cfgFile_modified' : cfgFile_modified, 'outputFile' : outputFile, 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(cfgFile_modified).replace("_cfg.py", ".log")), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ], 'processes_input' : processes_input, 'process_output' : process_output } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds[key_addBackgrounds_job]) # initialize input and output file names for hadd_stage1_5 key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight, lepton_charge_selection) hadd_stage1_5_job_tuple = (lepton_selection_and_frWeight, lepton_charge_selection) key_hadd_stage1_5_job = getKey(*hadd_stage1_5_job_tuple) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.jobOptions_addBackgrounds[key_addBackgrounds_job]['outputFile']) self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5_%s_%s.root" % hadd_stage1_5_job_tuple) # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5 if not is_mc: key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection) key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, lepton_charge_selection) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job]) if self.do_sync: continue # sum fake background contributions for the total of all MC samples # input processes: TT1l0g1j, TT0l1g1j, TT0l0g2j; ... 
# output process: fakes_mc key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, lepton_charge_selection) key_addBackgrounds_dir = getKey("addBackgrounds") addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight, lepton_charge_selection) key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple) sample_categories = [] sample_categories.extend(self.nonfake_backgrounds) sample_categories.extend(self.ttHProcs) processes_input = [] for sample_category in sample_categories: processes_input.append("%s_fake" % sample_category) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_fakes_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_fakes_tuple), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ], 'processes_input' : processes_input, 'process_output' : "fakes_mc" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]) # sum flip background contributions for the total of all MC samples # input processes: TT2l1f0g0j, TT2l2f0g0j; ... # output process: flips_mc addBackgrounds_job_flips_tuple = ("flips_mc", lepton_selection_and_frWeight, lepton_charge_selection) key_addBackgrounds_job_flips = getKey(*addBackgrounds_job_flips_tuple) sample_categories = [] sample_categories.extend(self.nonfake_backgrounds) sample_categories.extend(self.ttHProcs) processes_input = [] for sample_category in sample_categories: processes_input.append("%s_flip" % sample_category) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_flips] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_flips_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_flips_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_flips_tuple), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ], 'processes_input' : processes_input, 'process_output' : "flips_mc" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_flips]) # sum conversion background contributions for the total of all MC samples # input processes: TT1l1g0j, TT0l2g0j; ...
# output process: Convs addBackgrounds_job_Convs_tuple = ("Convs", lepton_selection_and_frWeight, lepton_charge_selection) key_addBackgrounds_job_Convs = getKey(*addBackgrounds_job_Convs_tuple) processes_input = [] for sample_category in self.convs_backgrounds: processes_input.append("%s_Convs" % sample_category) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_Convs_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_Convs_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_Convs_tuple), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ], 'processes_input' : processes_input, 'process_output' : "Convs" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]) # initialize input and output file names for hadd_stage2 key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, lepton_charge_selection) key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight, lepton_charge_selection) hadd_stage2_job_tuple = (lepton_selection_and_frWeight, lepton_charge_selection) key_hadd_stage2_job = getKey(*hadd_stage2_job_tuple) if not key_hadd_stage2_job in self.inputFiles_hadd_stage2: self.inputFiles_hadd_stage2[key_hadd_stage2_job] = [] if lepton_selection == "Tight": self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_flips]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job]) self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s_%s.root" % hadd_stage2_job_tuple) if self.do_sync: if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_syncNtuple(lines_makefile) outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel) self.outputFile_sync['sync'] = outputFile_sync_path self.addToMakefile_hadd_sync(lines_makefile) self.addToMakefile_validate(lines_makefile) self.targets.extend(self.phoniesToAdd) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs logging.info("Creating configuration files to run 'addBackgroundFakes'") for lepton_charge_selection in self.lepton_charge_selections: key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"), lepton_charge_selection) key_addFakes_dir = getKey("addBackgroundLeptonFakes")
addFakes_job_tuple = (lepton_charge_selection, ) key_addFakes_job = getKey("data_fakes", lepton_charge_selection) category_sideband = None if self.applyFakeRateWeights == "2lepton": category_sideband = "ttWctrl_%s_Fakeable_wFakeRateWeights" % lepton_charge_selection else: raise ValueError("Invalid configuration parameter 'applyFakeRateWeights' = %s !!" % self.applyFakeRateWeights) self.jobOptions_addFakes[key_addFakes_job] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addFakes_dir][DKEY_CFGS], "addBackgroundLeptonFakes_%s_cfg.py" % addFakes_job_tuple), 'outputFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_HIST], "addBackgroundLeptonFakes_%s.root" % addFakes_job_tuple), 'logFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_LOGS], "addBackgroundLeptonFakes_%s.log" % addFakes_job_tuple), 'category_signal' : "ttWctrl_%s_Tight" % lepton_charge_selection, 'category_sideband' : category_sideband } self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job]) key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), lepton_charge_selection) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile']) #-------------------------------------------------------------------------- # CV: add histograms in OS and SS regions, # so that "data_fakes" background can be subtracted from OS control region used to estimate charge flip background key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS") key_addFakes_job = getKey("data_fakes", "OS") key_hadd_stage1_6_dir = getKey("hadd", get_lepton_selection_and_frWeight("Tight", "disabled"), "OS") key_hadd_stage1_6_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS") if key_hadd_stage1_6_job not in self.inputFiles_hadd_stage1_6: self.inputFiles_hadd_stage1_6[key_hadd_stage1_6_job] = [] self.inputFiles_hadd_stage1_6[key_hadd_stage1_6_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile']) self.inputFiles_hadd_stage1_6[key_hadd_stage1_6_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job]) self.outputFile_hadd_stage1_6[key_hadd_stage1_6_job] = os.path.join(self.dirs[key_hadd_stage1_6_dir][DKEY_HIST], "hadd_stage1_6_Tight_OS.root") #-------------------------------------------------------------------------- logging.info("Creating configuration files to run 'addBackgroundFlips'") key_addFlips_dir = getKey("addBackgroundLeptonFlips") key_addFlips_job = getKey("data_flips") self.jobOptions_addFlips[key_addFlips_job] = { 'inputFile' : self.outputFile_hadd_stage1_6[key_hadd_stage1_6_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addFlips_dir][DKEY_CFGS], "addBackgroundLeptonFlips_cfg.py"), 'outputFile' : os.path.join(self.dirs[key_addFlips_dir][DKEY_HIST], "addBackgroundLeptonFlips.root"), 'logFile' : os.path.join(self.dirs[key_addFlips_dir][DKEY_LOGS], "addBackgroundLeptonFlips.log"), 'category_signal' : "ttWctrl_SS_Tight", 'category_sideband' : "ttWctrl_OS_Tight" } self.createCfg_addFlips(self.jobOptions_addFlips[key_addFlips_job]) key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS") self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFlips[key_addFlips_job]['outputFile']) logging.info("Creating configuration files to run 'prepareDatacards'") for histogramToFit in self.histograms_to_fit: key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
key_prep_dcard_dir = getKey("prepareDatacards") prep_dcard_job_tuple = (self.channel, histogramToFit) key_prep_dcard_job = getKey(histogramToFit) self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple), 'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple), 'histogramDir' : self.histogramDir_prep_dcard, 'histogramToFit' : histogramToFit, 'label' : None } self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job]) # add shape templates for the following systematic uncertainties: # - 'CMS_ttHl_Clos_norm_e' # - 'CMS_ttHl_Clos_shape_e' # - 'CMS_ttHl_Clos_norm_m' # - 'CMS_ttHl_Clos_shape_m' key_add_syst_fakerate_dir = getKey("addSystFakeRates") add_syst_fakerate_job_tuple = (self.channel, histogramToFit) key_add_syst_fakerate_job = getKey(histogramToFit) self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = { 'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'], 'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_cfg.py" % add_syst_fakerate_job_tuple), 'outputFile' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s.root" % add_syst_fakerate_job_tuple), 'category' : self.channel, 'histogramToFit' : histogramToFit, 'plots_outputFileName' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_PLOT], "addSystFakeRates.png") } histogramDir_nominal = self.histogramDir_prep_dcard for lepton_type in [ 'e', 'm' ]: lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type if lepton_mcClosure not in self.lepton_selections: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled") key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight, 'SS') histogramDir_mcClosure = self.mcClosure_dir['%s_%s' % (lepton_mcClosure, 'SS')] self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({ 'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections, 'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit), 'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'], 'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit) }) self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job]) logging.info("Creating configuration files to run 'makePlots'") key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS") key_makePlots_dir = getKey("makePlots") key_makePlots_job = getKey("SS") self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel), 'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel), 'histogramDir' : self.histogramDir_prep_dcard, 'label' : "t#bar{t}W control region", 'make_plots_backgrounds' : self.make_plots_backgrounds } 
self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel) self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel) self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel) self.sbatchFile_addFlips = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFlips_%s.py" % self.channel) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds) self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds) self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes) self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFlips) self.createScript_sbatch(self.executable_addFlips, self.sbatchFile_addFlips, self.jobOptions_addFlips) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_backgrounds_from_data_withFlips(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_add_syst_fakerate(lines_makefile) self.addToMakefile_make_plots(lines_makefile) self.addToMakefile_validate(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs
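# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plausible stand-in for
# the getKey() helper used throughout create() to build dictionary keys from a
# variable number of name components. The underscore separator is an assumption;
# only the fact that equal argument tuples map to equal keys matters above.
def _example_getKey(*parts):
    return '_'.join(str(part) for part in parts if str(part) != '')
# e.g. _example_getKey('TTW', 'Tight', 'SS', 'central') -> 'TTW_Tight_SS_central'
# --------------------------------------------------------------------------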
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] for lepton_selection in self.lepton_selections: for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) central_or_shifts_extended = [ "" ] central_or_shifts_extended.extend(self.central_or_shifts) central_or_shifts_extended.extend([ "hadd", "addBackgrounds" ]) for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [ process_name, "hadd" ] for process_name_or_dummy in process_name_extended: key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_RLES, DKEY_SYNC ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy) for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]: initDict(self.dirs, [ dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0; frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100*numDirectories_created >= frac*numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job) mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights') for lepton_selection in self.lepton_selections: electron_selection = lepton_selection muon_selection = lepton_selection hadTauVeto_selection = "Tight" hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ]) if lepton_selection == "Fakeable_mcClosure_e": electron_selection = "Fakeable" muon_selection = "Tight" elif lepton_selection == "Fakeable_mcClosure_m": electron_selection = "Tight" muon_selection = "Fakeable" for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight" ]: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") is_signal = (sample_category == "signal") for central_or_shift in self.central_or_shifts: inputFileList = inputFileLists[sample_name] for jobId in inputFileList.keys(): if central_or_shift != "central": isFR_shape_shift = (central_or_shift in systematics.FR_all) if not ((lepton_selection == "Fakeable" and isFR_shape_shift) or lepton_selection == "Tight"): continue if not is_mc and not isFR_shape_shift: continue if central_or_shift in systematics.LHE().ttH and sample_category != "signal": continue if central_or_shift in systematics.LHE().ttW and sample_category != "TTW": continue if central_or_shift in systematics.LHE().ttZ and sample_category != "TTZ": continue if central_or_shift in systematics.DYMCReweighting and not is_dymc_reweighting(sample_name): continue logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift)) # build config files for executing analysis code key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, central_or_shift) analyze_job_tuple = (process_name, lepton_selection_and_frWeight, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning("No input ntuples for %s --> skipping job !!" 
% (key_analyze_job)) continue syncOutput = '' syncTree = '' syncRequireGenMatching = True if self.do_sync: mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight) if lepton_selection_and_frWeight == 'Tight': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_SR' % self.channel syncRequireGenMatching = True elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_Fake' % self.channel elif mcClosure_match: mcClosure_type = mcClosure_match.group('type') syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type)) syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel, mcClosure_type) else: continue if syncTree and central_or_shift != "central": syncTree = os.path.join(central_or_shift, syncTree) syncRLE = '' if self.do_sync and self.rle_select: syncRLE = self.rle_select % syncTree if not os.path.isfile(syncRLE): logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE) continue if syncOutput: self.inputFiles_sync['sync'].append(syncOutput) cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple) rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \ if self.select_rle_output else "" histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple) self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, 'cfgFile_modified' : cfgFile_modified_path, 'histogramFile' : histogramFile_path, 'logFile' : logFile_path, 'selEventsFileName_output' : rleOutputFile_path, 'electronSelection' : electron_selection, 'muonSelection' : muon_selection, 'apply_leptonGenMatching' : self.apply_leptonGenMatching, 'hadTauSelection_veto' : hadTauVeto_selection, 'applyFakeRateWeights' : self.applyFakeRateWeights if not lepton_selection == "Tight" else "disabled", 'central_or_shift' : central_or_shift, 'syncOutput' : syncOutput, 'syncTree' : syncTree, 'syncRLE' : syncRLE, 'syncRequireGenMatching' : syncRequireGenMatching, 'useNonNominal' : self.use_nonnominal, 'apply_hlt_filter' : self.hlt_filter, } self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight) hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight) key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[key_hadd_stage1_job] = [] self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile']) self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple) if self.do_sync: continue if is_mc: logging.info("Creating configuration files to run 'addBackgrounds' for sample %s" % process_name) sample_categories = [ sample_category ] if is_signal: sample_categories = [ "signal", "ttH", 
"ttH_htt", "ttH_hww", "ttH_hzz", "ttH_hmm", "ttH_hzg" ] for sample_category in sample_categories: # sum non-fake and fake contributions for each MC sample separately genMatch_categories = [ "nonfake", "conversions", "fake" ] for genMatch_category in genMatch_categories: key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight) key_addBackgrounds_dir = getKey(process_name, lepton_selection_and_frWeight, "addBackgrounds") addBackgrounds_job_tuple = None processes_input = None process_output = None if genMatch_category == "nonfake": # sum non-fake contributions for each MC sample separately # input processes: TT3l0g0j,... # output processes: TT; ... if sample_category in [ "signal" ]: lepton_genMatches = [] lepton_genMatches.extend(self.lepton_genMatches_nonfakes) lepton_genMatches.extend(self.lepton_genMatches_conversions) lepton_genMatches.extend(self.lepton_genMatches_fakes) processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in lepton_genMatches ] elif sample_category in [ "ttH" ]: lepton_genMatches = [] lepton_genMatches.extend(self.lepton_genMatches_nonfakes) lepton_genMatches.extend(self.lepton_genMatches_conversions) processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in lepton_genMatches ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_nonfakes ] process_output = sample_category addBackgrounds_job_tuple = (process_name, sample_category, lepton_selection_and_frWeight) elif genMatch_category == "conversions": # sum fake contributions for each MC sample separately # input processes: TT2l1g0j, TT1l2g0j, TT0l3g0j; ... # output processes: TT_conversion; ... if sample_category in [ "signal" ]: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ] elif sample_category in [ "ttH" ]: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_conversions ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ] process_output = "%s_conversion" % sample_category addBackgrounds_job_tuple = (process_name, "%s_conversion" % sample_category, lepton_selection_and_frWeight) elif genMatch_category == "fake": # sum fake contributions for each MC sample separately # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l2g1j, TT0l1g2j, TT0l0g3j; ... # output processes: TT_fake; ... 
if sample_category in [ "signal" ]: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ] elif sample_category in [ "ttH" ]: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_fakes ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ] process_output = "%s_fake" % sample_category addBackgrounds_job_tuple = (process_name, "%s_fake" % sample_category, lepton_selection_and_frWeight) if processes_input: logging.info(" ...for genMatch option = '%s'" % genMatch_category) key_addBackgrounds_job = getKey(*addBackgrounds_job_tuple) cfgFile_modified = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_tuple) outputFile = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_tuple) self.jobOptions_addBackgrounds[key_addBackgrounds_job] = { 'inputFile' : self.outputFile_hadd_stage1[key_hadd_stage1_job], 'cfgFile_modified' : cfgFile_modified, 'outputFile' : outputFile, 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(cfgFile_modified).replace("_cfg.py", ".log")), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ], 'processes_input' : processes_input, 'process_output' : process_output } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds[key_addBackgrounds_job]) # initialize input and output file names for hadd_stage1_5 key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight) key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.jobOptions_addBackgrounds[key_addBackgrounds_job]['outputFile']) self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5_%s.root" % lepton_selection_and_frWeight) # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5 if not is_mc: key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight) key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job]) if self.do_sync: continue # sum fake background contributions for the total of all MC sample # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l3j, TT0l3j, TT0l3j, TT0l3j; ... 
      # output process: fakes_mc
      key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
      key_addBackgrounds_dir = getKey("addBackgrounds")
      addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight)
      key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple)
      sample_categories = []
      sample_categories.extend(self.nonfake_backgrounds)
      sample_categories.extend([ "signal" ])
      processes_input = []
      for sample_category in sample_categories:
        processes_input.append("%s_fake" % sample_category)
      self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = {
        'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple),
        'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_fakes_tuple),
        'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_fakes_tuple),
        'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
        'processes_input' : processes_input,
        'process_output' : "fakes_mc"
      }
      self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes])

      # sum conversion background contributions for the total of all MC samples
      # input processes: TT2l1g0j, TT1l2g0j, TT0l3g0j; ...
      # output process: conversions
      addBackgrounds_job_conversions_tuple = ("conversions", lepton_selection_and_frWeight)
      key_addBackgrounds_job_conversions = getKey(*addBackgrounds_job_conversions_tuple)
      sample_categories = []
      sample_categories.extend(self.nonfake_backgrounds)
      sample_categories.extend([ "signal" ])
      processes_input = []
      for sample_category in sample_categories:
        processes_input.append("%s_conversion" % sample_category)
      self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions] = {
        'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_conversions_tuple),
        'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_conversions_tuple),
        'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_conversions_tuple),
        'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
        'processes_input' : processes_input,
        'process_output' : "conversions"
      }
      self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions])

      # initialize input and output file names for hadd_stage2
      key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
      key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight)
      key_hadd_stage2_job = getKey(lepton_selection_and_frWeight)
      if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
        self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
      if lepton_selection == "Tight":
        self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'])
        self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions]['outputFile'])
      self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
      self.outputFile_hadd_stage2[key_hadd_stage2_job] = \
        os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s.root" % lepton_selection_and_frWeight)

  if self.do_sync:
    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
      self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_syncNtuple(lines_makefile)
    outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel)
    self.outputFile_sync['sync'] = outputFile_sync_path
    self.targets.append(outputFile_sync_path)
    self.addToMakefile_hadd_sync(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done.")
    return self.num_jobs

  logging.info("Creating configuration files to run 'addBackgroundFakes'")
  key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"))
  key_addFakes_job = getKey("fakes_data")
  category_sideband = "ttZctrl_Fakeable_wFakeRateWeights"
  self.jobOptions_addFakes[key_addFakes_job] = {
    'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
    'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS], "addBackgroundLeptonFakes_cfg.py"),
    'outputFile' : os.path.join(self.dirs[DKEY_HIST], "addBackgroundLeptonFakes.root"),
    'logFile' : os.path.join(self.dirs[DKEY_LOGS], "addBackgroundLeptonFakes.log"),
    'category_signal' : "ttZctrl_Tight",
    'category_sideband' : category_sideband
  }
  self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job])
  key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
  self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])

  logging.info("Creating configuration files to run 'prepareDatacards'")
  for histogramToFit in self.histograms_to_fit:
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
    key_prep_dcard_dir = getKey("prepareDatacards")
    prep_dcard_job_tuple = (self.channel, histogramToFit)
    key_prep_dcard_job = getKey(histogramToFit)
    self.jobOptions_prep_dcard[key_prep_dcard_job] = {
      'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple),
      'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple),
      'histogramDir' : self.histogramDir_prep_dcard,
      'histogramToFit' : histogramToFit,
      'label' : None
    }
    self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

    # add shape templates for the following systematic uncertainties:
    #  - 'CMS_ttHl_Clos_norm_e'
    #  - 'CMS_ttHl_Clos_shape_e'
    #  - 'CMS_ttHl_Clos_norm_m'
    #  - 'CMS_ttHl_Clos_shape_m'
    key_prep_dcard_job = getKey(histogramToFit)
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
    key_add_syst_fakerate_dir = getKey("addSystFakeRates")
    add_syst_fakerate_job_tuple = (self.channel, histogramToFit)
    key_add_syst_fakerate_job = getKey(histogramToFit)
    self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = {
      'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'],
      'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_cfg.py" % add_syst_fakerate_job_tuple),
      'outputFile' :
        os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s.root" % add_syst_fakerate_job_tuple),
      'category' : self.channel,
      'histogramToFit' : histogramToFit,
      'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "addSystFakeRates.png")
    }
    histogramDir_nominal = self.histogramDir_prep_dcard
    for lepton_type in [ 'e', 'm' ]:
      lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type
      if lepton_mcClosure not in self.lepton_selections:
        continue
      lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled")
      key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight)
      histogramDir_mcClosure = self.mcClosure_dir[lepton_mcClosure]
      self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({
        'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections,
        'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit),
        'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'],
        'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit)
      })
    self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job])

  logging.info("Creating configuration files to run 'makePlots'")
  key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
  key_makePlots_dir = getKey("makePlots")
  key_makePlots_job = getKey('')
  self.jobOptions_make_plots[key_makePlots_job] = {
    'executable' : self.executable_make_plots,
    'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
    'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
    'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
    'histogramDir' : self.histogramDir_prep_dcard,
    'label' : "t#bar{t}Z control region",
    'make_plots_backgrounds' : self.make_plots_backgrounds
  }
  self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])

  if self.is_sbatch:
    logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
    self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
    logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds)
    self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel)
    self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds)
    self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel)
    self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
    logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes)
    self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel)
    self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes)

  logging.info("Creating Makefile")
  lines_makefile = []
  self.addToMakefile_analyze(lines_makefile)
  self.addToMakefile_hadd_stage1(lines_makefile)
  self.addToMakefile_backgrounds_from_data(lines_makefile)
  self.addToMakefile_hadd_stage2(lines_makefile)
  self.addToMakefile_prep_dcard(lines_makefile)
  self.addToMakefile_add_syst_fakerate(lines_makefile)
  self.addToMakefile_make_plots(lines_makefile)
  self.createMakefile(lines_makefile)

  logging.info("Done.")
  return self.num_jobs
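
# Illustration (an aside, not part of the class above): every hadd stage in
# create() follows the same bookkeeping pattern -- accumulate input files per
# stage key, then derive a single output file name per key. Below is a
# minimal, self-contained sketch of that pattern with made-up names; nothing
# here is the framework's actual API:
import os

inputFiles_hadd_stage2_example = {}
outputFile_hadd_stage2_example = {}

def add_hadd_input_example(key, input_file, hist_dir):
  # same effect as: if not key in d: d[key] = [] ; d[key].append(input_file)
  inputFiles_hadd_stage2_example.setdefault(key, []).append(input_file)
  outputFile_hadd_stage2_example[key] = os.path.join(hist_dir, "hadd_stage2_%s.root" % key)

add_hadd_input_example("Tight_disabled", "addBackgrounds_fakes_mc.root", "/tmp/hist")
add_hadd_input_example("Tight_disabled", "addBackgrounds_conversions.root", "/tmp/hist")
# inputFiles_hadd_stage2_example now maps 'Tight_disabled' to both ROOT files;
# outputFile_hadd_stage2_example maps it to '/tmp/hist/hadd_stage2_Tight_disabled.root'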
process_name = args.process
category_name = args.category
binning_choice = args.binning
genweight_ref = args.gen_weight
use_genweight = genweight_ref != 0.

if args.verbose:
  logging.getLogger().setLevel(logging.DEBUG)

if not os.path.isfile(input_fn):
  raise ValueError("No such file: %s" % input_fn)

output_dir = os.path.dirname(output_fn)
if not os.path.isdir(output_dir):
  logging.warning("Creating directory: {}".format(output_dir))
  os.makedirs(output_dir)

input_file = ROOT.TFile.Open(input_fn, 'read')
assert(input_file)
input_tree = input_file.Get('Events')

denominator_process = Hist2D(BINNING_MHH[binning_choice], BINNING_COSTHETASTAR, name=process_name)
denominator_category = Hist2D(BINNING_MHH[binning_choice], BINNING_COSTHETASTAR, name=category_name)

nof_events = input_tree.GetEntries()
logging.debug("Input file {} has {} events".format(input_fn, nof_events))
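
# Sketch of the event loop that would typically fill such 2D denominators,
# written with plain ROOT (TH2F) so that it is self-contained. The branch
# names 'mHH' and 'cosThetaStar', the binning, and the genWeight handling are
# assumptions made for illustration -- they are not taken from this script:
import ROOT

def fill_denominator_sketch(tree, genweight_ref=0.):
  use_genweight = genweight_ref != 0.
  hist = ROOT.TH2F("denominator", "denominator", 36, 250., 1050., 4, -1., 1.)
  for event in tree:
    # normalize to the reference generator weight if one is given
    weight = (event.genWeight / genweight_ref) if use_genweight else 1.
    hist.Fill(event.mHH, event.cosThetaStar, weight)
  return hist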
  raise ValueError("Invalid mode: %s" % mode)

for sample_name, sample_info in samples.items():
  if sample_name == 'sum_events':
    continue
  if sample_name.startswith(("/DoubleEG/", "/DoubleMuon/", "/MuonEG/")):
    sample_info["use_it"] = False
  elif sample_name.startswith("/Tau/"):
    sample_info["use_it"] = True

if __name__ == '__main__':
  logging.info(
    "Running the jobs with the following systematic uncertainties enabled: %s" %
    ', '.join(central_or_shifts)
  )
  if not use_preselected:
    logging.warning('Running the analysis on fully inclusive samples!')

  if sample_filter:
    samples = filter_samples(samples, sample_filter)

  if args.tau_id_wp:
    logging.info("Changing tau ID working point: %s -> %s" % (hadTau_selection, args.tau_id_wp))
    hadTau_selection = args.tau_id_wp

  analysis = analyzeConfig_hh_1l_3tau(
    configDir=os.path.join("/scratch-persistent", getpass.getuser(), "hhAnalysis", era, version),
    localDir=os.path.join("/home", getpass.getuser(), "hhAnalysis", era, version),
    outputDir=os.path.join("/hdfs/local", getpass.getuser(), "hhAnalysis",
def create(self):
  """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
  """
  for sample_name, sample_info in self.samples.items():
    if not sample_info["use_it"]:
      continue
    process_name = sample_info["process_name_specific"]
    key_dir = getKey(process_name)
    for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]:
      initDict(self.dirs, [ key_dir, dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
        self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, process_name)
      else:
        self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name)
  for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
    initDict(self.dirs, [ dir_type ])
    if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
      self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
    else:
      self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

  numDirectories = 0
  for key in self.dirs.keys():
    if type(self.dirs[key]) == dict:
      numDirectories += len(self.dirs[key])
    else:
      numDirectories += 1
  logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
  numDirectories_created = 0
  frac = 1
  for key in self.dirs.keys():
    if type(self.dirs[key]) == dict:
      for dir_type in self.dirs[key].keys():
        create_if_not_exists(self.dirs[key][dir_type])
      numDirectories_created += len(self.dirs[key])
    else:
      create_if_not_exists(self.dirs[key])
      numDirectories_created = numDirectories_created + 1
    while 100 * numDirectories_created >= frac * numDirectories:
      logging.info(" %i%% completed" % frac)
      frac = frac + 1
  logging.info("Done.")

  inputFileLists = {}
  for sample_name, sample_info in self.samples.items():
    if not sample_info["use_it"]:
      continue
    logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
    inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

  for sample_name, sample_info in self.samples.items():
    if not sample_info["use_it"]:
      continue
    process_name = sample_info["process_name_specific"]
    logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
    inputFileList = inputFileLists[sample_name]
    for jobId in inputFileList.keys():
      # build config files for executing analysis code
      key_analyze_dir = getKey(process_name)
      analyze_job_tuple = (process_name, jobId)
      key_analyze_job = getKey(*analyze_job_tuple)
      ntupleFiles = inputFileList[jobId]
      if len(ntupleFiles) == 0:
        logging.warning("No input ntuples for %s --> skipping job !!"
                        % key_analyze_job)
        continue

      cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%i_cfg.py" % analyze_job_tuple)
      logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%i.log" % analyze_job_tuple)
      histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%i.root" % analyze_job_tuple)

      self.jobOptions_analyze[key_analyze_job] = {
        'ntupleFiles' : ntupleFiles,
        'cfgFile_modified' : cfgFile_modified_path,
        'histogramFile' : histogramFile_path,
        'histogramDir' : 'analyze_hadTopTagger',
        'logFile' : logFile_path,
        'hadTauSelection' : self.hadTau_selection,
        'lumiScale' : 1.,
        'selectBDT' : True,
      }
      self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)

      # initialize input and output file names for hadd_stage1
      key_hadd_stage1_dir = getKey(process_name)
      key_hadd_stage1_job = getKey(process_name)
      if not key_hadd_stage1_job in self.inputFiles_hadd_stage1:
        self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
      self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
      self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s.root" % process_name)
      self.targets.append(self.outputFile_hadd_stage1[key_hadd_stage1_job])

  self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
  if self.is_sbatch:
    logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
    self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)

  logging.info("Creating Makefile")
  lines_makefile = []
  self.addToMakefile_analyze(lines_makefile)
  self.addToMakefile_hadd_stage1(lines_makefile)
  self.createMakefile(lines_makefile)

  logging.info("Done.")
  return self.num_jobs
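
# For reference: the loops above assume generateInputFileList() returns a dict
# that maps a job ID to a chunk of at most max_files_per_job ntuple paths. A
# minimal stand-in with that shape is sketched below (the chunking itself is
# illustrative, not the framework's implementation):
def chunk_input_files_example(file_names, max_files_per_job):
  input_file_list = {}
  for idx in range(0, len(file_names), max_files_per_job):
    input_file_list[idx // max_files_per_job + 1] = file_names[idx:idx + max_files_per_job]
  return input_file_list

# chunk_input_files_example(["a.root", "b.root", "c.root"], 2)
#   --> {1: ['a.root', 'b.root'], 2: ['c.root']}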
def create(self):
  """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
  """
  for key in self.dirs.keys():
    if type(self.dirs[key]) == dict:
      for dir_type in self.dirs[key].keys():
        create_if_not_exists(self.dirs[key][dir_type])
    else:
      create_if_not_exists(self.dirs[key])

  self.inputFileIds = {}
  for sample_name, sample_info in self.samples.items():
    if not sample_info['use_it']:
      continue
    process_name = sample_info["process_name_specific"]
    is_mc = (sample_info["type"] == "mc")
    if not is_mc:
      continue

    logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable, process_name))

    inputFileList = generateInputFileList(sample_info, self.max_files_per_job)
    key_dir = getKey(process_name)

    outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO], "%s.root" % process_name)
    self.outputFiles[process_name] = {
      'inputFiles' : [],
      'outputFile' : outputFile,
    }
    if os.path.isfile(outputFile) and tools_is_file_ok(outputFile, min_file_size=2000):
      logging.info('File {} already exists --> skipping job'.format(outputFile))
      continue

    for jobId in inputFileList.keys():
      key_file = getKey(sample_name, jobId)

      self.inputFiles[key_file] = inputFileList[jobId]
      if len(self.inputFiles[key_file]) == 0:
        logging.warning("'%s' = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]))
        continue

      self.cfgFiles_projection[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "project_%s_%i_cfg.txt" % (process_name, jobId))
      self.outputFiles_tmp[key_file] = os.path.join(self.dirs[key_dir][DKEY_HISTO_TMP], "histogram_%i.root" % jobId)
      self.logFiles_projection[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "project_%s_%i.log" % (process_name, jobId))
      self.scriptFiles_projection[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "project_%s_%i_cfg.sh" % (process_name, jobId))

      projection_module = self.projection_module
      if projection_module == "count":
        projection_module = "countHistogramAll"
        if sample_name.startswith('/TTTo'):
          projection_module += "CompTopRwgt"
        elif sample_info['sample_category'].startswith('ttH'):
          projection_module += "CompHTXS"
        elif isSplitByNlheJet(process_name):
          projection_module += "SplitByLHENjet"
        elif isSplitByNlheHT(process_name):
          projection_module += "SplitByLHEHT"
        elif isSplitByNlheJetHT(process_name, sample_name):
          projection_module += "SplitByLHENjetHT"

      self.jobOptions_sbatch[key_file] = {
        'histName' : process_name,
        'inputFiles' : self.inputFiles[key_file],
        'cfgFile_path' : self.cfgFiles_projection[key_file],
        'outputFile' : self.outputFiles_tmp[key_file],
        'logFile' : self.logFiles_projection[key_file],
        'scriptFile' : self.scriptFiles_projection[key_file],
        'projection_module' : projection_module,
      }
      if self.projection_module != 'puHist':
        # check for the reference LHE weight before accessing it, so that a
        # missing entry produces a clear error instead of a KeyError
        if process_name not in self.ref_genWeights:
          raise RuntimeError("Unable to find reference LHE weight for process %s" % process_name)
        self.jobOptions_sbatch[key_file]['ref_genWeight'] = self.ref_genWeights[process_name]
      self.createCfg_project(self.jobOptions_sbatch[key_file])
      self.outputFiles[process_name]['inputFiles'].append(self.outputFiles_tmp[key_file])

  if self.is_sbatch:
    logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable)
    self.num_jobs['project'] += self.createScript_sbatch(self.executable, self.sbatchFile_projection, self.jobOptions_sbatch)

  logging.info("Creating Makefile")
  lines_makefile = []
  self.addToMakefile_project(lines_makefile)
  self.addToMakefile_hadd(lines_makefile)
  if self.plot:
    self.addToMakefile_plot(lines_makefile)
  self.addToMakefile_finalHadd(lines_makefile)
  self.createMakefile(lines_makefile)

  logging.info("Done")

  return self.num_jobs
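
# The "skip the job if a healthy output already exists" check above relies on
# tools_is_file_ok(); a plausible minimal stand-in is sketched below. This is
# an assumption -- the real helper may perform additional checks, such as
# opening the ROOT file to verify its integrity:
import os

def is_file_ok_example(path, min_file_size=2000):
  # treat a file as usable if it exists and is not suspiciously small
  return os.path.isfile(path) and os.path.getsize(path) >= min_file_size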
def run_brilcalc(hlt_paths_in, json, normtag, units, brilcalc_path, data_file, output_dir):
  assert(all(map(lambda hlt_path: hlt_path.startswith('HLT'), hlt_paths_in)))
  hlt_paths = { hlt_path : hlt_version(hlt_path) for hlt_path in hlt_paths_in }

  for input_file in (json, normtag):
    if input_file and not os.path.isfile(input_file):
      raise ValueError("No such file: %s" % input_file)
  if not os.path.isfile(brilcalc_path):
    raise ValueError("No such file: %s" % brilcalc_path)

  if data_file:
    data = parse_data(data_file)
    if data['normtag'] != os.path.basename(normtag):
      logging.warning("File {} is generated with normtag '{}' but requested using normtag '{}'".format(data_file, data['normtag'], normtag))
    if data['json'] != os.path.basename(json):
      logging.warning("File {} is generated with JSON '{}' but requested using JSON '{}'".format(data_file, data['json'], json))
  else:
    data = None

  if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

  # prepare the jobs
  pool_size = 16
  pool = multiprocessing.Pool(pool_size, handle_worker)
  logging.debug("Constructing pool for {} HLT paths".format(len(hlt_paths)))
  for hlt_path in hlt_paths:
    pool.apply_async(
      process_hlt,
      args=(hlt_paths[hlt_path], json, brilcalc_path, normtag, units, output_dir),
      callback=get_trigger_results,
    )
  pool.close()
  pool.join()
  logging.debug("Pool finished")

  # parse trigger_results
  for hlt_path in hlt_paths:
    dict_entry = trigger_results[hlt_paths[hlt_path]]
    if data_file:
      present_eras = []
      for run in dict_entry['runs']:
        for era in data['runs']:
          if data['runs'][era]['run_start'] <= run <= data['runs'][era]['run_end'] and era not in present_eras:
            present_eras.append(era)
      all_eras = [ era for era in data['runs'] ]
      missing_eras = list(sorted(set(all_eras) - set(present_eras)))

      expected_recording = data['totrecorded']
      expected_delivery = data['totdelivered']
      data_units = data['units']
      unit_factor = 1000**(LUMI_UNITS.index(units) - LUMI_UNITS.index(data_units))
      expected_recording *= unit_factor
      expected_delivery *= unit_factor

      prescale_recording = (expected_recording / dict_entry['recorded']) if dict_entry['recorded'] != 0. else -1.
      prescale_delivery = (expected_delivery / dict_entry['delivered']) if dict_entry['delivered'] != 0. else -1.
      if int(prescale_recording) == 1:
        prescale_msg = "NOT prescaled"
      elif int(prescale_recording) == -1:
        prescale_msg = "NOT recording anything?"
      else:
        prescale_msg = "prescale factor %.1f (%.1f from delivery)" % (prescale_recording, prescale_delivery)
      prescale_msg += " (expected %.1f recorded, %.1f delivered; units = %s)" % (expected_recording, expected_delivery, units)

    print("{} nrun = {} totdelivered = {} totrecorded = {} (units = {})".format(
      hlt_path, len(dict_entry['runs']), dict_entry['delivered'], dict_entry['recorded'], units,
    ))
    if data_file:
      print("{} present in eras: {} (missing in {} eras) => {}".format(
        hlt_path,
        ", ".join(present_eras),
        ", ".join(missing_eras) if missing_eras else "none of the",
        prescale_msg,
      ))
    for hlt_dict in dict_entry['paths']:
      print("\t{} nfill = {} nrun = {} ncms = {} totdelivered = {} totrecorded = {}".format(
        hlt_dict['hltpath'],
        hlt_dict['nfill'],
        hlt_dict['nrun'],
        hlt_dict['ncms'],
        hlt_dict['totdelivered'],
        hlt_dict['totrecorded'],
      ))
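
# Hypothetical invocation of run_brilcalc(); every path, trigger name, and
# unit string below is a placeholder chosen for illustration, not a value
# taken from this repository:
if __name__ == '__main__':
  run_brilcalc(
    hlt_paths_in=["HLT_IsoMu24_v"],
    json="goldenJSON.txt",              # certified-runs JSON (placeholder)
    normtag="normtag_PHYSICS.json",     # placeholder normtag file
    units="/fb",
    brilcalc_path="/usr/local/bin/brilcalc",
    data_file=None,                     # skip the per-era cross-check
    output_dir="brilcalc_out",
  )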