Example 1
import logging

def generate_sbatch_lines(
    executable,
    command_line_parameters,
    input_file_names,
    output_file_names,
    script_file_names,
    log_file_names,
    working_dir,
    max_num_jobs,
    cvmfs_error_log=None,
    pool_id='',
    cmssw_base_dir=None,
    verbose=False,
    job_template_file='sbatch-node.sh.template',
    dry_run=False,
    validate_outputs=True,
    min_file_size=20000,
    use_home=True,
):
    if not pool_id:
        raise ValueError('pool_id is empty')
    lines_sbatch = [
        "from tthAnalysis.HiggsToTauTau.sbatchManager import sbatchManager",
        "",
        "m = sbatchManager('%s', verbose = %s, dry_run = %s, use_home = %s, min_file_size = %d)" % \
          (pool_id, verbose, dry_run, use_home, min_file_size),
        "m.setWorkingDir('%s')"     % working_dir,
        "m.setcmssw_base_dir('%s')" % cmssw_base_dir,
        "m.log_completion = %s"     % verbose,
    ]

    num_jobs = 0
    num_jobs_skipped = 0
    for key_file, command_line_parameter in command_line_parameters.items():
        log_file_name = log_file_names[key_file] if log_file_names else None
        # Stop generating submission lines once the job limit has been reached
        if max_num_jobs > 0 and num_jobs >= max_num_jobs:
            num_jobs_skipped += 1
            continue
        sbatch_line = generate_sbatch_line(
            executable=executable,
            command_line_parameter=command_line_parameter,
            input_file_names=input_file_names[key_file],
            output_file_name=output_file_names[key_file],
            script_file_name=script_file_names[key_file],
            log_file_name=log_file_name,
            cvmfs_error_log=cvmfs_error_log,
            job_template_file=job_template_file,
            validate_outputs=validate_outputs,
            min_file_size=min_file_size,
        )
        if sbatch_line:
            lines_sbatch.append(sbatch_line)
            num_jobs += 1
    if num_jobs_skipped > 0:
        logging.warning(
            "number of jobs = %i exceeds limit of %i --> skipping submission of %i jobs !!" %
            (num_jobs + num_jobs_skipped, max_num_jobs, num_jobs_skipped)
        )

    lines_sbatch.append("m.waitForJobs()")
    return lines_sbatch, num_jobs
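
The returned lines form the body of a small Python driver script. A minimal usage sketch with made-up single-job inputs; all file names, the executable, and the driver file name below are hypothetical, not part of the snippet:

import os
import uuid

# Hypothetical inputs: each dict maps a job key to that job's files/parameters.
cmd_params   = { 'job1' : '--mode central' }
input_files  = { 'job1' : ['in_1.root'] }
output_files = { 'job1' : 'out_1.root' }
script_files = { 'job1' : 'job_1.sh' }
log_files    = { 'job1' : 'job_1.log' }

lines, num_jobs = generate_sbatch_lines(
    executable              = 'analyze_jetToTauFakeRate',  # hypothetical executable
    command_line_parameters = cmd_params,
    input_file_names        = input_files,
    output_file_names       = output_files,
    script_file_names       = script_files,
    log_file_names          = log_files,
    working_dir             = os.getcwd(),
    max_num_jobs            = 5000,
    pool_id                 = str(uuid.uuid4()),
)
# Write the generated driver script; executing it submits the jobs and waits.
with open('sbatch_driver.py', 'w') as driver:
    driver.write('\n'.join(lines) + '\n')
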
Example 2
    def submit(self, cmd_str):
        nof_max_retries = 10
        current_retry = 0
        job_id = None
        while current_retry < nof_max_retries:
            # Run the submission command, capturing both stdout and stderr
            cmd_outerr = run_cmd(cmd_str, return_stderr=True)
            try:
                job_id = cmd_outerr[0].split()[-1]
                break
            except IndexError:
                # Happens when the stdout returned by the command is empty
                logging.warning(
                    "Caught an error: '%s'; resubmitting for the %i-th time" %
                    (cmd_outerr[1], current_retry + 1))
                current_retry += 1
                # Wait for 60 seconds until the next resubmission attempt
                logging.debug("Sleeping for 60 seconds.")
                time.sleep(60)
        if job_id is None:
            raise RuntimeError(
                "Failed to submit job after %i attempts: %s" % (nof_max_retries, cmd_str))

        # The job ID must be a number, so we have to check whether it really is one
        try:
            int(job_id)
        except ValueError:
            raise ValueError(
                "job_id = '%s' NaN; sbatch stdout = '%s'; sbatch stderr = '%s'" %
                (job_id, cmd_outerr[0], cmd_outerr[1]))
        if job_id in self.submittedJobs:
            raise RuntimeError("Same job ID: %s" % job_id)
        # The job ID is valid
        return job_id
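
submit() relies on a run_cmd helper that returns a (stdout, stderr) pair when called with return_stderr=True. A minimal sketch of such a helper built on subprocess; the project's actual helper may differ in logging and error handling:

import subprocess

def run_cmd(cmd_str, return_stderr=False):
    # Minimal sketch of the helper assumed by submit(); runs the command in a
    # shell and returns its decoded stdout (and stderr, if requested).
    proc = subprocess.Popen(
        cmd_str, shell=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stdout, stderr = proc.communicate()
    if return_stderr:
        return stdout, stderr
    return stdout
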
Example 3
import logging
import sys

def check_submission_cmd(submission_out, submission_cmd, throw=False):
    # Compare the current submission command to the one recorded by a previous
    # run, ignoring the ' -A' and ' -E' flags in the comparison
    earlier_submission_out = find_earlier_version(submission_out)
    current_submission = ' '.join(submission_cmd) if submission_cmd else str(submission_cmd)
    current_submission_stripped = current_submission.replace(' -A', '').replace(' -E', '')
    if earlier_submission_out:
        with open(earlier_submission_out, 'r') as earlier_submission_file:
            lines = [ line.rstrip('\n') for line in earlier_submission_file ]
        assert len(lines) == 1
        previous_submission = lines[0]
        previous_submission_stripped = previous_submission.replace(' -A', '').replace(' -E', '')
        if previous_submission_stripped != current_submission_stripped:
            logging.warning(
                "Current command ('{}') does not match the previously run command ('{}')".format(
                    current_submission, previous_submission))
            if throw:
                sys.exit(1)
            do_run = query_yes_no("Sure you want to resubmit with a different command?", default="no")
            if not do_run:
                logging.info('Exiting')
                sys.exit(0)
    with open(submission_out, 'w') as submission_out_file:
        submission_out_file.write('{}\n'.format(current_submission))
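
check_submission_cmd() also depends on a query_yes_no prompt. A minimal sketch of one possible implementation, assuming the default/answer semantics implied by the call above; the project's own helper may differ:

def query_yes_no(question, default="yes"):
    # Minimal sketch of the interactive yes/no prompt assumed above.
    valid = { "yes" : True, "y" : True, "no" : False, "n" : False }
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        choice = input(question + prompt).strip().lower()  # raw_input on Python 2
        if not choice and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
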
Example 4
  def get_dir_entries(self, path_obj):
    if not os.path.isdir(path_obj.name_fuse):
      raise hdfsException("No such path: %s" % path_obj.name_fuse)
    entries = []
    try:
      entries = [
        nohdfs.info(os.path.join(path_obj.name_fuse, entry))
        for entry in os.listdir(path_obj.name_fuse)
      ]
    except OSError as err:
      # A transient EAGAIN (e.g. from a busy FUSE mount) yields an empty listing;
      # any other OSError is escalated like the generic case below
      if err.errno == errno.EAGAIN:
        logging.warning('Could not access path %s because: %s' % (path_obj.name_fuse, err))
        entries = []
      else:
        raise ValueError('Could not access path %s because: %s' % (path_obj.name_fuse, err))
    except Exception as err:
      raise ValueError('Could not access path %s because: %s' % (path_obj.name_fuse, err))
    return entries
Example 5
import logging

def get_ratios(wobtag_count, wbtag_count, wobtag_label, wbtag_label):
    # Compute per-bin ratios of event counts without and with b-tagging;
    # ill-defined bins (zero or opposite-sign counts) default to a ratio of 1
    ratios = []
    for bin_idx in range(len(wobtag_count)):
        wobtag_count_bin_idx = wobtag_count[bin_idx]
        wbtag_count_bin_idx = wbtag_count[bin_idx]
        ratio = 1.

        if wobtag_count_bin_idx == 0. and wbtag_count_bin_idx == 0.:
            pass
        elif wbtag_count_bin_idx == 0.:
            if abs(wobtag_count_bin_idx) > 1e-2:
                raise RuntimeError(
                  'Found bin idx %d in histogram %s with zero events but %.2f events in histogram %s' % \
                  (bin_idx, wbtag_label, wobtag_count_bin_idx, wobtag_label)
                )
        elif wobtag_count_bin_idx == 0.:
            if abs(wbtag_count_bin_idx) > 1e-2:
                raise RuntimeError(
                  'Found bin idx %d in histogram %s with zero events but %.2f events in histogram %s' % \
                  (bin_idx, wobtag_label, wbtag_count_bin_idx, wbtag_label)
                )
        else:
            ratio = wobtag_count_bin_idx / wbtag_count_bin_idx
            if ratio < 0.:
                logging.warning(
                    'Found event sums with opposite sign at bin {} in histograms {} and {}: {:.2f} and {:.2f} '
                    '-> setting ratio to 1 instead'.format(
                        bin_idx, wobtag_label, wbtag_label,
                        wobtag_count_bin_idx, wbtag_count_bin_idx))
                ratio = 1.
        ratios.append(ratio)
    return ratios
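
A quick worked example of the edge-case handling above; the counts are made up for illustration:

counts_wobtag = [10., 0., 0.,    5., -2.]
counts_wbtag  = [ 5., 0., 0.001, 2.,  4.]
# -> [2.0, 1.0, 1.0, 2.5, 1.0]: the both-zero bin and the opposite-sign bin
#    fall back to 1, as does the bin where one count is zero and the other
#    is negligible (<= 1e-2)
print(get_ratios(counts_wobtag, counts_wbtag, 'wobtag', 'wbtag'))
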
Example 6
import collections
import logging
import os

def get_rles(input_paths, whitelist, blacklist, read_all_systematics):
    has_errors = False
    rles = collections.OrderedDict()
    valid_paths = get_paths(input_paths, whitelist, blacklist)
    for channel_name, channel_dir in valid_paths.items():
        rles[channel_name] = collections.OrderedDict()
        for region_dir in sorted(hdfs.listdir(channel_dir)):
            region_name = os.path.basename(region_dir)
            logging.debug('Found region {} in channel {}'.format(region_name, channel_name))
            rles[channel_name][region_name] = collections.OrderedDict()
            for sample_dir in sorted(hdfs.listdir(region_dir)):
                sample_name = os.path.basename(sample_dir)
                if sample_name in SAMPLES_EXCLUDE:
                    continue
                logging.debug('Found sample {} in region {} and channel {}'.format(
                    sample_name, region_name, channel_name))
                rles[channel_name][region_name][sample_name] = collections.OrderedDict()
                for rle_dir in sorted(hdfs.listdir(sample_dir)):
                    central_or_shift = os.path.basename(rle_dir)
                    if central_or_shift in SYSTEMATICS_EXCLUDE:
                        continue
                    if not read_all_systematics and central_or_shift != SYSTEMATICS_CENTRAL:
                        continue
                    logging.debug('Found systematics {} for sample {} in region {} and channel {}'.format(
                        central_or_shift, sample_name, region_name, channel_name))
                    rles[channel_name][region_name][sample_name][central_or_shift] = []
                    rle_filenames = sorted(hdfs.listdir(rle_dir))
                    if not rle_filenames:
                        logging.warning('Directory {} is empty'.format(rle_dir))
                        continue
                    rle_arr = []
                    for rle_filename in rle_filenames:
                        if not rle_filename.endswith('.txt'):
                            raise RuntimeError("Unexpected extension in file: %s" % rle_filename)
                        with open(rle_filename, 'r') as rle_file:
                            for line in rle_file:
                                line_stripped = line.rstrip('\n')
                                if not REGEX_RLE.match(line_stripped):
                                    raise RuntimeError("Unexpected line found in %s: %s" %
                                                       (rle_filename, line_stripped))
                                rle = line_stripped
                                if rle in rle_arr:
                                    logging.error(
                                      "Duplicate event %s found in channel %s, region %s, sample %s, systematics %s" % \
                                      (rle, channel_name, region_name, sample_name, central_or_shift)
                                    )
                                    has_errors = True
                                    continue
                                rle_arr.append(rle)
                    logging.debug('Found {} events in sample {}, region {}, systematics {}, channel {}'.format(
                        len(rle_arr), sample_name, region_name, central_or_shift, channel_name))
                    rles[channel_name][region_name][sample_name][central_or_shift].extend(rle_arr)
    return rles, has_errors
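
The snippet assumes a module-level REGEX_RLE pattern for run:lumisection:event identifiers. A plausible definition, offered as an assumption since the actual pattern is defined elsewhere in the project:

import re

# Assumed format of an RLE line: <run>:<luminosity section>:<event>,
# e.g. "316058:293:389708662"; the project's actual regex may be stricter.
REGEX_RLE = re.compile(r'^\d+:\d+:\d+$')
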
Example 7
    def create(self):
        """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if isinstance(self.dirs[key], dict):
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if not is_mc:
                continue

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList_map = generateInputFileList(sample_info, 1)
            key_dir = getKey(process_name)
            key_file = getKey(process_name)

            self.inputFiles[key_file] = list(
                itertools.chain(*inputFileList_map.values()))
            if len(self.inputFiles[key_file]) == 0:
                logging.warning("'%s' = %s --> skipping job !!" %
                                (key_file, self.inputFiles[key_file]))
                continue

            outputFile = os.path.join(self.dirs[key_dir][DKEY_RESULTS],
                                      "%s.txt" % process_name)
            self.outputFiles[key_file] = outputFile
            if os.path.isfile(outputFile):
                logging.info('File {} already exists --> skipping job'.format(
                    outputFile))
                continue

            self.cfgFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_CFGS],
                "refGenWeight_%s_cfg.txt" % (process_name))
            self.logFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_LOGS],
                "refGenWeight_%s.log" % (process_name))
            self.scriptFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_CFGS],
                "refGenWeight_%s_cfg.sh" % (process_name))
            self.plotFiles[key_file] = ' '.join([
                os.path.join(self.dirs[key_dir][DKEY_PLOTS],
                             "refGenWeight_%s.%s" % (process_name, extension))
                for extension in ['pdf', 'png']
            ])

            self.jobOptions_sbatch[key_file] = {
                'inputFiles': self.inputFiles[key_file],
                'cfgFile_path': self.cfgFiles[key_file],
                'cmdParams': "-i {} -o {} -p {} -v".format(
                    self.cfgFiles[key_file],
                    self.outputFiles[key_file],
                    self.plotFiles[key_file],
                ),
                'outputFile': self.outputFiles[key_file],
                'logFile': self.logFiles[key_file],
                'scriptFile': self.scriptFiles[key_file],
            }
            self.createCfg(self.jobOptions_sbatch[key_file])

        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            self.num_jobs['refGenWeight'] += self.createScript_sbatch(
                self.executable, self.sbatchFile, self.jobOptions_sbatch)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile(lines_makefile)
        self.addToMakefile_final(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done")

        return self.num_jobs
Example 8
    def create(self):
        """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
        """

        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            sample_category = sample_info["sample_category"]
            is_mc = (sample_info["type"] == "mc")

            logging.info("Building dictionaries for sample %s..." %
                         process_name)
            for charge_selection in self.charge_selections:
                central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
                central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts
                for central_or_shift_or_dummy in central_or_shifts_extended:
                    process_name_extended = [process_name, "hadd"]
                    for process_name_or_dummy in process_name_extended:
                        if central_or_shift_or_dummy in ["hadd"] and process_name_or_dummy in ["hadd"]:
                            continue
                        if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions:
                            if not is_mc:
                                continue
                            if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info):
                                continue

                        key_dir = getKey(process_name_or_dummy,
                                         charge_selection,
                                         central_or_shift_or_dummy)
                        for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]:
                            initDict(self.dirs, [key_dir, dir_type])
                            if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                                self.dirs[key_dir][dir_type] = os.path.join(
                                    self.configDir, dir_type, self.channel,
                                    "_".join([charge_selection]),
                                    process_name_or_dummy, central_or_shift_or_dummy)
                            else:
                                self.dirs[key_dir][dir_type] = os.path.join(
                                    self.outputDir, dir_type, self.channel,
                                    "_".join([charge_selection]), process_name_or_dummy)
        for subdirectory in ["comp_jetToTauFakeRate", "makePlots"]:
            key_dir = getKey(subdirectory)
            for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT]:
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in [DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT]:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.configDir, dir_type, self.channel, subdirectory)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, self.channel, subdirectory)
        for dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD,
                         DKEY_PLOT, DKEY_HADD_RT]:
            initDict(self.dirs, [dir_type])
            if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT,
                            DKEY_HADD_RT]:
                self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
            else:
                self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

        numDirectories = 0
        for key in self.dirs.keys():
            if isinstance(self.dirs[key], dict):
                numDirectories += len(self.dirs[key])
            else:
                numDirectories += 1
        logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
        numDirectories_created = 0
        frac = 1
        for key in self.dirs.keys():
            if isinstance(self.dirs[key], dict):
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
                numDirectories_created += len(self.dirs[key])
            else:
                create_if_not_exists(self.dirs[key])
                numDirectories_created += 1
            while 100 * numDirectories_created >= frac * numDirectories:
                logging.info(" %i%% completed" % frac)
                frac += 1
        logging.info("Done.")

        inputFileLists = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            logging.info("Checking input files for sample %s" %
                         sample_info["process_name_specific"])
            inputFileLists[sample_name] = generateInputFileList(
                sample_info, self.max_files_per_job)

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue

            process_name = sample_info["process_name_specific"]
            inputFileList = inputFileLists[sample_name]

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable_analyze, process_name))

            is_mc = (sample_info["type"] == "mc")
            sample_category = sample_info["sample_category"]

            for charge_selection in self.charge_selections:
                for central_or_shift in self.central_or_shifts:

                    if central_or_shift != "central" and not is_mc:
                        continue
                    if not self.accept_central_or_shift(
                            central_or_shift, sample_info):
                        continue

                    # build config files for executing analysis code
                    key_analyze_dir = getKey(process_name, charge_selection,
                                             central_or_shift)

                    for jobId in inputFileList.keys():

                        analyze_job_tuple = (process_name, charge_selection,
                                             central_or_shift, jobId)
                        key_analyze_job = getKey(*analyze_job_tuple)
                        ntupleFiles = inputFileList[jobId]
                        if len(ntupleFiles) == 0:
                            logging.warning(
                                "No input ntuples for %s --> skipping job !!" %
                                (key_analyze_job))
                            continue

                        cfgFile_modified_path = os.path.join(
                            self.dirs[key_analyze_dir][DKEY_CFGS],
                            "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
                        logFile_path = os.path.join(
                            self.dirs[key_analyze_dir][DKEY_LOGS],
                            "analyze_%s_%s_%s_%i.log" % analyze_job_tuple)
                        histogramFile_path = os.path.join(
                            self.dirs[key_analyze_dir][DKEY_HIST],
                            "analyze_%s_%s_%s_%i.root" % analyze_job_tuple)
                        rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                          if self.select_rle_output else ""

                        self.jobOptions_analyze[key_analyze_job] = {
                            'ntupleFiles': ntupleFiles,
                            'cfgFile_modified': cfgFile_modified_path,
                            'histogramFile': histogramFile_path,
                            'logFile': logFile_path,
                            'chargeSelection': charge_selection,
                            'jet_minPt': self.jet_minPt,
                            'jet_maxPt': self.jet_maxPt,
                            'jet_minAbsEta': self.jet_minAbsEta,
                            'jet_maxAbsEta': self.jet_maxAbsEta,
                            'hadTau_selection_tight': self.hadTau_selection_tight,
                            'hadTauSelection_denominator': self.hadTau_selection_denominator,
                            'hadTauSelections_numerator': self.hadTau_selections_numerator,
                            'trigMatchingOptions': self.trigMatchingOptions,
                            'selEventsFileName_output': rleOutputFile_path,
                            'absEtaBins': self.absEtaBins,
                            'decayModes': self.decayModes,
                            'central_or_shift': central_or_shift,
                            'central_or_shifts_local': [],
                            'apply_hlt_filter': self.hlt_filter,
                        }
                        self.createCfg_analyze(
                            self.jobOptions_analyze[key_analyze_job],
                            sample_info)

                        # initialize input and output file names for hadd_stage1
                        key_hadd_stage1_dir = getKey(process_name, charge_selection)
                        hadd_stage1_job_tuple = (process_name, charge_selection)
                        key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
                        if key_hadd_stage1_job not in self.inputFiles_hadd_stage1:
                            self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                        self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(
                            self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                        self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(
                            self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                            "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple)

                # initialize input and output file names for hadd_stage2
                key_hadd_stage1_job = getKey(process_name, charge_selection)
                key_hadd_stage2_dir = getKey("hadd", charge_selection)
                key_hadd_stage2_job = getKey(charge_selection)
                if key_hadd_stage2_job not in self.inputFiles_hadd_stage2:
                    self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
                self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(
                    self.outputFile_hadd_stage1[key_hadd_stage1_job])
                self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(
                    self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                    "hadd_stage2_%s.root" % charge_selection)

        logging.info(
            "Creating configuration files for executing 'comp_jetToTauFakeRate'"
        )
        for charge_selection in self.charge_selections:
            charge_key = "comp_%s" % charge_selection
            self.comp_input_files[charge_key] = []
            for trigMatchingOption in self.trigMatchingOptions:
                key_hadd_stage2_job = getKey(charge_selection)
                key_comp_jetToTauFakeRate_dir = getKey("comp_jetToTauFakeRate")
                key_comp_jetToTauFakeRate_job = getKey(charge_selection,
                                                       trigMatchingOption)
                self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job] = {
                    'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2_job],
                    'cfgFile_modified': os.path.join(
                        self.dirs[DKEY_CFGS],
                        "comp_jetToTauFakeRate_%s_%s_cfg.py" % (charge_selection, trigMatchingOption)),
                    'outputFile': os.path.join(
                        self.dirs[DKEY_HIST],
                        "comp_jetToTauFakeRate_%s_%s.root" % (charge_selection, trigMatchingOption)),
                    'logFile': os.path.join(
                        self.dirs[DKEY_LOGS],
                        "comp_jetToTauFakeRate_%s_%s.log" % (charge_selection, trigMatchingOption)),
                    'looseRegion': "jetToTauFakeRate_%s_%s/denominator/" % (charge_selection, trigMatchingOption),
                    'tightRegion': "jetToTauFakeRate_%s_%s/numerator/" % (charge_selection, trigMatchingOption),
                    'absEtaBins': self.absEtaBins,
                    'ptBins': self.ptBins,
                    'decayModes': self.decayModes,
                    'hadTauSelections': self.hadTau_selections_numerator,
                    'trigMatchingOption': trigMatchingOption,
                    'plots_outputFileName': os.path.join(
                        self.dirs[key_comp_jetToTauFakeRate_dir][DKEY_PLOT],
                        "comp_jetToTauFakeRate_%s.png" % trigMatchingOption),
                }
                self.createCfg_comp_jetToTauFakeRate(
                    self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job])
                comp_output = self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job]['outputFile']
                self.targets.append(comp_output)
                self.comp_input_files[charge_key].append(comp_output)
            self.comp_output_files[charge_key] = os.path.join(
                self.dirs[DKEY_HIST],
                "comp_jetToTauFakeRate_%s.root" % charge_selection)

        logging.info("Creating configuration files to run 'makePlots'")
        for charge_selection in self.charge_selections:
            key_hadd_stage2_job = getKey(charge_selection)
            key_makePlots_dir = getKey("makePlots")
            key_makePlots_job = getKey(charge_selection)
            self.jobOptions_make_plots[key_makePlots_job] = {
                'executable': self.executable_make_plots,
                'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2_job],
                'cfgFile_modified': os.path.join(
                    self.dirs[key_makePlots_dir][DKEY_CFGS],
                    "makePlots_%s_cfg.py" % self.channel),
                'outputFile': os.path.join(
                    self.dirs[key_makePlots_dir][DKEY_PLOT],
                    "makePlots_%s.png" % self.channel),
                'histogramDir': "jetToTauFakeRate_%s" % charge_selection,
                'label': None,
                'make_plots_backgrounds': self.make_plots_backgrounds,
            }
            self.createCfg_makePlots(
                self.jobOptions_make_plots[key_makePlots_job])
            for trigMatchingOption in self.trigMatchingOptions:
                self.cfgFile_make_plots = self.cfgFile_make_plots_denominator
                for absEtaBin in ["absEtaLt1_5", "absEta1_5to9_9"]:
                    key_hadd_stage2_job = getKey(charge_selection)
                    key_makePlots_job = getKey(charge_selection,
                                               trigMatchingOption, absEtaBin,
                                               "denominator")
                    self.jobOptions_make_plots[key_makePlots_job] = {
                        'executable': self.executable_make_plots,
                        'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2_job],
                        'cfgFile_modified': os.path.join(
                            self.dirs[key_makePlots_dir][DKEY_CFGS],
                            "makePlots_%s_%s_%s_denominator_%s_cfg.py" %
                            (self.channel, charge_selection, trigMatchingOption, absEtaBin)),
                        'outputFile': os.path.join(
                            self.dirs[key_makePlots_dir][DKEY_PLOT],
                            "makePlots_%s_%s_%s_denominator_%s.png" %
                            (self.channel, charge_selection, trigMatchingOption, absEtaBin)),
                        'histogramDir': "jetToTauFakeRate_%s_%s/denominator/%s" %
                                        (charge_selection, trigMatchingOption, absEtaBin),
                        'label': None,
                        'make_plots_backgrounds': self.make_plots_backgrounds,
                    }
                    self.createCfg_makePlots(
                        self.jobOptions_make_plots[key_makePlots_job])
                    for hadTau_selection_numerator in self.hadTau_selections_numerator:
                        key_hadd_stage2_job = getKey(charge_selection)
                        key_makePlots_job = getKey(charge_selection,
                                                   trigMatchingOption,
                                                   absEtaBin, "numerator",
                                                   hadTau_selection_numerator)
                        self.jobOptions_make_plots[key_makePlots_job] = {
                            'executable': self.executable_make_plots,
                            'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2_job],
                            'cfgFile_modified': os.path.join(
                                self.dirs[key_makePlots_dir][DKEY_CFGS],
                                "makePlots_%s_%s_%s_numerator_%s_%s_cfg.py" %
                                (self.channel, charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin)),
                            'outputFile': os.path.join(
                                self.dirs[key_makePlots_dir][DKEY_PLOT],
                                "makePlots_%s_%s_%s_numerator_%s_%s.png" %
                                (self.channel, charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin)),
                            'histogramDir': "jetToTauFakeRate_%s_%s/numerator/%s/%s" %
                                            (charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin),
                            'label': None,
                            'make_plots_backgrounds': self.make_plots_backgrounds,
                        }
                        self.createCfg_makePlots(
                            self.jobOptions_make_plots[key_makePlots_job])

        self.sbatchFile_analyze = os.path.join(
            self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.sbatchFile_comp_jetToTauFakeRate = os.path.join(
            self.dirs[DKEY_SCRIPTS], "sbatch_comp_jetToTauFakeRate.py")
        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable_analyze)
            self.createScript_sbatch_analyze(self.executable_analyze,
                                             self.sbatchFile_analyze,
                                             self.jobOptions_analyze)
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable_comp_jetToTauFakeRate)
            self.createScript_sbatch(self.executable_comp_jetToTauFakeRate,
                                     self.sbatchFile_comp_jetToTauFakeRate,
                                     self.jobOptions_comp_jetToTauFakeRate)

        lines_makefile = []
        self.addToMakefile_analyze(lines_makefile)
        self.addToMakefile_hadd_stage1(lines_makefile)
        self.addToMakefile_hadd_stage2(lines_makefile,
                                       make_dependency="phony_hadd_stage1",
                                       max_mem='4096M')
        self.addToMakefile_comp_jetToTauFakeRate(lines_makefile)
        self.addToMakefile_comp_hadd(lines_makefile)
        self.addToMakefile_make_plots(lines_makefile)
        self.createMakefile(lines_makefile)

        logging.info("Done.")

        return self.num_jobs
Example 9
import logging
import os
import re

import ROOT

def validate(output_dir, verbose=False):
    '''Validates the job execution carried out by dump_rle_parallel()
    Args:
      output_dir: string, The directory where all RLE files are stored
      verbose:    bool,   Enable verbose output

    Returns:
      None

    The validation is quite basic: the program loops over the subdirectories of output_dir,
    matches them against the entries in the samples dictionary, and counts the number of
    lines in each RLE file. If the number of lines doesn't match the number of entries in
    the corresponding ROOT file, the user is notified about the discrepancy.

    In principle, the script could also print the relevant commands to fix the issues (and
    dump them to an easily executable file), but let's leave that for another time.
    '''

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    root_file_regex = re.compile(r'^tree_(\d+)\.root$')
    file_dict = {k: [] for k in ['excess', 'missing', 'corrupted']}

    try:

        for s_key, s_value in samples.items():
            sample_name = s_value['process_name_specific']
            sample_dir = os.path.join(output_dir, sample_name)
            if os.path.isdir(sample_dir):
                logging.debug("Found sample directory {sample_dir}".format(sample_dir=sample_dir))

                #NB! assume that there are no secondary paths in the dictionary (hence index 0!)
                sample_path_dict = s_value['local_paths'][0]
                sample_path = sample_path_dict['path']
                blacklist = sample_path_dict['blacklist']
                for sample_subdir in os.listdir(sample_path):
                    sample_subpath_idx = -1
                    try:
                        sample_subpath_idx = int(sample_subdir)
                    except ValueError:
                        continue
                    if sample_subpath_idx < 0:
                        raise ValueError("Internal error")
                    sample_subpath = os.path.join(sample_path, sample_subdir)
                    logging.debug("Processing sample subdirectory {sample_subpath}".format(
                        sample_subpath=sample_subpath))

                    for sample_file in os.listdir(sample_subpath):
                        sample_file_fullpath = os.path.join(sample_subpath, sample_file)
                        if not sample_file.endswith('.root') or not os.path.isfile(sample_file_fullpath):
                            continue

                        root_file_regex_match = root_file_regex.search(sample_file)
                        if not root_file_regex_match:
                            continue

                        root_file_idx = int(root_file_regex_match.group(1))
                        expected_rle_file_basename = '{root_file_idx}.txt'.format(root_file_idx=root_file_idx)
                        expected_rle_file = os.path.join(sample_dir, expected_rle_file_basename)
                        file_dict_entry = (expected_rle_file, sample_file_fullpath)
                        if root_file_idx in blacklist:
                            if os.path.isfile(expected_rle_file):
                                logging.warning(
                                    'Found RLE file {rle_file} (corresponding to blacklisted {root_file}) '
                                    'which you ought to delete'.format(
                                        rle_file=expected_rle_file,
                                        root_file=sample_file_fullpath,
                                    ))
                            file_dict['excess'].append(file_dict_entry)
                            continue

                        if not os.path.isfile(expected_rle_file):
                            logging.warning(
                                'Missing RLE file {rle_file} (corresponding to {root_file})'
                                .format(
                                    rle_file=expected_rle_file,
                                    root_file=sample_file_fullpath,
                                ))
                            file_dict['missing'].append(file_dict_entry)
                            continue
                        nof_rle_events = raw_linecount(expected_rle_file)
                        if nof_rle_events == 1 and os.path.getsize(
                                expected_rle_file) == 1:
                            # the RLE file contains only a newline, hence no events
                            nof_rle_events = 0

                        root_file = ROOT.TFile(sample_file_fullpath, 'read')
                        root_tree = root_file.Get('tree')
                        nof_entries = root_tree.GetEntries()
                        # Close the file to avoid leaking open ROOT file handles
                        root_file.Close()

                        nof_events_diff = nof_rle_events - nof_entries
                        if nof_events_diff < 0:
                            logging.error(
                                'Missing {nof_events} events in {rle_filename} (corresponding to {sample_file}): '
                                'expected {expected}, got {actual}'.format(
                                    nof_events=abs(nof_events_diff),
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        elif nof_events_diff > 0:
                            logging.error(
                                'Got {nof_events} more events than expected in {rle_filename} (corresponding '
                                'to {sample_file}): expected {expected}, got {actual}'.format(
                                    nof_events=nof_events_diff,
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        else:
                            logging.debug(
                                'File {rle_filename} (corresponding to {sample_file}) looks OK'
                                .format(
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                ))

    except KeyboardInterrupt:
        pass

    if any(map(bool, file_dict.values())):
        logging.info('Validation finished with errors')
        for key in file_dict.keys():
            if file_dict[key]:
                logging.info('Number of {key} RLE files: {nof_key}'.format(
                    key=key, nof_key=len(file_dict[key])))
                for entry in file_dict[key]:
                    logging.info('{rle_file} <=> {sample_file}'.format(
                        rle_file=entry[0], sample_file=entry[1]))
    else:
        logging.info('Validation finished successfully')
    return
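
A minimal invocation sketch for the validator; the directory path below is illustrative, not taken from the snippet:

import logging

logging.basicConfig(level=logging.INFO)
# Hypothetical output directory produced by dump_rle_parallel()
validate('/hdfs/local/example_user/rle_dump', verbose=True)
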
Example 10
  def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """

    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue
      process_name = sample_info["process_name_specific"]
      sample_category = sample_info["sample_category"]
      is_mc = (sample_info["type"] == "mc")

      logging.info("Building dictionaries for sample %s..." % process_name)
      central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
      central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts
      for central_or_shift_or_dummy in central_or_shifts_extended:
        process_name_extended = [ process_name, "hadd" ]
        for process_name_or_dummy in process_name_extended:
          if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]:
            continue
          if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions:
            if not is_mc:
              continue
            if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info):
              continue

          key_dir = getKey(process_name_or_dummy, central_or_shift_or_dummy)
          for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]:
            initDict(self.dirs, [ key_dir, dir_type ])
            if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
              self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, process_name_or_dummy, central_or_shift_or_dummy)
            else:
              self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name_or_dummy)
    for subdirectory in [ "addBackgrounds", "prepareDatacards" ]:
      key_dir = getKey(subdirectory)
      for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
        initDict(self.dirs, [ key_dir, dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
          self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
        else:
          self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_HIST, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT ]:
      initDict(self.dirs, [ dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_HADD_RT, DKEY_PLOT, DKEY_COMBINE_OUTPUT ]:
        self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
      else:
        self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

    numDirectories = 0
    for key in self.dirs.keys():
      if isinstance(self.dirs[key], dict):
        numDirectories += len(self.dirs[key])
      else:
        numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
      if isinstance(self.dirs[key], dict):
        for dir_type in self.dirs[key].keys():
          create_if_not_exists(self.dirs[key][dir_type])
        numDirectories_created += len(self.dirs[key])
      else:
        create_if_not_exists(self.dirs[key])
        numDirectories_created += 1
      while 100 * numDirectories_created >= frac * numDirectories:
        logging.info(" %i%% completed" % frac)
        frac += 1
    logging.info("Done.")

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue
      logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
      inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue

      process_name = sample_info["process_name_specific"]

      logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
      inputFileList = inputFileLists[sample_name]

      is_mc = (sample_info["type"] == "mc")
      sample_category = sample_info["sample_category"]

      for central_or_shift in self.central_or_shifts:
        if central_or_shift != "central" and not is_mc:
          continue

        if not self.accept_central_or_shift(central_or_shift, sample_info):
          continue

        key_analyze_dir = getKey(process_name, central_or_shift)

        for jobId in inputFileList.keys():

          analyze_job_tuple = (process_name, central_or_shift, jobId)
          key_analyze_job = getKey(*analyze_job_tuple)
          ntupleFiles = inputFileList[jobId]
          if len(ntupleFiles) == 0:
            logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
            continue

          rleOutputFile = os.path.join(
            self.dirs[key_analyze_dir][DKEY_RLES],
            "rle_{channel}_{process_name}_{central_or_shift}_{jobId}.txt".format(
              channel          = self.channel,
              process_name     = process_name,
              central_or_shift = central_or_shift,
              jobId            = jobId,
            )) if self.select_rle_output else ""

          cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%i_cfg.py" % analyze_job_tuple)
          logFile_path          = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%i.log" % analyze_job_tuple)
          histogramFile_path    = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%i.root" % analyze_job_tuple)
          self.jobOptions_analyze[key_analyze_job] = {
            'ntupleFiles'              : ntupleFiles,
            'cfgFile_modified'         : cfgFile_modified_path,
            'histogramFile'            : histogramFile_path,
            'selEventsFileName_output' : rleOutputFile,
            'logFile'                  : logFile_path,
            'absEtaBins_e'             : self.absEtaBins_e,
            'ptBins_e'                 : self.ptBins_e,
            'absEtaBins_mu'            : self.absEtaBins_mu,
            'ptBins_mu'                : self.ptBins_mu,
            'central_or_shift'         : central_or_shift,
            'fillGenEvtHistograms'     : self.fillGenEvtHistograms,
            'triggers_mu_cfg'          : "leptonFR_triggers['{}']['{}']".format(self.era, 'mu'),
            'triggers_e_cfg'           : "leptonFR_triggers['{}']['{}']".format(self.era, 'e'),
            'lep_mva_cut_e'            : float(self.lep_mva_cut_e),
            'lep_mva_cut_mu'           : float(self.lep_mva_cut_mu),
          }
          self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)

          # initialize input and output file names for hadd_stage1
          key_hadd_stage1_dir = getKey(process_name)
          key_hadd_stage1_job = getKey(process_name)
          if key_hadd_stage1_job not in self.inputFiles_hadd_stage1:
            self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
          self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
          self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                                                                          "hadd_stage1_%s.root" % process_name)

    # initialize input and output file names for hadd_stage1_5
    key_hadd_stage1_5_dir = getKey("hadd")
    key_hadd_stage1_5_job = getKey('')
    if key_hadd_stage1_5_job not in self.inputFiles_hadd_stage1_5:
      self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
    for key_hadd_stage1_job in self.outputFile_hadd_stage1.keys():
      self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
    self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5.root" )

    # sum fake contributions for the total of all MC samples
    # input processes: TTj,... ## HERE !!
    # output process: fakes_mc
    key_hadd_stage1_5_job = getKey('')
    key_addBackgrounds_dir = getKey("addBackgrounds")
    key_addBackgrounds_job_sum = getKey("fakes_mc")
    sample_categories = []
    sample_categories.extend(self.nonfake_backgrounds)
    sample_categories.extend(self.ttHProcs)
    processes_input = []
    for sample_category in sample_categories:
      processes_input.append("%sj" % sample_category)
    # Build the fakes_mc job once, after the full list of input processes has been collected
    self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_sum] = {
      'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_cfg.py" % "fakes_mc"),
      'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s.root" % "fakes_mc"),
      'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s.log" % "fakes_mc"),
      'categories' : [
        "LeptonFakeRate/numerator/electrons_tight",
        "LeptonFakeRate/denominator/electrons_fakeable",
        "LeptonFakeRate/numerator/muons_tight",
        "LeptonFakeRate/denominator/muons_fakeable"
      ],
      'processes_input' : processes_input,
      'process_output' : "fakes_mc",
      'histogramsToCopy' : list(self.histograms_to_fit.keys()),
      'sysShifts' : []
    }
    self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_sum])

    # create configuration files to run 'addBackgrounds_LeptonFakeRate'
    key_addBackgrounds_job_leptonFR = getKey('')
    self.jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job_leptonFR] = {
      'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], os.path.basename(self.cfgFile_addBackgrounds_LeptonFakeRate)),
      'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackground_LeptonFakeRate.root"),
      'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(self.cfgFile_addBackgrounds_LeptonFakeRate.replace("_cfg.py", ".log")) ),
    }
    self.createCfg_addBackgrounds_LeptonFakeRate(self.jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job_leptonFR])

    # create configuration files to run 'addBackgrounds_Convs_LeptonFakeRate'
    key_addBackgrounds_job_conv = getKey('')
    self.jobOptions_addBackgrounds_Convs_LeptonFakeRate[key_addBackgrounds_job_conv] = {
      'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], os.path.basename(self.cfgFile_addBackgrounds_Convs_LeptonFakeRate)),
      'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackground_Convs_LeptonFakeRate.root"),
      'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(self.cfgFile_addBackgrounds_Convs_LeptonFakeRate.replace("_cfg.py", ".log")) ),
    }
    self.createCfg_addBackgrounds_Convs_LeptonFakeRate(self.jobOptions_addBackgrounds_Convs_LeptonFakeRate[key_addBackgrounds_job_conv])


    # initialize input and output file names for hadd_stage2
    key_hadd_stage2_dir = getKey("hadd")
    key_hadd_stage2_job = getKey('')
    if key_hadd_stage2_job not in self.inputFiles_hadd_stage2:
      self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
    # CV: hadd_stage_1_5 output file does not need to be added as input for hadd_stage_2,
    #     as addBackgrounds_LeptonFakeRate output file contains all histograms except fakes_mc
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_sum]['outputFile'])
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job_leptonFR]['outputFile'])
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_Convs_LeptonFakeRate[key_addBackgrounds_job_conv]['outputFile'])
    self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2.root")

    # We need to generate the eta and pt bins for electrons and muons
    lepton_bins = {}
    categories = []
    for lepton in ['electron', 'muon']:
      if lepton not in lepton_bins:
        lepton_bins[lepton] = {}

      absEtaBins = None
      ptBins = None
      lepton_short = None
      if lepton == 'electron':
        absEtaBins = self.absEtaBins_e
        ptBins = self.ptBins_e
        lepton_short = 'e'
      elif lepton == 'muon':
        absEtaBins = self.absEtaBins_mu
        ptBins = self.ptBins_mu
        lepton_short = 'mu'
      else:
        raise ValueError('Invalid lepton type: %s' % lepton)
      for selection in ['tight', 'fakeable']:
        if selection not in lepton_bins[lepton]:
          lepton_bins[lepton][selection] = []
        num_or_den = None
        if selection == 'tight':
          num_or_den = 'numerator'
        elif selection == 'fakeable':
          num_or_den = 'denominator'
        else:
          raise ValueError('Invalid lepton selection: %s' % selection)
        for absEtaBin_idx in range(0, len(absEtaBins) - 1):
          absEtaBinLowerEdge = absEtaBins[absEtaBin_idx]
          absEtaBinUpperEdge = absEtaBins[absEtaBin_idx + 1]
          absEtaBinString = getEtaBin(absEtaBinLowerEdge, absEtaBinUpperEdge)
          for ptBin_idx in range(0, len(ptBins) - 1):
            ptBinsLowerEdge = ptBins[ptBin_idx]
            ptBinsUpperEdge = ptBins[ptBin_idx + 1]
            ptBinString = getPtBin(ptBinsLowerEdge, ptBinsUpperEdge)
            absEta_and_ptBinString = '%s_%s' % (absEtaBinString, ptBinString)

            lepton_bins[lepton][selection].append(
              construct_lepton_params(
                lepton, lepton_short, selection, absEta_and_ptBinString,
                error_msg = "No fit parameter range specified for abs(eta) range = (%.3f, %.3f) and "
                            "pT range = (%.3f, %.3f) for lepton type '%s' !!" % \
                            (absEtaBinLowerEdge, absEtaBinUpperEdge, ptBinsLowerEdge, ptBinsUpperEdge, lepton)
              ) + (absEtaBinLowerEdge, absEtaBinUpperEdge, ptBinsLowerEdge, ptBinsUpperEdge, 0)
            )

            categories.append(
              (
                "LeptonFakeRate/%s/%ss_%s/%s/%s" % (num_or_den, lepton, selection, absEtaBinString, ptBinString),
                "%ss_%s_%s_shapes" % (lepton, selection, absEta_and_ptBinString),
              )
            )

        # Let's also add inclusive category
        lepton_bins[lepton][selection].append(
          construct_lepton_params(
            lepton, lepton_short, selection, 'incl',
            error_msg = "No fit parameter range specified for lepton type %s" % lepton
          ) + (-1., -1., -1., -1., 1)
        )
        categories.append(
          (
            "LeptonFakeRate/%s/%ss_%s/incl" % (num_or_den, lepton, selection),
            "%ss_%s_incl_shapes" % (lepton, selection),
          )
        )
    lepton_bins_merged = []
    for lepton_type in lepton_bins:
      for lepton_selection in lepton_bins[lepton_type]:
        lepton_bins_merged.extend(lepton_bins[lepton_type][lepton_selection])

    if self.prep_dcard:
      logging.info("Creating configuration files to run 'prepareDatacards_LeptonFakeRate'")
      datacards = []
      for histogramToFit in self.histograms_to_fit:
        key_prep_dcard_dir = getKey("prepareDatacards")
        key_prep_dcard_job = getKey(histogramToFit)
        datacard = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s.root" % (histogramToFit))
        self.jobOptions_prep_dcard[key_prep_dcard_job] = {
          'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_LeptonFakeRate_%s_cfg.py" % histogramToFit),
          'datacardFile' : datacard,
          'histogramDir' : self.histogramDir_prep_dcard,
          'histogramToFit' : histogramToFit,
          'label' : None,
          'categories' : categories,
        }
        datacards.append(datacard)
        self.createCfg_prep_dcard_LeptonFakeRate(self.jobOptions_prep_dcard[key_prep_dcard_job])

      # Create setupDatacards_LeptonFakeRate.py script from the template
      systematics_leptonFR = []
      for systematic in self.central_or_shifts:
        if systematic == 'central':
          continue
        systematic_name = systematic.replace('Up', '').replace('Down', '')
        if systematic_name not in systematics_leptonFR:
          systematics_leptonFR.append(systematic_name)
      setup_dcards_template_file = os.path.join(jinja_template_dir, 'setupDatacards_LeptonFakeRate.py.template')
      with open(setup_dcards_template_file, 'r') as setup_dcards_template_file_ptr:
        setup_dcards_template = setup_dcards_template_file_ptr.read()
      setup_dcards_script = jinja2.Template(setup_dcards_template).render(
        leptons           = lepton_bins_merged,
        central_or_shifts = systematics_leptonFR,
        signal_process    = "QCD" if self.use_QCD_fromMC else "data_fakes",
      )
      setup_dcards_script_path = os.path.join(self.dirs[DKEY_SCRIPTS], 'setupDatacards_LeptonFakeRate.py')
      logging.debug("writing setupDatacards_LeptonFakeRate script file = '%s'" % setup_dcards_script_path)
      with codecs.open(setup_dcards_script_path, "w", "utf-8") as setup_dcards_script_file:
        setup_dcards_script_file.write(setup_dcards_script)
        setup_dcards_script_file.flush()
        os.fsync(setup_dcards_script_file.fileno())
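      # flush() followed by os.fsync() forces the rendered script onto disk
      # before it is made executable below, so a batch job that starts
      # immediately afterwards is guaranteed to see the complete file.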
      add_chmodX(setup_dcards_script_path)

      if self.use_QCD_fromMC:
          postfit_plot_script_path = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/postFitPlot_fakes_from_mc.py')
          yieldtable_script_path   = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/yieldTable_fakes_from_mc.py')
      else:
          postfit_plot_script_path = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/postFitPlot_fakes_from_data.py')
          yieldtable_script_path   = os.path.join(os.environ['CMSSW_BASE'], 'src/tthAnalysis/HiggsToTauTau/data/leptonFR/scripts/yieldTable_fakes_from_data.py')

      # Create run_postFit.sh script from the template
      combine_output_dir = os.path.join(self.dirs[DKEY_COMBINE_OUTPUT], 'output')
      postfit_template_file = os.path.join(jinja_template_dir, 'run_postFit.sh.template')
      with open(postfit_template_file, 'r') as postfit_template_file_ptr:
        postfit_template = postfit_template_file_ptr.read()
      for lepton in ['electron', 'muon']:
        for selection in ['fakeable', 'tight']:
          is_num = selection == 'tight'
          for params in lepton_bins[lepton][selection]:
            l_array, l_range, l_sub_dir, l_eta_low, l_eta_high, l_pt_low, l_pt_high, l_is_inclusive = params
            postfit_script = jinja2.Template(postfit_template).render(
              new_cmssw_base         = self.cmssw_base_dir_combine,
              setup_dcards_script    = setup_dcards_script_path,
              postfit_plot_script    = postfit_plot_script_path,
              int_lumi_data          = self.lumi,
              yieldtable_script      = yieldtable_script_path,
              output_dir             = combine_output_dir,
              numerator_plotLabel    = self.numerator_plotLabel,
              denominator_plotLabel  = self.denominator_plotLabel,
              l_array                = l_array,
              l_range                = l_range,
              l_sub_dir              = l_sub_dir,
              l_eta_low              = l_eta_low,
              l_eta_high             = l_eta_high,
              l_pt_low               = l_pt_low,
              l_pt_high              = l_pt_high,
              l_is_inclusive         = l_is_inclusive,
              is_num                 = is_num,
              numerator_output_dir   = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.numerator_histogram),
              denominator_output_dir = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.denominator_histogram),
              selection              = selection,
              lepton_letter          = 'e' if lepton == 'electron' else 'mu',
              grep_value             = "QCD" if self.use_QCD_fromMC else "data_fakes",
            )
            postfit_script_path = os.path.join(
              self.dirs[DKEY_SCRIPTS],
              'mlfit_%s_%s.sh' % (self.numerator_histogram if is_num else self.denominator_histogram, l_array)
            )
            logging.debug("Writing run_postFit script file = '%s'" % postfit_script_path)
            with codecs.open(postfit_script_path, "w", "utf-8") as postfit_script_file:
              postfit_script_file.write(postfit_script)
              postfit_script_file.flush()
              os.fsync(postfit_script_file.fileno())
            add_chmodX(postfit_script_path)

      key_prep_dcard_dir = getKey("prepareDatacards")
      fit_value_file = os.path.join(combine_output_dir, 'fit_values.txt')
      makefile_template_file = os.path.join(jinja_template_dir, 'Makefile_postFit.template')
      with open(makefile_template_file, 'r') as makefile_template_file_ptr:
        makefile_template = makefile_template_file_ptr.read()
      makefile_templatized = jinja2.Template(makefile_template).render(
        new_cmssw_base = self.cmssw_base_dir_combine,
        setup_dcards_script = setup_dcards_script_path,
        numerator_histogram = self.numerator_histogram,
        denominator_histogram = self.denominator_histogram,
        scripts_dir = self.dirs[DKEY_SCRIPTS],
        numerator_datacard = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s.root" % self.numerator_histogram),
        denominator_datacard = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s.root" % self.denominator_histogram),
        output_dir = combine_output_dir,
        numerator_output_dir = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.numerator_histogram),
        denominator_output_dir = os.path.join(combine_output_dir, 'mlfit_LeptonFakeRate_%s' % self.denominator_histogram),
        lepton_bins = lepton_bins,
        fit_values = fit_value_file,
      )
      makefile_path = os.path.join(self.dirs[DKEY_SCRIPTS], 'Makefile_postFit')
      logging.debug("Writing run_postFit script file = '%s'" % makefile_path)
      with codecs.open(makefile_path, "w", "utf-8") as makefile_path_file:
        makefile_path_file.write(makefile_templatized)
        makefile_path_file.flush()
        os.fsync(makefile_path_file.fileno())

      self.jobOptions_combine = {
        'inputFile'       : ' '.join(datacards),
        'outputFile'      : fit_value_file,
        'makefile_path'   : makefile_path,
        'logFile'         : os.path.join(self.dirs[DKEY_LOGS], 'postFit.log'),
      }

      key_comp_LeptonFakeRate = getKey('')
      leptonFR_final_output = os.path.join(combine_output_dir, 'leptonFakeRates.root')
      self.jobOptions_comp_LeptonFakeRate[key_comp_LeptonFakeRate] = {
        'inputFile'            : [ fit_value_file, self.outputFile_hadd_stage2[key_hadd_stage2_job] ],
        'outputFile'           : leptonFR_final_output,
        'absEtaBins_e'         : self.absEtaBins_e,
        'ptBins_e'             : self.ptBins_e,
        'absEtaBins_mu'        : self.absEtaBins_mu,
        'ptBins_mu'            : self.ptBins_mu,
        'logFile'              : os.path.join(self.dirs[DKEY_LOGS], os.path.basename(self.cfgFile_comp_LeptonFakeRate).replace('_cfg.py', '.log')),
        'cfgFile_modified'     : os.path.join(self.dirs[DKEY_CFGS], os.path.basename(self.cfgFile_comp_LeptonFakeRate)),
        'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "comp_LeptonFakeRate.png")
      }
      self.createCfg_comp_LeptonFakeRate(self.jobOptions_comp_LeptonFakeRate[key_comp_LeptonFakeRate])
      self.targets.append(self.jobOptions_comp_LeptonFakeRate[key_comp_LeptonFakeRate]['outputFile'])

    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_LeptonFakeRate.py")
    self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_LeptonFakeRate.py")
    self.sbatchFile_addBackgrounds_LeptonFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_LeptonFakeRate.py")
    self.sbatchFile_addBackgrounds_Convs_LeptonFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_Convs_LeptonFakeRate.py")
    self.sbatchFile_comp_LeptonFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_comp_LeptonFakeRate.py")
    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      self.createScript_sbatch(self.executable_addBackgrounds_recursively, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
      self.createScript_sbatch(self.executable_addBackgrounds_LeptonFakeRate, self.sbatchFile_addBackgrounds_LeptonFakeRate, self.jobOptions_addBackgrounds_LeptonFakeRate)
      self.createScript_sbatch(self.executable_addBackgrounds_LeptonFakeRate, self.sbatchFile_addBackgrounds_Convs_LeptonFakeRate, self.jobOptions_addBackgrounds_Convs_LeptonFakeRate)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_comp_LeptonFakeRate)
      self.createScript_sbatch(self.executable_comp_LeptonFakeRate, self.sbatchFile_comp_LeptonFakeRate, self.jobOptions_comp_LeptonFakeRate)

    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile) ## this step now computes the Conv, data_fakes and fakes_mc backgrounds in one go
    # self.addToMakefile_backgrounds_from_MC(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile, make_dependency = " ".join([
      "phony_addBackgrounds_LeptonFakeRate", "phony_addBackgrounds_Convs_LeptonFakeRate", "phony_addBackgrounds_sum"
    ]))
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_combine(lines_makefile)
    self.addToMakefile_comp_LeptonFakeRate(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done.")

    return self.num_jobs
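The nested bin loops above reduce to a simple pattern: walk consecutive pairs of bin edges and turn each (abs(eta), pT) cell into a histogram-directory path. Below is a minimal standalone sketch of that pattern; getEtaBin and getPtBin here are hypothetical stand-ins for the helpers imported by the real code, which may format the strings differently.

def getEtaBin(lo, hi):
    # hypothetical formatting, for illustration only
    return 'absEta%sto%s' % (str(lo).replace('.', '_'), str(hi).replace('.', '_'))

def getPtBin(lo, hi):
    # hypothetical formatting, for illustration only
    return 'pt%sto%s' % (str(lo).replace('.', '_'), str(hi).replace('.', '_'))

absEtaBins = [0., 1.479, 2.5]  # example electron eta edges
ptBins     = [15., 25., 35.]   # example pT edges
for i in range(len(absEtaBins) - 1):
    for j in range(len(ptBins) - 1):
        etaStr = getEtaBin(absEtaBins[i], absEtaBins[i + 1])
        ptStr  = getPtBin(ptBins[j], ptBins[j + 1])
        print('LeptonFakeRate/numerator/electrons_tight/%s/%s' % (etaStr, ptStr))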
Example no. 11
    def create(self):
        """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if not is_mc:
                continue

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList = generateInputFileList(sample_info,
                                                  self.max_files_per_job)
            key_dir = getKey(process_name)

            outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO],
                                      "%s.root" % process_name)
            if os.path.isfile(outputFile) and tools_is_file_ok(
                    outputFile, min_file_size=2000):
                logging.info('File {} already exists --> skipping job'.format(
                    outputFile))
                continue

            self.outputFiles[process_name] = {
                'inputFiles': [],
                'outputFile': outputFile
            }

            for jobId in inputFileList.keys():

                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = inputFileList[jobId]
                if len(self.inputFiles[key_file]) == 0:
                    logging.warning(
                        "ntupleFiles['%s'] = %s --> skipping job !!" %
                        (key_file, self.inputFiles[key_file]))
                    continue

                self.cfgFiles_puProfile[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "puProfile_%s_%i_cfg.txt" % (process_name, jobId))
                self.outputFiles_tmp[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_HISTO_TMP],
                    "histogram_%i.root" % jobId)
                self.logFiles_puProfile[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS],
                    "puProfile_%s_%i.log" % (process_name, jobId))
                self.scriptFiles_puProfile[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "puProfile_%s_%i_cfg.sh" % (process_name, jobId))
                self.jobOptions_sbatch[key_file] = {
                    'histName': process_name,
                    'inputFiles': self.inputFiles[key_file],
                    'cfgFile_path': self.cfgFiles_puProfile[key_file],
                    'outputFile': self.outputFiles_tmp[key_file],
                    'logFile': self.logFiles_puProfile[key_file],
                    'scriptFile': self.scriptFiles_puProfile[key_file],
                }
                self.createCfg_puProfile(self.jobOptions_sbatch[key_file])
                self.outputFiles[process_name]['inputFiles'].append(
                    self.outputFiles_tmp[key_file])

        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            self.num_jobs['puProfile'] += self.createScript_sbatch(
                self.executable, self.sbatchFile_puProfile,
                self.jobOptions_sbatch)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_puProfile(lines_makefile)
        self.addToMakefile_hadd(lines_makefile)
        self.addToMakefile_plot(lines_makefile)
        self.addToMakefile_finalHadd(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done")

        return self.num_jobs
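generateInputFileList itself is not shown in this snippet; from the way its return value is used above (a dict mapping a job ID to a chunk of at most max_files_per_job input files), its contract can be sketched roughly as follows. This is an inferred sketch, not the original implementation.

def chunk_input_files(input_files, max_files_per_job):
    # group a flat list of input files into per-job chunks, keyed by a 1-based job ID
    jobs = {}
    for idx in range(0, len(input_files), max_files_per_job):
        jobs[idx // max_files_per_job + 1] = input_files[idx:idx + max_files_per_job]
    return jobs

# chunk_input_files(['a.root', 'b.root', 'c.root'], 2)
# -> {1: ['a.root', 'b.root'], 2: ['c.root']}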
Example no. 12
  def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """

    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue

      process_name = sample_info["process_name_specific"]
      is_mc = (sample_info["type"] == "mc")

      logging.info("Building dictionaries for sample %s..." % process_name)
      for lepton_selection in self.lepton_selections:
        central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
        central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts
        for central_or_shift_or_dummy in central_or_shifts_extended:
          process_name_extended = [ process_name, "hadd" ]
          for process_name_or_dummy in process_name_extended:
            if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]:
              continue
            if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions:
              if not is_mc:
                continue
              if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info):
                continue

            key_dir = getKey(process_name_or_dummy, lepton_selection, central_or_shift_or_dummy)
            for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]:
              initDict(self.dirs, [ key_dir, dir_type ])
              if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                  "_".join([ lepton_selection ]), process_name_or_dummy, central_or_shift_or_dummy)
              else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                  "_".join([ lepton_selection ]), process_name_or_dummy)
    for subdirectory in [ "prepareDatacards" ]:
      key_dir = getKey(subdirectory)
      for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
        initDict(self.dirs, [ key_dir, dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
          self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
        else:
          self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT ]:
      initDict(self.dirs, [ dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT ]:
        self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
      else:
        self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

    numDirectories = 0
    for key in self.dirs.keys():
      if type(self.dirs[key]) == dict:
        numDirectories += len(self.dirs[key])
      else:
        numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
      if type(self.dirs[key]) == dict:
        for dir_type in self.dirs[key].keys():
          create_if_not_exists(self.dirs[key][dir_type])
        numDirectories_created += len(self.dirs[key])
      else:
        create_if_not_exists(self.dirs[key])
        numDirectories_created = numDirectories_created + 1
      while 100*numDirectories_created >= frac*numDirectories:
        logging.info(" %i%% completed" % frac)
        frac = frac + 1
    logging.info("Done.")

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue
      logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
      inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

    for lepton_selection in self.lepton_selections:
      for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
          continue
        process_name = sample_info["process_name_specific"]
        
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        is_mc = (sample_info["type"] == "mc")
        inputFileList = inputFileLists[sample_name]
        for central_or_shift in self.central_or_shifts:
          if central_or_shift != "central" and not is_mc:
            continue

          # build config files for executing analysis code
          key_analyze_dir = getKey(process_name, lepton_selection, central_or_shift)

          for jobId in inputFileList.keys():
            analyze_job_tuple = (process_name, lepton_selection, central_or_shift, jobId)
            key_analyze_job = getKey(*analyze_job_tuple)
            ntupleFiles = inputFileList[jobId]
            if len(ntupleFiles) == 0:
              logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
              continue

            cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
            logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple)
            rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                 if self.select_rle_output else ""
            histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple)

            self.jobOptions_analyze[key_analyze_job] = {
              'ntupleFiles'              : ntupleFiles,
              'cfgFile_modified'         : cfgFile_modified_path,
              'histogramFile'            : histogramFile_path,
              'logFile'                  : logFile_path,
              'selEventsFileName_output' : rleOutputFile_path,
              'leptonSelection'          : lepton_selection,
              'applyFakeRateWeights'     : "disabled",
              'central_or_shift'         : central_or_shift,
            }
            self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)

            # initialize input and output file names for hadd_stage1
            key_hadd_stage1_dir = getKey(process_name, lepton_selection)
            hadd_stage1_job_tuple = (process_name, lepton_selection)
            key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
            if key_hadd_stage1_job not in self.inputFiles_hadd_stage1:
              self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
            self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
            self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                                                                            "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple)

        # initialize input and output file names for hadd_stage2
        key_hadd_stage1_job = getKey(process_name, lepton_selection)
        key_hadd_stage2_dir = getKey("hadd", lepton_selection)
        key_hadd_stage2_job = getKey(lepton_selection)
        if key_hadd_stage2_job not in self.inputFiles_hadd_stage2:
          self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
        self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
        self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                                                                        "hadd_stage2_%s.root" % lepton_selection)

    logging.info("Creating configuration files to run 'prepareDatacards'")
    processesToCopy = []
    for process in self.prep_dcard_processesToCopy:
      processesToCopy.append(process)
    self.prep_dcard_processesToCopy = processesToCopy
    processesToCopy = []
    for process in self.prep_dcard_signals:
      processesToCopy.append(process)
    self.prep_dcard_signals = processesToCopy
    for histogramToFit in self.histograms_to_fit:
      key_hadd_stage2_job = getKey("Tight")
      key_prep_dcard_dir = getKey("prepareDatacards")
      prep_dcard_job_tuple = (self.channel, histogramToFit)
      key_prep_dcard_job = getKey(histogramToFit)
      datacardFile = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple)
      self.jobOptions_prep_dcard[key_prep_dcard_job] = {
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple),
        'datacardFile' : datacardFile,
        'histogramDir' : self.histogramDir_prep_dcard,
        'histogramToFit' : histogramToFit,
        'label' : None
      }
      self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

      jobOptions_makefile = copy.deepcopy(self.jobOptions_postFit)
      jobOptions_makefile['fit_result'] = os.path.join(
        self.dirs[DKEY_COMBINE_OUTPUT], 'fit_{}'.format(histogramToFit), jobOptions_makefile['target']
      )
      jobOptions_makefile['hadd_stage2'] = self.outputFile_hadd_stage2[key_hadd_stage2_job]
      jobOptions_makefile['prepare_datacard'] = datacardFile
      jobOptions_makefile['data_datacard'] = os.path.join(
        self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_data_%s_%s.root" % prep_dcard_job_tuple
      )
      jobOptions_makefile['pseudodata_datacard'] = os.path.join(
        self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_pseudodata_%s_%s.root" % prep_dcard_job_tuple
      )
      jobOptions_makefile['makefile'] = os.path.join(
        self.dirs[DKEY_COMBINE_OUTPUT], 'Makefile_{}'.format(histogramToFit)
      )
      jobOptions_makefile['stdout'] = os.path.join(
        self.dirs[DKEY_COMBINE_OUTPUT], 'stdout_{}.log'.format(histogramToFit)
      )
      self.createCfg_postFit(jobOptions_makefile)

    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile, make_dependency = "phony_hadd_stage1")
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_postFit(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done.")

    return self.num_jobs
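The hadd bookkeeping above follows a two-stage pattern: stage 1 merges the per-job histograms of a single sample, and stage 2 merges the stage-1 outputs of all samples that share a lepton selection. A minimal sketch, with illustrative sample and file names only:

inputFiles_hadd_stage1 = {
    ('ttH', 'Tight'): ['analyze_ttH_Tight_central_1.root',
                       'analyze_ttH_Tight_central_2.root'],
}
outputFile_hadd_stage1 = {('ttH', 'Tight'): 'hadd_stage1_ttH_Tight.root'}

# stage 2 consumes the stage-1 merges, one entry per lepton selection
inputFiles_hadd_stage2 = {'Tight': [outputFile_hadd_stage1[('ttH', 'Tight')]]}
outputFile_hadd_stage2 = {'Tight': 'hadd_stage2_Tight.root'}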
Example no. 13
def plot(input_files, output_files, title, expected_neff, mode):
  histogram_dict = {}
  for sample_name, sample_entry in input_files.items():
    if not hdfs.isfile(sample_entry['input']):
      logging.error('Could not find file {}'.format(sample_entry['input']))
      continue
    root_file = ROOT.TFile.Open(sample_entry['input'], 'read')
    logging.debug('Opened file {}'.format(sample_entry['input']))
    root_directories = list(filter(
      lambda root_dir: root_dir != None, [
        root_file.Get(os.path.join(key.GetName(), mode, 'genEvt')) \
        for key in root_file.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
      ]
    ))
    if len(root_directories) != 1:
      raise RuntimeError('Expected single directory in %s' % sample_entry['input'])
    root_dir = root_directories[0]
    histogram_dirs = [
      root_dir.Get(key.GetName()) \
      for key in root_dir.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
    ]
    if len(histogram_dirs) != 1:
      raise RuntimeError(
        'Expected single directory containing lumiScale histograms in %s' % sample_entry['input']
      )
    histogram_dir = histogram_dirs[0]
    histograms = [
      key.GetName() for key in histogram_dir.GetListOfKeys() \
      if key.GetClassName().startswith('TH1') and 'lumiScale' in key.GetName()
    ]
    for histogram_name_actual in histograms:
      histogram_name = histogram_name_actual.replace('_lumiScale', '').replace('CMS_ttHl_', '') \
                       if histogram_name_actual != 'lumiScale' else 'central'
      histogram = histogram_dir.Get(histogram_name_actual).Clone()
      histogram.SetDirectory(0)
      if histogram.GetEntries() != sample_entry['nentries'] and mode == 'unbiased':
        raise RuntimeError('Expected {} entries from {} in file {}, but got {} entries'.format(
          sample_entry['nentries'], histogram_name, sample_entry['input'], histogram.GetEntries(),
        ))
      if histogram_name not in histogram_dict:
        histogram_dict[histogram_name] = {
          'histogram' : histogram,
          'nentries'  : histogram.GetEntries(),
          'nfiles'    : 1,
        }
      else:
        histogram_dict[histogram_name]['histogram'].Add(histogram)
        histogram_dict[histogram_name]['nentries'] += histogram.GetEntries()
        histogram_dict[histogram_name]['nfiles'] += 1

    root_file.Close()

  if not histogram_dict:
    logging.error('Could not find histograms for samples {}'.format(', '.join(list(input_files.keys()))))
    return

  if len(set(histogram_dict[histogram_name]['nfiles'] for histogram_name in histogram_dict)) != 1:
    raise RuntimeError(
      'Inconsistent number of files found for samples %s' % ', '.join(list(input_files.keys()))
    )
  if len(set(histogram_dict[histogram_name]['nentries'] for histogram_name in histogram_dict)) != 1:
    raise RuntimeError(
      'Inconsistent number of entries found in samples %s' % ', '.join(list(input_files.keys()))
    )

  min_y = -1
  max_y = -1
  nentries = -1
  for histograms in histogram_dict.values():
    histogram = histograms['histogram']
    y_content = histogram.GetBinContent(1)
    y_error   = histogram.GetBinError(1)

    y_down = y_content - y_error
    y_up   = y_content + y_error

    if min_y < 0:
      min_y = y_down
    if max_y < 0:
      max_y = y_up
    if y_down < min_y:
      min_y = y_down
    if y_up > max_y:
      max_y = y_up

    if nentries < 0:
      nentries = histograms['nentries']
    else:
      assert(nentries == histograms['nentries'])

    if not (y_down < expected_neff < y_up) and mode == 'unbiased':
      logging.warning(
        "Effective event count {} not within {} +- {}".format(expected_neff, y_content, y_error)
      )

  if mode == 'unbiased':
    min_y = min(min_y, expected_neff)
    max_y = max(max_y, expected_neff)
  diff = 0.2 * (max_y - min_y)
  min_y -= diff
  max_y += diff

  canvas = ROOT.TCanvas('c', 'c', 1200, 900)
  canvas.SetGrid()
  ROOT.gStyle.SetOptStat(0)

  legend = ROOT.TLegend(0.1, 0.7, 0.48, 0.9)
  legend.SetHeader('N_{eff} (%d entries)' % nentries)

  expected_histogram = None

  line_width = 3
  marker_style = 20
  fill_style = 4000

  lines = []

  for idx, histogram_name in enumerate(sorted(histogram_dict.keys())):
    histogram = histogram_dict[histogram_name]['histogram']
    color = 2 + idx

    histogram.SetTitle(title)
    histogram.SetAxisRange(min_y, max_y, "Y")
    histogram.SetLineColor(color)
    histogram.SetMarkerColor(color)
    histogram.SetLineWidth(line_width)
    histogram.SetMarkerStyle(marker_style)
    histogram.SetFillStyle(fill_style)
    histogram.Draw("l e1%s" % (" same" if idx > 0 else ""))

    y_content = histogram.GetBinContent(1)
    y_error   = histogram.GetBinError(1)
    y_up      = y_content + y_error
    y_down    = y_content - y_error

    bin_width  = histogram.GetBinWidth(1)
    bin_center = histogram.GetBinCenter(1)
    line_min_x = bin_center - bin_width / 4
    line_max_x = bin_center + bin_width / 4

    line_down = ROOT.TLine(line_min_x, y_down, line_max_x, y_down)
    line_down.SetLineColor(color)
    line_down.SetLineWidth(line_width)
    line_down.Draw()
    lines.append(line_down)

    line_up = ROOT.TLine(line_min_x, y_up, line_max_x, y_up)
    line_up.SetLineColor(color)
    line_up.SetLineWidth(line_width)
    line_up.Draw()
    lines.append(line_up)

    sig_digits = max(8 - int(math.ceil(math.log10(y_content))), 1) if y_content > 0. else 1
    leg_pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits)
    leg_name = leg_pattern % (histogram_name, y_content, y_error)
    legend.AddEntry(histogram, leg_name)

    logging.debug(
      'Effective event count for the sys unc option {} is {} +- {}'.format(
        histogram_name, y_content, y_error
      )
    )

    if not expected_histogram and mode == 'unbiased':
      expected_histogram = histogram.Clone()
      expected_histogram.Reset()
      expected_histogram.SetBinContent(1, expected_neff)
      expected_histogram.SetBinError(1, 0)
      expected_histogram.SetLineColor(ROOT.kBlack)
      expected_histogram.SetMarkerColor(ROOT.kBlack)
      expected_histogram.SetLineWidth(line_width)
      expected_histogram.SetMarkerStyle(marker_style)
      expected_histogram.SetLineStyle(9)
      expected_histogram.SetFillStyle(fill_style)

  if expected_histogram:
    logging.debug('Expecting {} events'.format(expected_neff))
    expected_histogram.Draw("e2 same")
    legend.AddEntry(expected_histogram, 'expected (%.1f)' % expected_neff)

  legend.Draw()

  for output_file in output_files:
    canvas.SaveAs(output_file)

  canvas.Close()
  legend.Delete()
  if expected_histogram:
    expected_histogram.Delete()
  for histogram_name in histogram_dict:
    histogram_dict[histogram_name]['histogram'].Delete()
  for line in lines:
    line.Delete()
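A worked example of the legend formatting in plot() above: for y_content = 1234.5, math.log10 gives about 3.09, the ceiling is 4, so sig_digits = max(8 - 4, 1) = 4, and the integer and fractional digits together come to roughly eight significant figures.

import math

y_content, y_error = 1234.5, 12.3
sig_digits = max(8 - int(math.ceil(math.log10(y_content))), 1) if y_content > 0. else 1
leg_pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits)
print(leg_pattern % ('central', y_content, y_error))
# -> central (1234.5000 #pm 12.3000)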
Example no. 14
                continue
            fp = 'root://cms-xrd-global.cern.ch/%s' % file_cand[0]
            try:
                f = ROOT.TFile.Open(fp, 'read')
                f.Close()
            except Exception:
                break
            logging.debug('Selected file {} ({} events)'.format(
                file_cand[0], file_cand[1]))
            selected_files.append(fp)
            if args.nof_files > 0 and len(selected_files) >= args.nof_files:
                break
        dy_samples[dy_sample] = selected_files

    for dy_sample, files in dy_samples.items():
        if not files:
            logging.warning('Could not find a file for {}'.format(dy_sample))
            continue
        logging.info('Plotting {}'.format(dy_sample))
        output_file_basename = dy_sample.split('/')[1].replace('-', '_')
        if 'ext' in dy_sample:
            output_file_basename += '_ext'
        output_file_fullpath_wo_ext = os.path.join(args.output_dir,
                                                   output_file_basename)
        output_files = map(
            lambda ext: '{}.{}'.format(output_file_fullpath_wo_ext, ext),
            args.extensions)
        title = ' + '.join(
            map(lambda f: os.path.basename(f).replace('.root', ''), files))
        plot(files, output_files, title, output_file_basename)
Example no. 15
        logging.getLogger().setLevel(logging.DEBUG)

    rle_file = args.file
    sample_name = args.sample_name
    output_file = args.output
    grep_directory = args.directory
    grep_individually = args.all
    try:
        sample_name_re = re.compile(sample_name)
    except re.error:
        logging.error(
            "Argument {arg} not a valid regex".format(arg=sample_name))
        sys.exit(1)

    if grep_individually and not grep_directory:
        logging.warning(
            'Option -a/--all has no effect unless you specify -d/--directory')

    if not hdfs.isfile(rle_file):
        logging.error("No such file: '{rle_filename}'".format(
            rle_filename=rle_file, ))
        sys.exit(1)

    if output_file and not hdfs.isdir(os.path.dirname(output_file)):
        logging.error(
            "Parent directory of '{output_file}' doesn't exist".format(
                output_file=output_file, ))
        sys.exit(1)

    if grep_directory and not hdfs.isdir(grep_directory):
        logging.error("Grep directory '{grep_directory}' doesn't exist".format(
            grep_directory=grep_directory, ))
    if era == "2016" and sample_name.startswith(("/DoubleMuon/", "/DoubleEG/", "/Tau/")) and \
       sample_name.find("PromptReco-v3") == -1:
        sample_info["use_it"] = False

if __name__ == '__main__':
    logging.info(
      "Running the jobs with the following systematic uncertainties enabled: %s" % \
      ', '.join(central_or_shifts)
    )

    if sample_filter:
        samples = filter_samples(samples, sample_filter)

    if tau_id_wp:
        logging.warning(
            "Changing hadTau_selection_denominator from {} to {}".format(
                hadTau_selection_denominator, tau_id_wp))
        hadTau_selection_denominator = tau_id_wp

    analysis = analyzeConfig_jetToTauFakeRate(
        configDir=os.path.join("/home", getpass.getuser(), "ttHAnalysis", era,
                               version),
        outputDir=os.path.join("/hdfs/local", getpass.getuser(), "ttHAnalysis",
                               era, version),
        executable_analyze="analyze_jetToTauFakeRate",
        samples=samples,
        charge_selections=["OS"],
        jet_minPt=20.,
        jet_maxPt=1.e+6,
        jet_minAbsEta=-1.,
        jet_maxAbsEta=2.3,
Example no. 17
    def create(self):
        """Creates all necessary config files and runs the MEM -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        # read the file in, sample-by-sample
        # build the dictionary recursively
        # add rle file also to generated cfg files
        # print integrations per job as well!
        # consider more than 1 file per job -- the jobs are split by MEM integration anyway

        rle_filters = self.get_filter() if self.rle_filter_file else {}
        statistics = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue

            if not os.path.exists(sample_info['local_paths'][0]['path']):
                logging.warning("Skipping sample {sample_name}".format(sample_name = sample_name))
                continue

            process_name = sample_info["process_name_specific"]
            logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_addMEM, process_name))
            is_mc = (sample_info["type"] == "mc")
            if self.rle_filter_file:
                assert(process_name in rle_filters)

            inputFileList = generateInputFileList(sample_info, self.max_files_per_job)
            # typically, the analysis ends here and starts looping b/c the smallest unit of work processes
            # at least one file; we need, however, to split the files into event ranges in such a way that
            # each job performs mem_integrations_per_job MEM integrations

            # so what we are going to do is to open each set of files in inputFileList, read the variable
            # requestMEM_*l_*tau and try to gather the event ranges such that each event range
            # performs up to mem_integrations_per_job integrations per job
            memEvtRangeDict = self.memJobList(inputFileList, rle_filters[process_name] if self.rle_filter_file else [])

            for jobId in memEvtRangeDict.keys():

                key_dir = getKey(sample_name)
                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = memEvtRangeDict[jobId]['input_fileset']

                # there should always be at least one input file per job
                assert(len(self.inputFiles[key_file]) > 0), \
                    "No input files for job %s !!" % str(key_file)

                #assert(len(self.inputFiles[key_file]) == 1), "There is more than one input file!"
                self.cfgFiles_addMEM_modified[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i_cfg.py" % (self.channel, process_name, jobId)
                )
                self.shFiles_addMEM_modified[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i.sh" % (self.channel, process_name, jobId)
                )
                self.outputFiles[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_NTUPLES], "%s_%i.root" % (process_name, jobId)
                )
                self.logFiles_addMEM[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS], "addMEM_%s_%s_%i.log" % (self.channel, process_name, jobId)
                )
                self.logFiles_addMEM[key_file] = get_log_version((self.logFiles_addMEM[key_file],))[0]
                self.createCfg_addMEM(
                    self.inputFiles[key_file],
                    memEvtRangeDict[jobId]['event_range'][0],
                    memEvtRangeDict[jobId]['event_range'][1],
                    self.outputFiles[key_file],
                    self.era,
                    sample_info["sample_category"],
                    is_mc,
                    self.cfgFiles_addMEM_modified[key_file],
                    memEvtRangeDict[jobId]['whitelist'],
                )

                # associate the output file with the fileset_id
                #UPDATE: ONE OUTPUT FILE PER SAMPLE!
                fileset_id = memEvtRangeDict[jobId]['fileset_id']
                hadd_output_dir = os.path.join(
                    self.dirs[key_dir][DKEY_FINAL_NTUPLES],
                    '%04d' % (fileset_id // 1000)
                )
                if not os.path.exists(hadd_output_dir):
                    os.makedirs(hadd_output_dir)
                hadd_output = os.path.join(
                    hadd_output_dir, '%s_%i.root' % ('tree', fileset_id) # UPDATE: ADDED
                    #hadd_output_dir, "tree.root" # UPDATE: REMOVED
                )
                if hadd_output not in self.hadd_records:
                    self.hadd_records[hadd_output] = {}
                    self.hadd_records[hadd_output]['output_files'] = []
                self.hadd_records[hadd_output]['fileset_id'] = fileset_id
                self.hadd_records[hadd_output]['output_files'].append(self.outputFiles[key_file])
                self.hadd_records[hadd_output]['process_name'] = process_name
                #self.filesToClean.append(self.outputFiles[key_file])

            # let's sum the number of integration per sample
            nofEntriesMap = {}
            for v in memEvtRangeDict.values():
                if v['fileset_id'] not in nofEntriesMap:
                    nofEntriesMap[v['fileset_id']] = {
                        'nof_entries' : v['nof_entries'],
                    }
            statistics[process_name] = {
                'nof_int'         : sum([entry['nof_int']         for entry in memEvtRangeDict.values()]),
                'nof_entries'     : sum([entry['nof_entries']     for entry in nofEntriesMap.values()]),
                'nof_events_pass' : sum([entry['nof_events_pass'] for entry in memEvtRangeDict.values()]),
                'nof_int_pass'    : sum([entry['nof_int_pass']    for entry in memEvtRangeDict.values()]),
                'nof_zero'        : sum([entry['nof_zero']        for entry in memEvtRangeDict.values()]),
                'nof_jobs'        : len(memEvtRangeDict),
            }

        if self.is_sbatch:
            logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addMEM)
            self.createScript_sbatch()

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_addMEM(lines_makefile)
        self.addToMakefile_hadd(lines_makefile)
        self.createMakefile(lines_makefile)

        ws_len = max([len(kk) + 1 for kk in statistics.keys()])
        total_nof_integrations_sum = sum(x['nof_int']            for x in statistics.values())
        total_nof_entries          = sum(x['nof_entries']        for x in statistics.values())
        total_nof_zero_int         = sum(x['nof_zero']           for x in statistics.values())
        total_nof_jobs             = sum(x['nof_jobs']           for x in statistics.values())
        total_nof_pass             = sum(x['nof_events_pass']    for x in statistics.values())
        total_nof_int_pass_avg     = float(sum(x['nof_int_pass'] for x in statistics.values())) / total_nof_pass
        total_nof_integrations_avg = float(total_nof_integrations_sum) / total_nof_entries
        total_nof_int_per_job = float(total_nof_integrations_sum) / total_nof_jobs
        for k, v in statistics.items():
            if v['nof_entries'] == 0:
                int_per_event = 0.
                evt_pass = 0.
            else:
                int_per_event = float(v['nof_int']) / v['nof_entries']
                evt_pass = (100 * float(v['nof_events_pass']) / v['nof_entries'])
            if v['nof_events_pass'] == 0:
                nof_int_pass = 0.
            else:
                nof_int_pass = float(v['nof_int_pass']) / v['nof_events_pass']
            print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d (%.2f%%) evt pass; %.2f int/evt pass; %d evt 0int)' %
              (k,
               ' ' * (ws_len - len(k)),
               v['nof_int'],
               v['nof_entries'],
               v['nof_jobs'],
               int_per_event,
               v['nof_events_pass'],
               evt_pass,
               nof_int_pass,
               v['nof_zero'],
              )
            )
        print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d evt pass; %.2f int/evt pass; '
              '%.2f int/job pass; %d evt 0int)' %
          ('total',
           ' ' * (ws_len - len('total')),
           total_nof_integrations_sum,
           total_nof_entires,
           total_nof_jobs,
           total_nof_integrations_avg,
           total_nof_pass,
           total_nof_int_pass_avg,
           total_nof_int_per_job,
           total_nof_zero_int,
          )
        )

        if self.max_mem_integrations > 0 and total_nof_integrations_sum > self.max_mem_integrations:
            logging.error("Will not start the jobs (max nof integrations exceeded)!")
            return False
        else:
            logging.info("Done")
            return True
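The '%04d' % (fileset_id // 1000) expression above buckets the merged ntuples into subdirectories of at most 1000 filesets each, which keeps any single directory from accumulating too many files:

for fileset_id in (0, 999, 1000, 12345):
    print('%5d -> %s' % (fileset_id, '%04d' % (fileset_id // 1000)))
#     0 -> 0000
#   999 -> 0000
#  1000 -> 0001
# 12345 -> 0012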
Example no. 18
  def create(self):
    """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system
    """

    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue

      process_name = sample_info["process_name_specific"]
      sample_category = sample_info["sample_category"]
      is_mc = (sample_info["type"] == "mc")

      logging.info("Building dictionaries for sample %s..." % process_name)
      for lepton_selection in self.lepton_selections:
        for lepton_frWeight in self.lepton_frWeights:
          if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
            continue
          if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight", "forBDTtraining" ]:
            continue

          lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)
          for chargeSumSelection in self.chargeSumSelections:
            central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
            central_or_shift_dedicated = self.central_or_shifts if self.runTHweights(sample_info) else self.central_or_shifts_external
            central_or_shifts_extended = central_or_shift_extensions + central_or_shift_dedicated
            for central_or_shift_or_dummy in central_or_shifts_extended:
              process_name_extended = [ process_name, "hadd" ]
              for process_name_or_dummy in process_name_extended:
                if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]:
                  continue

                if central_or_shift_or_dummy not in central_or_shift_extensions and not self.accept_systematics(
                    central_or_shift_or_dummy, is_mc, lepton_selection, chargeSumSelection, sample_info
                ):
                  continue

                key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, chargeSumSelection, central_or_shift_or_dummy)
                for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC ]:
                  if dir_type == DKEY_SYNC and not self.do_sync:
                    continue
                  initDict(self.dirs, [ key_dir, dir_type ])
                  if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                    self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                      "_".join([ lepton_selection_and_frWeight, chargeSumSelection ]), process_name_or_dummy, central_or_shift_or_dummy)
                  else:
                    self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                      "_".join([ lepton_selection_and_frWeight, chargeSumSelection ]), process_name_or_dummy)
    for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]:
      key_dir = getKey(subdirectory)
      for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
        initDict(self.dirs, [ key_dir, dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
          self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
        else:
          self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]:
      if dir_type == DKEY_SYNC and not self.do_sync:
        continue
      initDict(self.dirs, [ dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
        self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
      else:
        self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

    numDirectories = 0
    for key in self.dirs.keys():
      if type(self.dirs[key]) == dict:
        numDirectories += len(self.dirs[key])
      else:
        numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
      if type(self.dirs[key]) == dict:
        for dir_type in self.dirs[key].keys():
          create_if_not_exists(self.dirs[key][dir_type])
        numDirectories_created += len(self.dirs[key])
      else:
        create_if_not_exists(self.dirs[key])
        numDirectories_created = numDirectories_created + 1
      while 100*numDirectories_created >= frac*numDirectories:
        logging.info(" %i%% completed" % frac)
        frac = frac + 1
    logging.info("Done.")

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue
      logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
      inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

    mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights')
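    # e.g. mcClosure_regex.match('Fakeable_mcClosure_e_wFakeRateWeights').group('type') == 'e'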
    for lepton_selection in self.lepton_selections:
      electron_selection = lepton_selection
      muon_selection = lepton_selection

      hadTauVeto_selection = "Tight"
      hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ])

      if lepton_selection == "forBDTtraining":
        electron_selection = "Loose"
        muon_selection = "Loose"
      elif lepton_selection == "Fakeable_mcClosure_e":
        electron_selection = "Fakeable"
        muon_selection = "Tight"
      elif lepton_selection == "Fakeable_mcClosure_m":
        electron_selection = "Tight"
        muon_selection = "Fakeable"

      for lepton_frWeight in self.lepton_frWeights:
        if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
          continue
        if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight", "forBDTtraining" ]:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)

        for chargeSumSelection in self.chargeSumSelections:

          for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
              continue
            process_name = sample_info["process_name_specific"]
            logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
            inputFileList = inputFileLists[sample_name]

            sample_category = sample_info["sample_category"]
            is_mc = (sample_info["type"] == "mc")
            use_th_weights = self.runTHweights(sample_info)

            central_or_shift_dedicated = self.central_or_shifts if use_th_weights else self.central_or_shifts_external
            for central_or_shift in central_or_shift_dedicated:
              if not self.accept_systematics(
                  central_or_shift, is_mc, lepton_selection, chargeSumSelection, sample_info
              ):
                continue

              central_or_shifts_local = []
              if central_or_shift == "central" and not use_th_weights:
                for central_or_shift_local in self.central_or_shifts_internal:
                  if self.accept_systematics(
                      central_or_shift_local, is_mc, lepton_selection, chargeSumSelection, sample_info
                  ):
                    central_or_shifts_local.append(central_or_shift_local)

              logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift))

              # build config files for executing analysis code
              key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, chargeSumSelection, central_or_shift)

              for jobId in inputFileList.keys():

                analyze_job_tuple = (process_name, lepton_selection_and_frWeight, chargeSumSelection, central_or_shift, jobId)
                key_analyze_job = getKey(*analyze_job_tuple)
                ntupleFiles = inputFileList[jobId]
                if len(ntupleFiles) == 0:
                  logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
                  continue

                syncOutput = ''
                syncTree = ''
                if self.do_sync:
                  if chargeSumSelection != 'OS':
                    continue
                  mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight)
                  if lepton_selection_and_frWeight == 'Tight':
                    syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift))
                    syncTree   = 'syncTree_%s_SR' % self.channel.replace('_', '')
                  elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights':
                    syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift))
                    syncTree   = 'syncTree_%s_Fake' % self.channel.replace('_', '')
                  elif mcClosure_match:
                    mcClosure_type = mcClosure_match.group('type')
                    syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type))
                    syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel.replace('_', ''), mcClosure_type)
                  else:
                    continue
                if syncTree and central_or_shift != "central":
                  syncTree = os.path.join(central_or_shift, syncTree)
                syncRLE = ''
                if self.do_sync and self.rle_select:
                  syncRLE = self.rle_select % syncTree
                  if not os.path.isfile(syncRLE):
                    logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE)
                    continue
                if syncOutput:
                  self.inputFiles_sync['sync'].append(syncOutput)

                cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
                logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % analyze_job_tuple)
                rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                     if self.select_rle_output else ""
                histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%s_%i.root" % analyze_job_tuple)
                branchName_memOutput = '%s_%s' % (self.MEMbranch, self.get_addMEM_systematics(central_or_shift)) \
                                       if self.MEMbranch else ''

                self.jobOptions_analyze[key_analyze_job] = {
                  'ntupleFiles'              : ntupleFiles,
                  'cfgFile_modified'         : cfgFile_modified_path,
                  'histogramFile'            : histogramFile_path,
                  'logFile'                  : logFile_path,
                  'selEventsFileName_output' : rleOutputFile_path,
                  'electronSelection'        : electron_selection,
                  'muonSelection'            : muon_selection,
                  'apply_leptonGenMatching'  : self.apply_leptonGenMatching,
                  'hadTauSelection'          : hadTauVeto_selection,
                  'chargeSumSelection'       : chargeSumSelection,
                  'applyFakeRateWeights'     : self.applyFakeRateWeights if lepton_selection != "Tight" else "disabled",
                  'central_or_shift'         : central_or_shift,
                  'central_or_shifts_local'  : central_or_shifts_local,
                  'selectBDT'                : self.isBDTtraining,
                  'branchName_memOutput'     : branchName_memOutput,
                  'syncOutput'               : syncOutput,
                  'syncTree'                 : syncTree,
                  'syncRLE'                  : syncRLE,
                  'apply_hlt_filter'         : self.hlt_filter,
                  'useNonNominal'            : self.use_nonnominal,
                  'fillGenEvtHistograms'     : True,
                  'isControlRegion'          : self.isControlRegion,
                }
                self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection)

                # initialize input and output file names for hadd_stage1
                key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight, chargeSumSelection)
                hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight, chargeSumSelection)
                key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
                if key_hadd_stage1_job not in self.inputFiles_hadd_stage1:
                  self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                                                                                "hadd_stage1_%s_%s_%s.root" % hadd_stage1_job_tuple)

            if self.isBDTtraining or self.do_sync:
              continue

            # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5
            key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, chargeSumSelection)
            key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight, chargeSumSelection)
            hadd_stage1_5_job_tuple = (lepton_selection_and_frWeight, chargeSumSelection)
            key_hadd_stage1_5_job = getKey(*hadd_stage1_5_job_tuple)
            if key_hadd_stage1_5_job not in self.inputFiles_hadd_stage1_5:
              self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
            self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
            self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST],
                                                                        "hadd_stage1_5_%s_%s.root" % hadd_stage1_5_job_tuple)

          if self.isBDTtraining or self.do_sync:
            continue

          ## build the list of processes needed for the hadd of the _Convs and _fake contributions;
          ## the tH samples with alternative couplings could be removed here
          sample_categories = []
          sample_categories.extend(self.nonfake_backgrounds)
          sample_categories.extend(self.ttHProcs)
          processes_input_base = self.get_processes_input_base(sample_categories)

          # sum the fake background contributions over all MC samples
          # input processes: TT_fake, TTW_fake, TTWW_fake, ...
          # output process: fakes_mc
          key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, chargeSumSelection)
          key_addBackgrounds_dir = getKey("addBackgrounds")
          addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight, chargeSumSelection)
          key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple)
          processes_input = []
          for process_input_base in processes_input_base:
            if "HH" in process_input_base:
              continue
            processes_input.append("%s_fake" % process_input_base)
          self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = {
            'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple),
            'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_fakes_tuple),
            'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_fakes_tuple),
            'categories' : [ getHistogramDir(self.channel, lepton_selection, lepton_frWeight, chargeSumSelection) ],
            'processes_input' : processes_input,
            'process_output' : "fakes_mc"
          }
          self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes])

          # sum the conversion background contributions over all MC samples
          # input processes: TT_Convs, TTW_Convs, TTWW_Convs, ...
          # output process: Convs
          addBackgrounds_job_Convs_tuple = ("Convs", lepton_selection_and_frWeight, chargeSumSelection)
          key_addBackgrounds_job_Convs = getKey(*addBackgrounds_job_Convs_tuple)
          processes_input = []
          for process_input_base in self.convs_backgrounds:
            if "HH" in process_input_base:
              continue
            processes_input.append("%s_Convs" % process_input_base)
          self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs] = {
            'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_Convs_tuple),
            'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_Convs_tuple),
            'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_Convs_tuple),
            'categories' : [ getHistogramDir(self.channel, lepton_selection, lepton_frWeight, chargeSumSelection) ],
            'processes_input' : processes_input,
            'process_output' : "Convs"
          }
          self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs])

          # initialize input and output file names for hadd_stage2
          key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, chargeSumSelection)
          key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight, chargeSumSelection)
          hadd_stage2_job_tuple = (lepton_selection_and_frWeight, chargeSumSelection)
          key_hadd_stage2_job = getKey(*hadd_stage2_job_tuple)
          if key_hadd_stage2_job not in self.inputFiles_hadd_stage2:
            self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
          if lepton_selection == "Tight":
            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'])
            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]['outputFile'])
          self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
          self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                                                                          "hadd_stage2_%s_%s.root" % hadd_stage2_job_tuple)

    if self.isBDTtraining or self.do_sync:
      if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        if self.isBDTtraining:
          self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
        elif self.do_sync:
          self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating Makefile")
      lines_makefile = []
      if self.isBDTtraining:
        self.addToMakefile_analyze(lines_makefile)
        self.addToMakefile_hadd_stage1(lines_makefile)
      elif self.do_sync:
        self.addToMakefile_syncNtuple(lines_makefile)
        outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel)
        self.outputFile_sync['sync'] = outputFile_sync_path
        self.addToMakefile_hadd_sync(lines_makefile)
      else:
        raise ValueError("Internal logic error")
      self.addToMakefile_validate(lines_makefile)
      self.targets.extend(self.phoniesToAdd)
      self.createMakefile(lines_makefile)
      logging.info("Done.")
      return self.num_jobs

    logging.info("Creating configuration files to run 'addBackgroundFakes'")
    for chargeSumSelection in self.chargeSumSelections:
      key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"), chargeSumSelection)
      key_addFakes_dir = getKey("addBackgroundLeptonFakes")
      key_addFakes_job = getKey("data_fakes", chargeSumSelection)
      category_sideband = "{}_{}_Fakeable_wFakeRateWeights".format(self.channel, chargeSumSelection)
      self.jobOptions_addFakes[key_addFakes_job] = {
        'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addFakes_dir][DKEY_CFGS], "addBackgroundLeptonFakes_%s_cfg.py" % chargeSumSelection),
        'outputFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_HIST], "addBackgroundLeptonFakes_%s.root" % chargeSumSelection),
        'logFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_LOGS], "addBackgroundLeptonFakes_%s.log" % chargeSumSelection),
        'category_signal' : "{}_{}_Tight".format(self.channel, chargeSumSelection),
        'category_sideband' : category_sideband
      }
      self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job])
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), chargeSumSelection)
      self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])

    logging.info("Creating configuration files to run 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
      key_prep_dcard_dir = getKey("prepareDatacards")
      if "OS" in self.chargeSumSelections:
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
        prep_dcard_job_tuple = (self.channel, "OS", histogramToFit)
        key_prep_dcard_job = getKey("OS", histogramToFit)
        self.jobOptions_prep_dcard[key_prep_dcard_job] = {
          'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % prep_dcard_job_tuple),
          'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % prep_dcard_job_tuple),
          'histogramDir' : self.histogramDir_prep_dcard,
          'histogramToFit' : histogramToFit,
          'label' : None
        }
        self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

      if "SS" in self.chargeSumSelections:
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
        prep_dcard_job_tuple = (self.channel, "SS", histogramToFit)
        key_prep_dcard_job = getKey("SS", histogramToFit)
        self.jobOptions_prep_dcard[key_prep_dcard_job] = {
          'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % prep_dcard_job_tuple),
          'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % prep_dcard_job_tuple),
          'histogramDir' : self.histogramDir_prep_dcard_SS,
          'histogramToFit' : histogramToFit,
          'label' : 'SS'
        }
        self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

      # add shape templates for the following systematic uncertainties:
      #  - 'CMS_ttHl_Clos_norm_e'
      #  - 'CMS_ttHl_Clos_shape_e'
      #  - 'CMS_ttHl_Clos_norm_m'
      #  - 'CMS_ttHl_Clos_shape_m'
      for chargeSumSelection in self.chargeSumSelections:
        key_prep_dcard_job = getKey(chargeSumSelection, histogramToFit)
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), chargeSumSelection)
        key_add_syst_fakerate_dir = getKey("addSystFakeRates")
        add_syst_fakerate_job_tuple = (self.channel, chargeSumSelection, histogramToFit)
        key_add_syst_fakerate_job = getKey(chargeSumSelection, histogramToFit)
        self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = {
          'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'],
          'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_%s_cfg.py" % add_syst_fakerate_job_tuple),
          'outputFile' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s_%s.root" % add_syst_fakerate_job_tuple),
          'category' : self.channel,
          'histogramToFit' : histogramToFit,
          'plots_outputFileName' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_PLOT], "addSystFakeRates.png")
        }
        histogramDir_nominal = None
        if chargeSumSelection == "OS":
          histogramDir_nominal = self.histogramDir_prep_dcard
        elif chargeSumSelection == "SS":
          histogramDir_nominal = self.histogramDir_prep_dcard_SS
        else:
          raise ValueError("Invalid parameter 'chargeSumSelection' = %s !!" % chargeSumSelection)
        for lepton_type in [ 'e', 'm' ]:
          lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type
          if lepton_mcClosure not in self.lepton_selections:
            continue
          lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled")
          key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight, chargeSumSelection)
          histogramDir_mcClosure = self.mcClosure_dir['%s_%s' % (lepton_mcClosure, chargeSumSelection)]
          self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({
            'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections,
            'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job],
            'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit),
            'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'],
            'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit)
          })
        self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job])

    logging.info("Creating configuration files to run 'makePlots'")
    key_makePlots_dir = getKey("makePlots")
    if "OS" in self.chargeSumSelections:
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
      key_makePlots_job = getKey("OS")
      self.jobOptions_make_plots[key_makePlots_job] = {
        'executable' : self.executable_make_plots,
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
        'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
        'histogramDir' : self.histogramDir_prep_dcard,
        'label' : self.channel,
        'make_plots_backgrounds' : self.make_plots_backgrounds
      }
      self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
    if "SS" in self.chargeSumSelections:
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
      key_makePlots_job = getKey("SS")
      self.jobOptions_make_plots[key_makePlots_job] = {
        'executable' : self.executable_make_plots,
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_SS_cfg.py" % self.channel),
        'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s_SS.png" % self.channel),
        'histogramDir' : self.histogramDir_prep_dcard_SS,
        'label' : "{} SS".format(self.channel),
        'make_plots_backgrounds' : self.make_plots_backgrounds
      }
      self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
    if "Fakeable_mcClosure" in self.lepton_selections: #TODO
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
      key_makePlots_job = getKey("OS")
      self.jobOptions_make_plots[key_makePlots_job] = {
        'executable' : self.executable_make_plots_mcClosure,
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_mcClosure_%s_cfg.py" % self.channel),
        'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_mcClosure_%s.png" % self.channel)
      }
      self.createCfg_makePlots_mcClosure(self.jobOptions_make_plots[key_makePlots_job])

    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
    self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel)
    self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel)
    self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel)
    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes)
      self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_add_syst_fakerate(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.addToMakefile_validate(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done.")

    return self.num_jobs
Example n. 19
    def submitJob(
        self,
        inputFiles,
        executable,
        command_line_parameter,
        outputFilePath,
        outputFiles,
        scriptFile,
        logFile=None,
        skipIfOutputFileExists=False,
        job_template_file='sbatch-node.sh.template',
        copy_output_file=True,
        nof_submissions=0,
    ):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """

        logging.debug("<sbatchManager::submitJob>: job_template_file = '%s'" %
                      job_template_file)

        job_template_file = os.path.join(jinja_template_dir, job_template_file)
        job_template = open(job_template_file, 'r').read()

        # derive the log file path from the script file name if none was given
        if not logFile:
            if not self.logFileDir:
                raise ValueError(
                    "Please call 'setLogFileDir' before calling 'submitJob' !!"
                )
            logFile = os.path.join(
                self.logFileDir,
                os.path.basename(scriptFile).replace(".sh", ".log"))

        # skip the job only if all expected output files already exist and pass validation
        outputFiles_fullpath = map(
            lambda outputFile: os.path.join(outputFilePath, outputFile),
            outputFiles)
        if skipIfOutputFileExists:
            outputFiles_missing = [
                outputFile for outputFile in outputFiles_fullpath \
                if not is_file_ok(outputFile, validate_outputs = True, min_file_size = self.min_file_size)
            ]
            if not outputFiles_missing:
                logging.debug(
                  "output file(s) = %s exist(s) --> skipping !!" % \
                  '; '.join(map(lambda x: "'%s'" % x, outputFiles_fullpath))
                )
                return

        if not self.workingDir:
            raise ValueError(
                "Please call 'setWorkingDir' before calling 'submitJob' !!")

        if not self.cmssw_base_dir:
            logging.warning("cmssw_base_dir not set, setting it to '%s'" %
                            os.environ.get('CMSSW_BASE'))
            self.cmssw_base_dir = os.environ.get('CMSSW_BASE')

        job_dir = self.get_job_dir()

        # create script for executing jobs
        wrapper_log_file = logFile.replace('.log', '_wrapper.log')
        executable_log_file = logFile.replace('.log', '_executable.log')
        wrapper_log_file, executable_log_file = get_log_version(
            (wrapper_log_file, executable_log_file))

        sbatch_command = "sbatch --partition={partition} --output={output} --comment='{comment}' " \
                         "{max_mem} {args} {cmd}".format(
          partition = self.queue,
          output    = wrapper_log_file,
          comment   = self.pool_id,
          args      = self.sbatchArgs,
          cmd       = scriptFile,
          max_mem   = '--mem={}'.format(self.max_mem) if self.max_mem else '',
        )

        two_pow_sixteen = 65536
        random.seed((abs(hash(command_line_parameter))) % two_pow_sixteen)
        max_delay = 60
        random_delay = random.randint(0, max_delay)
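        # Seeding with a hash of the job's command line makes the start-up delay
        # deterministic per job while spreading the delays over [0, max_delay]
        # seconds, presumably to stagger jobs that start simultaneously.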

        script = jinja2.Template(job_template).render(
            working_dir=self.workingDir,
            cmssw_base_dir=self.cmssw_base_dir,
            job_dir=job_dir,
            job_template_file=job_template_file,
            exec_name=executable,
            command_line_parameter=command_line_parameter,
            inputFiles=" ".join(inputFiles),
            outputDir=outputFilePath,
            outputFiles=" ".join(outputFiles),
            wrapper_log_file=wrapper_log_file,
            executable_log_file=executable_log_file,
            script_file=scriptFile,
            RUNNING_COMMAND=sbatch_command,
            random_sleep=random_delay,
            copy_output_file=copy_output_file,
        )
        logging.debug("writing sbatch script file = '%s'" % scriptFile)
        with codecs.open(scriptFile, "w", "utf-8") as f:
            f.write(script)
            f.flush()
            os.fsync(f.fileno())

        if self.dry_run:
            return

        nof_submissions += 1
        job = {
            'sbatch_command':
            sbatch_command,
            'status':
            Status.in_queue,
            'log_wrap':
            wrapper_log_file,
            'log_exec':
            executable_log_file,
            'args': (
                inputFiles,
                executable,
                command_line_parameter,
                outputFilePath,
                outputFiles,
                scriptFile,
                logFile,
                skipIfOutputFileExists,
                job_template_file,
                nof_submissions,
            ),
            'nof_submissions':
            nof_submissions,
            'outputFiles':
            outputFiles_fullpath,
        }
        self.queuedJobs.append(job)
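Note that submitJob() only renders the wrapper script and appends the job to an in-memory queue; the actual sbatch call is deferred to poll() (see Example n. 21 below), which drains queuedJobs while keeping at most maxSubmittedJobs jobs in flight.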
Example n. 20
  def __init__(self,
        configDir,
        outputDir,
        executable_analyze,
        cfgFile_analyze,
        samples,
        hadTauVeto_selection,
        applyFakeRateWeights,
        central_or_shifts,
        max_files_per_job,
        era,
        use_lumi,
        lumi,
        check_output_files,
        running_method,
        num_parallel_jobs,
        executable_addBackgrounds,
        executable_addBackgroundJetToTauFakes,
        histograms_to_fit,
        select_rle_output         = False,
        executable_prep_dcard     = "prepareDatacards",
        executable_add_syst_dcard = "addSystDatacards",
        verbose                   = False,
        hlt_filter                = False,
        dry_run                   = False,
        isDebug                   = False,
        use_home                  = True,
        do_sync                   = False,
        rle_select                = '',
        use_nonnominal            = False,
      ):
    analyzeConfig.__init__(self,
      configDir                 = configDir,
      outputDir                 = outputDir,
      executable_analyze        = executable_analyze,
      channel                   = "ttZctrl",
      samples                   = samples,
      central_or_shifts         = central_or_shifts,
      max_files_per_job         = max_files_per_job,
      era                       = era,
      use_lumi                  = use_lumi,
      lumi                      = lumi,
      check_output_files        = check_output_files,
      running_method            = running_method,
      num_parallel_jobs         = num_parallel_jobs,
      histograms_to_fit         = histograms_to_fit,
      triggers                  = [ '1e', '1mu', '2e', '2mu', '1e1mu' ],
      executable_prep_dcard     = executable_prep_dcard,
      executable_add_syst_dcard = executable_add_syst_dcard,
      verbose                   = verbose,
      dry_run                   = dry_run,
      isDebug                   = isDebug,
      use_home                  = use_home,
      do_sync                   = do_sync,
    )

    self.lepton_selections = [ "Tight", "Fakeable" ]
    self.lepton_frWeights = [ "enabled", "disabled" ]
    self.hadTauVeto_selection_part2 = hadTauVeto_selection
    self.applyFakeRateWeights = applyFakeRateWeights
    run_mcClosure = 'central' not in self.central_or_shifts or len(central_or_shifts) > 1 or self.do_sync
    if self.era != '2017':
      logging.warning('mcClosure for lepton FR not possible for era %s' % self.era)
      run_mcClosure = False
    if run_mcClosure:
      # Run MC closure jobs only if the analysis is run w/ (at least some) systematic uncertainties
      # self.lepton_and_hadTau_selections.extend([ "Fakeable_mcClosure_all" ]) #TODO
      pass

    self.lepton_genMatches = [ "3l0g0j", "2l1g0j", "2l0g1j", "1l2g0j", "1l1g1j", "1l0g2j", "0l3g0j", "0l2g1j", "0l1g2j", "0l0g3j" ]

    self.apply_leptonGenMatching = None
    self.lepton_genMatches_nonfakes = []
    self.lepton_genMatches_conversions = []
    self.lepton_genMatches_fakes = []
    if applyFakeRateWeights == "3lepton":
      self.apply_leptonGenMatching = True
      for lepton_genMatch in self.lepton_genMatches:
        if lepton_genMatch.endswith("0g0j"):
          self.lepton_genMatches_nonfakes.append(lepton_genMatch)
        elif lepton_genMatch.endswith("0j"):
          self.lepton_genMatches_conversions.append(lepton_genMatch)
        else:
          self.lepton_genMatches_fakes.append(lepton_genMatch)
      if run_mcClosure:
        self.lepton_selections.extend([ "Fakeable_mcClosure_e", "Fakeable_mcClosure_m" ])
    else:
      raise ValueError("Invalid Configuration parameter 'applyFakeRateWeights' = %s !!" % applyFakeRateWeights)

    self.executable_addBackgrounds = executable_addBackgrounds
    self.executable_addFakes = executable_addBackgroundJetToTauFakes

    self.nonfake_backgrounds = [ "TT", "TTW", "TTZ", "TTWW", "EWK", "Rares", "tHq", "tHW", "VH" ]

    self.cfgFile_analyze = os.path.join(self.template_dir, cfgFile_analyze)
    self.prep_dcard_processesToCopy = [ "data_obs" ] + self.nonfake_backgrounds + [ "conversions", "fakes_data", "fakes_mc" ]
    self.histogramDir_prep_dcard = "ttZctrl_Tight"
    self.make_plots_backgrounds = [ "TTW", "TTZ", "TTWW", "EWK", "Rares", "tHq", "tHW" ] + [ "conversions", "fakes_data" ]
    self.cfgFile_make_plots = os.path.join(self.template_dir, "makePlots_ttZctrl_cfg.py")
    self.make_plots_signal = "TTZ"

    self.select_rle_output = select_rle_output
    self.rle_select = rle_select
    self.use_nonnominal = use_nonnominal
    self.hlt_filter = hlt_filter
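For context, a minimal driver for this configuration class might look as follows. This is a hedged sketch: the module path, the sample dictionary, and all concrete values are assumptions made for illustration, not taken from the snippet above.

# Hypothetical usage sketch -- module path and all concrete values are illustrative.
from tthAnalysis.HiggsToTauTau.analyzeConfig_ttZctrl import analyzeConfig_ttZctrl

samples = {}  # placeholder: dict of sample_name -> sample_info, as used throughout

cfg = analyzeConfig_ttZctrl(
    configDir                             = "/home/user/ttHAnalysis/2017/cfgs",
    outputDir                             = "/hdfs/local/user/ttHAnalysis/2017",
    executable_analyze                    = "analyze_ttZctrl",
    cfgFile_analyze                       = "analyze_ttZctrl_cfg.py",
    samples                               = samples,
    hadTauVeto_selection                  = "dR03mvaMedium",
    applyFakeRateWeights                  = "3lepton",  # the only value accepted above
    central_or_shifts                     = [ "central" ],
    max_files_per_job                     = 10,
    era                                   = "2017",
    use_lumi                              = True,
    lumi                                  = 41530.,
    check_output_files                    = True,
    running_method                        = "sbatch",
    num_parallel_jobs                     = 8,
    executable_addBackgrounds             = "addBackgrounds",
    executable_addBackgroundJetToTauFakes = "addBackgroundLeptonFakes",
    histograms_to_fit                     = { "EventCounter" : {} },
)
num_jobs = cfg.create()  # writes cfgs, sbatch scripts and the Makefile; returns job count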
Example n. 21
    def poll(self, nonBlocking):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """
        text_line = '-' * 120

        # Set a delimiter that separates the entries of different jobs
        delimiter = ','
        # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
        # 1) squeue -h -u {{user}} -o '%i %256k'
        #      Collects the list of running jobs
        #        a) -h omits header
        #        b) -u {{user}} looks only for jobs submitted by {{user}}
        #        c) -o '%i %256k' specifies the output format
        #           i)  %i -- job ID (1st column)
        #           ii) %256k -- comment with width of 256 characters (2nd column)
        #               If the job has no comments, the entry simply reads (null)
        # 2) grep {{comment}}
        #       Filter the jobs by the comment which must be unique per sbatchManager instance at all times
        # 3) awk '{print $1}'
        #       Filter only the jobIds out
        # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
        #       Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
        command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                           "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
        command = jinja2.Template(command_template).render(
            user=self.user,
            pool_id_length=self.max_pool_id_length,
            comment=self.pool_id,
            delimiter=delimiter)
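        # For illustration, the rendered command looks roughly like this (user name
        # and pool ID are placeholders):
        #   squeue -h -u someuser -o '%i %256k' | grep <pool_id> | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'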

        # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
        # even if some of them have already finished
        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        while nofJobs_left > 0:
            # Get the list of jobs submitted to batch system and convert their jobIds to a set
            poll_result, poll_result_err = '', ''
            while True:
                poll_result, poll_result_err = run_cmd(command,
                                                       do_not_log=False,
                                                       return_stderr=True)
                if not poll_result and poll_result_err:
                    logging.warning(
                        'squeue caught an error: {squeue_error}'.format(
                            squeue_error=poll_result_err))
                else:
                    break
                # sleep a minute and then try again
                # in principle we could limit the number of retries, but hopefully that's not necessary
                logging.debug("sleeping for %i seconds." % 60)
                time.sleep(60)
            polled_ids = set()
            if poll_result != '':
                polled_ids = set(poll_result.split(delimiter))

            # Check if number of jobs submitted to batch system is below maxSubmittedJobs;
            # if it is, take jobs from queuedJobs list and submit them,
            # until a total of maxSubmittedJobs is submitted to batch system
            nofJobs_toSubmit = min(len(self.queuedJobs),
                                   self.maxSubmittedJobs - len(polled_ids))
            if nofJobs_toSubmit > 0:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs."
                    .format(len(polled_ids), len(self.queuedJobs),
                            nofJobs_toSubmit))
            else:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing."
                    .format(len(polled_ids), len(self.queuedJobs)))
            for i in range(0, nofJobs_toSubmit):
                # randomly submit a job from the queue
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_idx = len(self.queuedJobs) - 1
                random_idx = random.randint(0, max_idx)
                job = self.queuedJobs.pop(random_idx)
                job['status'] = Status.submitted
                job_id = self.submit(job['sbatch_command'])
                self.submittedJobs[job_id] = job

            # Now check status of jobs submitted to batch system:
            # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
            # jobs that have finished already
            finished_ids = list(jobIds_set - polled_ids)

            # Do not poll anything if currently there are no finished jobs
            if finished_ids:
                # Based on the job's exit code, decide whether the job has failed or completed successfully
                # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
                # Therefore, we want to restrict the output by grepping specific job IDs
                # There's another problem with that: the length of a bash command is limited by ARG_MAX kernel variable,
                # which is of order 2e6
                # This means that we have to split the job IDs into chunks each of which we have to check separately
                finished_ids_chunks = [
                    finished_ids[i:i + self.max_nof_greps]
                    for i in range(0, len(finished_ids), self.max_nof_greps)
                ]
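                # e.g. with max_nof_greps = 3, IDs [a, b, c, d, e] are checked in
                # chunks [a, b, c] and [d, e], keeping each command below ARG_MAX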
                for finished_ids_chunk in finished_ids_chunks:
                    completion = self.check_job_completion(finished_ids_chunk)
                    completed_jobs, running_jobs, failed_jobs = [], [], []
                    for job_id, details in completion.iteritems():
                        if details.status == Status.completed:
                            completed_jobs.append(job_id)
                        elif details.status == Status.running:
                            running_jobs.append(job_id)
                        else:
                            failed_jobs.append(job_id)
                    # If there are any failed jobs, throw
                    if failed_jobs:

                        failed_jobs_str = ','.join(failed_jobs)
                        errors = [
                            completion[job_id].status for job_id in failed_jobs
                        ]
                        logging.error(
                            "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}"
                            .format(
                                jobIds=failed_jobs_str,
                                reasons=', '.join(map(Status.toString,
                                                      errors)),
                            ))

                        # Let's print a table where the first column corresponds to the job ID
                        # and the second column lists the exit code, the derived exit code, the status
                        # and the classification of the failed job
                        logging.error("Error table:")
                        for job_id in failed_jobs:
                            sys.stderr.write(
                                "{jobId} {exitCode} {derivedExitCode} {state} {status}\n"
                                .format(
                                    jobId=job_id,
                                    exitCode=completion[job_id].exit_code,
                                    derivedExitCode=completion[job_id].
                                    derived_exit_code,
                                    state=completion[job_id].state,
                                    status=Status.toString(
                                        completion[job_id].status),
                                ))

                        sys.stderr.write('%s\n' % text_line)
                        for failed_job in failed_jobs:
                            for log in zip(['wrapper', 'executable'],
                                           ['log_wrap', 'log_exec']):
                                logfile = self.submittedJobs[failed_job][
                                    log[1]]
                                if os.path.isfile(logfile):
                                    logfile_contents = open(logfile,
                                                            'r').read()
                                else:
                                    logfile_contents = '<file is missing>'
                                sys.stderr.write(
                                    'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'
                                    .format(
                                        id=failed_job,
                                        description=log[0],
                                        path=logfile,
                                        log=logfile_contents,
                                        line=text_line,
                                    ))

                            if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                               completion[failed_job].status == Status.io_error:
                                # The job is eligible for resubmission if the job hasn't been resubmitted more
                                # than a preset limit of resubmissions AND if the job failed due to I/O errors
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=failed_job,
                                        args=self.submittedJobs[failed_job]
                                        ['args'],
                                        reason=Status.toString(
                                            completion[failed_job].status),
                                        attempt=self.submittedJobs[failed_job]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[failed_job]['args'])
                                # The old ID must be deleted, b/c otherwise it would be used to compare against
                                # squeue output and we would resubmit the failed job ad infinitum
                                del self.submittedJobs[failed_job]
                            else:
                                # We've exceeded the maximum number of resubmissions -> fail the workflow
                                raise Status.raiseError(
                                    completion[failed_job].status)
                    else:
                        logging.debug(
                            "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}"
                            .format(
                                completedIds=','.join(completed_jobs),
                                runningInfo='(%s still running)' %
                                ','.join(running_jobs) if running_jobs else '',
                            ))
                    # Mark successfully finished jobs as completed so that we won't request their status again;
                    # otherwise they would remain in the 'submitted' state
                    for job_id in completed_jobs:
                        if not all(
                                map(
                                    lambda outputFile: is_file_ok(
                                        outputFile,
                                        validate_outputs=True,
                                        min_file_size=self.min_file_size), self
                                    .submittedJobs[job_id]['outputFiles'])):
                            if self.submittedJobs[job_id][
                                    'nof_submissions'] < self.max_resubmissions:
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=job_id,
                                        args=self.submittedJobs[job_id]
                                        ['args'],
                                        attempt=self.submittedJobs[job_id]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[job_id]['args'])
                                del self.submittedJobs[job_id]
                            else:
                                raise ValueError(
                                    "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                    "file {output} yet the job still exits w/o any errors"
                                    .format(
                                        id=job_id,
                                        output=', '.join(
                                            self.submittedJobs[job_id]
                                            ['outputFiles']),
                                    ))
                        else:
                            # Job completed just fine
                            self.submittedJobs[job_id][
                                'status'] = Status.completed

            jobIds_set = set([
                job_id for job_id in self.submittedJobs
                if self.submittedJobs[job_id]['status'] == Status.submitted
            ])
            nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
            logging.info(
                "Waiting for sbatch to finish (%d job(s) still left) ..." %
                nofJobs_left)
            if nofJobs_left > 0:
                if nonBlocking:
                    return False
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_delay = 300
                random_delay = random.randint(0, max_delay)
                logging.debug("sleeping for %i seconds." % random_delay)
                time.sleep(self.poll_interval + random_delay)
            else:
                break

        return True
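The nonBlocking flag turns poll() into a single status check that returns False while jobs are still pending. A hypothetical non-blocking driver loop, assuming m is an already configured sbatchManager instance:

# Hypothetical non-blocking polling loop; 'm' is an already configured sbatchManager.
import time

while not m.poll(nonBlocking = True):  # False means jobs are still queued or running
    time.sleep(30)                     # do other work here before checking again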
Example n. 22
    def create(self):
        """Creates all necessary config files and runs the Ntuple production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if is_mc and process_name not in self.pileup_histograms:
                raise ValueError("Missing PU distribution for %s in file %s" %
                                 (process_name, self.pileup))

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList = generateInputFileList(sample_info,
                                                  self.max_files_per_job)
            key_dir = getKey(sample_name)
            subDirs = list(
                map(
                    lambda y: os.path.join(self.dirs[key_dir][DKEY_NTUPLES],
                                           '%04d' % y),
                    set(map(lambda x: x // 1000, inputFileList.keys()))))
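            # e.g. jobIds 0..999 end up in subdirectory '0000' and 1000..1999 in
            # '0001', matching the "%04d" % (jobId // 1000) pattern used below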
            for subDir in subDirs:
                create_if_not_exists(subDir)
            for jobId in inputFileList.keys():

                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = inputFileList[jobId]
                if len(self.inputFiles[key_file]) == 0:
                    logging.warning(
                        "ntupleFiles['%s'] = %s --> skipping job !!" %
                        (key_file, self.inputFiles[key_file]))
                    continue
                self.cfgFiles_prodNtuple_modified[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "produceNtuple_%s_%i_cfg.py" % (process_name, jobId))
                self.outputFiles[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_NTUPLES], "%04d" % (jobId // 1000),
                    "tree_%i.root" % jobId)
                self.logFiles_prodNtuple[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS],
                    "produceNtuple_%s_%i.log" % (process_name, jobId))
                hlt_paths = sample_info["hlt_paths"] if not is_mc else []
                hlt_cuts = list(
                    Triggers(self.era).triggers_flat
                ) if self.preselection_cuts["applyHLTcut"] else []
                jobOptions = {
                    'inputFiles': self.inputFiles[key_file],
                    'cfgFile_modified':
                    self.cfgFiles_prodNtuple_modified[key_file],
                    'outputFile': self.outputFiles[key_file],
                    'is_mc': is_mc,
                    'random_seed': jobId,
                    'process_name': process_name,
                    'category_name': sample_info["sample_category"],
                    'triggers': hlt_paths,
                    'HLTcuts': hlt_cuts,
                }
                self.createCfg_prodNtuple(jobOptions)

        num_jobs = 0
        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            num_jobs = self.createScript_sbatch()
            logging.info("Generated %i job(s)" % num_jobs)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_prodNtuple(lines_makefile)
        self.createMakefile(lines_makefile)

        logging.info("Done")
        return num_jobs
Example n. 23
                sample_name=sample_name, ))

            output_dir_parent = os.path.join(output_dir, sample_name)
            if not os.path.isdir(output_dir_parent):
                os.makedirs(output_dir_parent)

            for sample_subdir_basename in os.listdir(sample_path):
                sample_subdir = os.path.join(sample_path,
                                             sample_subdir_basename)

                for rootfile_basename in os.listdir(sample_subdir):
                    rootfile = os.path.join(sample_subdir, rootfile_basename)
                    if not os.path.isfile(rootfile):
                        continue
                    logging.debug(
                        "Dumping RLE numbers for file '{rootfile_name}'".
                        format(rootfile_name=rootfile, ))

                    rootfile_idx = idx(rootfile_basename)
                    outfile_idx = os.path.join(
                        output_dir_parent, "{i}.txt".format(i=rootfile_idx))
                    if os.path.isfile(outfile_idx):
                        logging.warning(
                            "Whoops, file already exists; skipping that")
                        continue

                    dump_rle(rootfile, outfile_idx, args.tree, args.run,
                             args.lumi, args.event)

    logging.debug("Done!")
Example n. 24
  def create(self):
    """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system
    """

    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue

      process_name = sample_info["process_name_specific"]
      sample_category = sample_info["sample_category"]
      is_mc = (sample_info["type"] == "mc")

      logging.info("Building dictionaries for sample %s..." % process_name)
      for lepton_selection in self.lepton_selections:
        for lepton_frWeight in self.lepton_frWeights:
          if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
            continue
          if lepton_frWeight == "disabled" and lepton_selection not in ["Tight"]:
            continue

          lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)
          for lepton_charge_selection in self.lepton_charge_selections:

            if 'mcClosure' in lepton_selection and lepton_charge_selection != 'SS':
              # Run MC closure only for the region that complements the SR
              continue

            central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
            central_or_shift_dedicated = self.central_or_shifts if self.runTHweights(sample_info) else self.central_or_shifts_external
            central_or_shifts_extended = central_or_shift_extensions + central_or_shift_dedicated
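            # the first three entries ("", "hadd", "addBackgrounds") are bookkeeping
            # dummies that only create extra directory keys; they are excluded from
            # the systematics check below via central_or_shift_extensions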
            for central_or_shift_or_dummy in central_or_shifts_extended:
              process_name_extended = [ process_name, "hadd" ]
              for process_name_or_dummy in process_name_extended:
                if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]:
                  continue

                if central_or_shift_or_dummy not in central_or_shift_extensions and not self.accept_systematics(
                    central_or_shift_or_dummy, is_mc, lepton_selection, lepton_charge_selection, sample_info
                ):
                  continue

                key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, lepton_charge_selection, central_or_shift_or_dummy)
                for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC ]:
                  if dir_type == DKEY_SYNC and not self.do_sync:
                    continue
                  initDict(self.dirs, [ key_dir, dir_type ])
                  if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                    self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                      "_".join([ lepton_selection_and_frWeight, lepton_charge_selection ]), process_name_or_dummy, central_or_shift_or_dummy)
                  else:
                    self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                      "_".join([ lepton_selection_and_frWeight, lepton_charge_selection ]), process_name_or_dummy)
    for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "addBackgroundLeptonFlips", "prepareDatacards", "addSystFakeRates", "makePlots" ]:
      key_dir = getKey(subdirectory)
      for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
        initDict(self.dirs, [ key_dir, dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
          self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
        else:
          self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)                
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]:
      if dir_type == DKEY_SYNC and not self.do_sync:
        continue
      initDict(self.dirs, [ dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
        self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
      else:
        self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
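    # At this point self.dirs maps either a composite key built by getKey to a
    # { dir_type : path } dictionary, or a bare dir_type to a single path; the
    # counting and creation loops below handle both shapes.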

    numDirectories = 0
    for key in self.dirs.keys():
      if isinstance(self.dirs[key], dict):
        numDirectories += len(self.dirs[key])
      else:
        numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
      if isinstance(self.dirs[key], dict):
        for dir_type in self.dirs[key].keys():
          create_if_not_exists(self.dirs[key][dir_type])
        numDirectories_created += len(self.dirs[key])
      else:
        create_if_not_exists(self.dirs[key])
        numDirectories_created += 1
      while 100*numDirectories_created >= frac*numDirectories:
        logging.info(" %i%% completed" % frac)
        frac += 1
    logging.info("Done.")    

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"]:
        continue
      logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
      inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)
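    # generateInputFileList is a framework helper whose definition is not part
    # of this listing; judging from how its result is used below (indexing by
    # jobId yields a list of ntuple files), it plausibly maps job IDs to chunks
    # of at most max_files_per_job input files, e.g. (illustrative values only)
    #
    #   { 1 : [ 'tree_1.root', 'tree_2.root' ], 2 : [ 'tree_3.root' ] }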

    mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights')
    for lepton_selection in self.lepton_selections:
      electron_selection = lepton_selection
      muon_selection = lepton_selection

      hadTauVeto_selection = "Tight"
      hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ])

      if lepton_selection == "Fakeable_mcClosure_e":
        electron_selection = "Fakeable"
        muon_selection = "Tight"
      elif lepton_selection == "Fakeable_mcClosure_m":
        electron_selection = "Tight"
        muon_selection = "Fakeable"

      for lepton_frWeight in self.lepton_frWeights:
        if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
          continue
        if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight" ]:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)

        for lepton_charge_selection in self.lepton_charge_selections:

          if 'mcClosure' in lepton_selection and lepton_charge_selection != 'SS':
            # Run MC closure only for the region that complements the SR
            continue

          for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
              continue

            process_name = sample_info["process_name_specific"]
            logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
            inputFileList = inputFileLists[sample_name]

            sample_category = sample_info["sample_category"]
            is_mc = (sample_info["type"] == "mc")
            use_th_weights = self.runTHweights(sample_info)

            central_or_shift_dedicated = self.central_or_shifts if use_th_weights else self.central_or_shifts_external
            for central_or_shift in central_or_shift_dedicated:
              if not self.accept_systematics(
                  central_or_shift, is_mc, lepton_selection, lepton_charge_selection, sample_info
              ):
                continue

              central_or_shifts_local = []
              if central_or_shift == "central" and not use_th_weights:
                for central_or_shift_local in self.central_or_shifts_internal:
                  if self.accept_systematics(
                      central_or_shift_local, is_mc, lepton_selection, lepton_charge_selection, sample_info
                  ):
                    central_or_shifts_local.append(central_or_shift_local)

              logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift))

              # build config files for executing analysis code
              key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection, central_or_shift)

              for jobId in inputFileList.keys():

                analyze_job_tuple = (process_name, lepton_selection_and_frWeight, lepton_charge_selection, central_or_shift, jobId)
                key_analyze_job = getKey(*analyze_job_tuple)
                ntupleFiles = inputFileList[jobId]
                if len(ntupleFiles) == 0:
                  logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
                  continue

                syncOutput = ''
                syncTree = ''
                syncGenMatch = self.lepton_genMatches_nonfakes
                if self.do_sync:
                  mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight)
                  if lepton_selection_and_frWeight == 'Tight':
                    if lepton_charge_selection == 'SS':
                      syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift))
                      syncTree = 'syncTree_%s_SR' % self.channel
                    elif lepton_charge_selection == 'OS':
                      syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Flip.root' % (self.channel, central_or_shift))
                      syncTree = 'syncTree_%s_Flip' % self.channel
                    else:
                      continue
                  elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights' and lepton_charge_selection == 'SS':
                    syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift))
                    syncTree = 'syncTree_%s_Fake' % self.channel
                  elif mcClosure_match and lepton_charge_selection == 'SS':
                    mcClosure_type = mcClosure_match.group('type')
                    syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type))
                    syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel, mcClosure_type)
                  else:
                    continue
                if syncTree and central_or_shift != "central":
                  syncTree = os.path.join(central_or_shift, syncTree)
                syncRLE = ''
                if self.do_sync and self.rle_select:
                  syncRLE = self.rle_select % syncTree
                  if not os.path.isfile(syncRLE):
                    logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE)
                    continue
                if syncOutput:
                  self.inputFiles_sync['sync'].append(syncOutput)
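                # self.rle_select, when set, serves as a %-format template
                # above; it is presumably a path pattern with a single %s
                # placeholder that receives the sync tree name, e.g.
                # (hypothetical value)
                #
                #   self.rle_select = '/some/dir/rle_%s.txt'
                #   self.rle_select % 'syncTree_SR'  # -> '/some/dir/rle_syncTree_SR.txt'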

                cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
                logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % analyze_job_tuple)
                rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                     if self.select_rle_output else ""
                histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%s_%i.root" % analyze_job_tuple)

                self.jobOptions_analyze[key_analyze_job] = {
                  'ntupleFiles'              : ntupleFiles,
                  'cfgFile_modified'         : cfgFile_modified_path,
                  'histogramFile'            : histogramFile_path,
                  'logFile'                  : logFile_path,
                  'selEventsFileName_output' : rleOutputFile_path,
                  'electronSelection'        : electron_selection,
                  'muonSelection'            : muon_selection,
                  'apply_leptonGenMatching'  : self.apply_leptonGenMatching,
                  'leptonChargeSelection'    : lepton_charge_selection,
                  'hadTauSelection_veto'     : hadTauVeto_selection,
                  'applyFakeRateWeights'     : self.applyFakeRateWeights if lepton_selection != "Tight" else "disabled",
                  'central_or_shift'         : central_or_shift,
                  'central_or_shifts_local'  : central_or_shifts_local,
                  'syncOutput'               : syncOutput,
                  'syncTree'                 : syncTree,
                  'syncRLE'                  : syncRLE,
                  'useNonNominal'            : self.use_nonnominal,
                  'apply_hlt_filter'         : self.hlt_filter,
                  'syncGenMatch'             : syncGenMatch,
                }
                self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection)

                # initialize input and output file names for hadd_stage1
                key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection)
                hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight, lepton_charge_selection)
                key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
                if key_hadd_stage1_job not in self.inputFiles_hadd_stage1:
                  self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                                                                                "hadd_stage1_%s_%s_%s.root" % hadd_stage1_job_tuple)

            if self.do_sync: continue

            if is_mc:
              logging.info("Creating configuration files to run 'addBackgrounds' for sample %s" % process_name)

              sample_categories = [ sample_category ]
              for sample_category in sample_categories:
                # sum non-fake and fake contributions for each MC sample separately
                genMatch_categories = [ "nonfake", "Convs", "fake", "flip" ]
                for genMatch_category in genMatch_categories:
                  key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection)
                  key_addBackgrounds_dir = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection, "addBackgrounds")
                  addBackgrounds_job_tuple = None
                  processes_input = None
                  process_output = None
                  if genMatch_category == "nonfake":
                    # sum non-fake contributions for each MC sample separately
                    # input processes: TT2l0g0j; ...
                    # output processes: TT; ...
                    if sample_category in self.ttHProcs:
                      lepton_genMatches = []
                      lepton_genMatches.extend(self.lepton_genMatches_nonfakes)
                      lepton_genMatches.extend(self.lepton_genMatches_Convs)
                      processes_input = []
                      processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in lepton_genMatches ])
                      processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in lepton_genMatches ])
                      processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in lepton_genMatches ])
                      processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in lepton_genMatches ])
                      processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in lepton_genMatches ])
                    else:
                      processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_nonfakes ]
                    process_output = sample_category
                    addBackgrounds_job_tuple = (process_name, sample_category, lepton_selection_and_frWeight, lepton_charge_selection)
                  elif genMatch_category == "Convs":
                    # sum conversion background contributions for each MC sample separately
                    # input processes: TT1l1g0j, TT0l2g0j; ...
                    # output processes: TT_Convs; ...
                    if sample_category in self.ttHProcs:
                      processes_input = []
                      processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_Convs ])
                      processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_Convs ])
                      processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_Convs ])
                      processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_Convs ])
                      processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_Convs ])
                    else:
                      processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_Convs ]
                    process_output = "%s_Convs" % sample_category
                    addBackgrounds_job_tuple = (process_name, "%s_Convs" % sample_category, lepton_selection_and_frWeight, lepton_charge_selection)
                  elif genMatch_category == "fake":
                    # sum fake contributions for each MC sample separately
                    # input processes: TT1l0g1j, TT0l1g1j, TT0l0g2j; ...
                    # output processes: TT_fake; ...
                    if sample_category in self.ttHProcs:
                      processes_input = []
                      processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                      processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                      processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                      processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                      processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    else:
                      processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ]
                    process_output = "%s_fake" % sample_category
                    addBackgrounds_job_tuple = (process_name, "%s_fake" % sample_category, lepton_selection_and_frWeight, lepton_charge_selection)
                  elif genMatch_category == "flip":
                    # sum flip contributions for each MC sample separately
                    # input processes: TT2l2f0g0j&2t0e0m0j, TT2l1f0g0j&2t0e0m0j; ...
                    # output processes: TT_flip; ...
                    if sample_category in self.ttHProcs:
                      processes_input = []
                      processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_flips ])
                      processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_flips ])
                      processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_flips ])
                      processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_flips ])
                      processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_flips ])
                    else:
                      processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_flips ]
                    process_output = "%s_flip" % sample_category
                    addBackgrounds_job_tuple = (process_name, "%s_flip" % sample_category, lepton_selection_and_frWeight, lepton_charge_selection)
                  if processes_input:
                    logging.info(" ...for genMatch option = '%s'" % genMatch_category)
                    key_addBackgrounds_job = getKey(*addBackgrounds_job_tuple)
                    cfgFile_modified = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_%s_cfg.py" % addBackgrounds_job_tuple)
                    outputFile = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s_%s.root" % addBackgrounds_job_tuple)
                    self.jobOptions_addBackgrounds[key_addBackgrounds_job] = {
                      'inputFile' : self.outputFile_hadd_stage1[key_hadd_stage1_job],
                      'cfgFile_modified' : cfgFile_modified,
                      'outputFile' : outputFile,
                      'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(cfgFile_modified).replace("_cfg.py", ".log")),
                      'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ],
                      'processes_input' : processes_input,
                      'process_output' : process_output
                    }
                    self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds[key_addBackgrounds_job])

                    # initialize input and output file names for hadd_stage1_5
                    key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight, lepton_charge_selection)
                    hadd_stage1_5_job_tuple = (lepton_selection_and_frWeight, lepton_charge_selection)
                    key_hadd_stage1_5_job = getKey(*hadd_stage1_5_job_tuple)
                    if key_hadd_stage1_5_job not in self.inputFiles_hadd_stage1_5:
                      self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
                    self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.jobOptions_addBackgrounds[key_addBackgrounds_job]['outputFile'])
                    self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST],
                                                                                        "hadd_stage1_5_%s_%s.root" % hadd_stage1_5_job_tuple)

            # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5
            if not is_mc:
              key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, lepton_charge_selection)
              key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, lepton_charge_selection)
              if key_hadd_stage1_5_job not in self.inputFiles_hadd_stage1_5:
                self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
              self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])

          if self.do_sync: continue

          # sum fake background contributions for the total of all MC samples
          # input processes: TT1l0g1j, TT0l1g1j, TT0l0g2j; ...
          # output process: fakes_mc
          key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, lepton_charge_selection)
          key_addBackgrounds_dir = getKey("addBackgrounds")
          addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight, lepton_charge_selection)
          key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple)
          sample_categories = []
          sample_categories.extend(self.nonfake_backgrounds)
          sample_categories.extend(self.ttHProcs)
          processes_input = []
          for sample_category in sample_categories:
            processes_input.append("%s_fake" % sample_category)
          self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = {
            'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple),
            'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_fakes_tuple),
            'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_fakes_tuple),
            'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ],
            'processes_input' : processes_input,
            'process_output' : "fakes_mc"
          }
          self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes])

          # sum flip background contributions for the total of all MC samples
          # input processes: TT2l1f0g0j, TT2l2f0g0j; ...
          # output process: flips_mc
          addBackgrounds_job_flips_tuple = ("flips_mc", lepton_selection_and_frWeight, lepton_charge_selection)
          key_addBackgrounds_job_flips = getKey(*addBackgrounds_job_flips_tuple)
          sample_categories = []
          sample_categories.extend(self.nonfake_backgrounds)
          sample_categories.extend(self.ttHProcs)
          processes_input = []
          for sample_category in sample_categories:
            processes_input.append("%s_flip" % sample_category)
          self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_flips] = {
            'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_flips_tuple),
            'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_flips_tuple),
            'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_flips_tuple),
            'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ],
            'processes_input' : processes_input,
            'process_output' : "flips_mc"
          }
          self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_flips])

          # sum conversion background contributions for the total of all MC samples
          # input processes: TT1l1g0j, TT0l2g0j; ...
          # output process: Convs
          addBackgrounds_job_Convs_tuple = ("Convs", lepton_selection_and_frWeight, lepton_charge_selection)
          key_addBackgrounds_job_Convs = getKey(*addBackgrounds_job_Convs_tuple)
          processes_input = []
          for sample_category in self.convs_backgrounds:
            processes_input.append("%s_Convs" % sample_category)
          self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs] = {
            'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_Convs_tuple),
            'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_Convs_tuple),
            'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_Convs_tuple),
            'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight, lepton_charge_selection) ],
            'processes_input' : processes_input,
            'process_output' : "Convs"
          }
          self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs])

          # initialize input and output file names for hadd_stage2
          key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, lepton_charge_selection)
          key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight, lepton_charge_selection)
          hadd_stage2_job_tuple = (lepton_selection_and_frWeight, lepton_charge_selection)
          key_hadd_stage2_job = getKey(*hadd_stage2_job_tuple)
          if key_hadd_stage2_job not in self.inputFiles_hadd_stage2:
            self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
          if lepton_selection == "Tight":
            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'])
            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_flips]['outputFile'])
            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]['outputFile'])
          self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
          self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                                                                          "hadd_stage2_%s_%s.root" % hadd_stage2_job_tuple)

    if self.do_sync:
      if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating Makefile")
      lines_makefile = []
      self.addToMakefile_syncNtuple(lines_makefile)
      outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel)
      self.outputFile_sync['sync'] = outputFile_sync_path
      self.addToMakefile_hadd_sync(lines_makefile)
      self.addToMakefile_validate(lines_makefile)
      self.targets.extend(self.phoniesToAdd)
      self.createMakefile(lines_makefile)
      logging.info("Done.")
      return self.num_jobs

    logging.info("Creating configuration files to run 'addBackgroundFakes'")
    for lepton_charge_selection in self.lepton_charge_selections:
      key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"), lepton_charge_selection)
      key_addFakes_dir = getKey("addBackgroundLeptonFakes")
      addFakes_job_tuple = (lepton_charge_selection, )
      key_addFakes_job = getKey("data_fakes", lepton_charge_selection)
      category_sideband = None
      if self.applyFakeRateWeights == "2lepton":
        category_sideband = "ttWctrl_%s_Fakeable_wFakeRateWeights" % lepton_charge_selection
      else:
        raise ValueError("Invalid Configuration parameter 'applyFakeRateWeights' = %s !!" % self.applyFakeRateWeights)
      self.jobOptions_addFakes[key_addFakes_job] = {
        'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_addFakes_dir][DKEY_CFGS], "addBackgroundLeptonFakes_%s_cfg.py" % addFakes_job_tuple),
        'outputFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_HIST], "addBackgroundLeptonFakes_%s.root" % addFakes_job_tuple),
        'logFile' : os.path.join(self.dirs[key_addFakes_dir][DKEY_LOGS], "addBackgroundLeptonFakes_%s.log" % addFakes_job_tuple),
        'category_signal' : "ttWctrl_%s_Tight" % lepton_charge_selection,
        'category_sideband' : category_sideband
      }
      self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job])
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), lepton_charge_selection)
      self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])
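      # Note: in Python, (lepton_charge_selection) without a trailing comma is
      # a parenthesized string rather than a 1-tuple; %-formatting with a
      # single %s happens to give the same result either way, e.g.
      #
      #   "addBackgroundLeptonFakes_%s.root" % ('SS',)  # 1-tuple
      #   "addBackgroundLeptonFakes_%s.root" % 'SS'     # bare string, same result
      #
      # The trailing comma added to addFakes_job_tuple above makes the intent
      # explicit.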

    #--------------------------------------------------------------------------
    # CV: add histograms in OS and SS regions,
    #     so that "data_fakes" background can be subtracted from OS control region used to estimate charge flip background
    key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
    key_addFakes_job = getKey("data_fakes", "OS")
    key_hadd_stage1_6_dir = getKey("hadd", get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
    key_hadd_stage1_6_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
    if key_hadd_stage1_6_job not in self.inputFiles_hadd_stage1_6:
        self.inputFiles_hadd_stage1_6[key_hadd_stage1_6_job] = []
    self.inputFiles_hadd_stage1_6[key_hadd_stage1_6_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])
    self.inputFiles_hadd_stage1_6[key_hadd_stage1_6_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
    self.outputFile_hadd_stage1_6[key_hadd_stage1_6_job] = os.path.join(self.dirs[key_hadd_stage1_6_dir][DKEY_HIST], "hadd_stage1_6_Tight_OS.root")
    #--------------------------------------------------------------------------

    logging.info("Creating configuration files to run 'addBackgroundFlips'")
    key_addFlips_dir = getKey("addBackgroundLeptonFlips")
    key_addFlips_job = getKey("data_flips")
    self.jobOptions_addFlips[key_addFlips_job] = {
      'inputFile' : self.outputFile_hadd_stage1_6[key_hadd_stage1_6_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_addFlips_dir][DKEY_CFGS], "addBackgroundLeptonFlips_cfg.py"),
      'outputFile' : os.path.join(self.dirs[key_addFlips_dir][DKEY_HIST], "addBackgroundLeptonFlips.root"),
      'logFile' : os.path.join(self.dirs[key_addFlips_dir][DKEY_LOGS], "addBackgroundLeptonFlips.log"),
      'category_signal' : "ttWctrl_SS_Tight",
      'category_sideband' : "ttWctrl_OS_Tight"
    }
    self.createCfg_addFlips(self.jobOptions_addFlips[key_addFlips_job])
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFlips[key_addFlips_job]['outputFile'])

    logging.info("Creating configuration files to run 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
      key_prep_dcard_dir = getKey("prepareDatacards")
      prep_dcard_job_tuple = (self.channel, histogramToFit)
      key_prep_dcard_job = getKey(histogramToFit)      
      self.jobOptions_prep_dcard[key_prep_dcard_job] = {
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple),
        'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple),
        'histogramDir' : self.histogramDir_prep_dcard,
        'histogramToFit' : histogramToFit,
        'label' : None
      }
      self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

      # add shape templates for the following systematic uncertainties:
      #  - 'CMS_ttHl_Clos_norm_e'
      #  - 'CMS_ttHl_Clos_shape_e'
      #  - 'CMS_ttHl_Clos_norm_m'
      #  - 'CMS_ttHl_Clos_shape_m'
      key_add_syst_fakerate_dir = getKey("addSystFakeRates")
      add_syst_fakerate_job_tuple = (self.channel, histogramToFit)
      key_add_syst_fakerate_job = getKey(histogramToFit)
      self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = {
        'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'],
        'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_cfg.py" % add_syst_fakerate_job_tuple),
        'outputFile' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s.root" % add_syst_fakerate_job_tuple),
        'category' : self.channel,
        'histogramToFit' : histogramToFit,
        'plots_outputFileName' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_PLOT], "addSystFakeRates.png")
      }
      histogramDir_nominal = self.histogramDir_prep_dcard
      for lepton_type in [ 'e', 'm' ]:
        lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type
        if lepton_mcClosure not in self.lepton_selections:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled")
        key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight, 'SS')
        histogramDir_mcClosure = self.mcClosure_dir['%s_%s' % (lepton_mcClosure, 'SS')]
        self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({
          'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections,
          'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job],
          'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit),
          'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'],
          'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit)
        })
      self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job])
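      # For each enabled mcClosure selection, the update above wires extra
      # inputs into the addSystFakeRates job: an add_Clos_<type> flag plus the
      # nominal fakes_mc histogram (from hadd_stage2) and the closure-test
      # histogram (from the corresponding addBackgrounds_sum job), each passed
      # as an input file and a histogram name.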

      logging.info("Creating configuration files to run 'makePlots'")
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
      key_makePlots_dir = getKey("makePlots")
      key_makePlots_job = getKey("SS")
      self.jobOptions_make_plots[key_makePlots_job] = {
        'executable' : self.executable_make_plots,
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
        'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
        'histogramDir' : self.histogramDir_prep_dcard,
        'label' : "t#bar{t}W control region",
        'make_plots_backgrounds' : self.make_plots_backgrounds
      }
      self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])

    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
    self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel)
    self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel)
    self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel)
    self.sbatchFile_addFlips = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFlips_%s.py" % self.channel)
    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes)
      self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFlips)
      self.createScript_sbatch(self.executable_addFlips, self.sbatchFile_addFlips, self.jobOptions_addFlips)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data_withFlips(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_add_syst_fakerate(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.addToMakefile_validate(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done.")

    return self.num_jobs
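getKey and initDict are framework helpers whose definitions are not part of this listing. Below is a minimal sketch of what they plausibly do, judging purely from how they are used in the examples above and below; the names and behavior are assumptions, not the framework's actual code.

def getKey(*args):
    # Build a flat dictionary key by joining the non-empty parts, e.g.
    # getKey('TTW', 'Tight_disabled', 'SS') -> 'TTW_Tight_disabled_SS'
    return '_'.join(str(arg) for arg in args if str(arg))

def initDict(dictionary, keys):
    # Ensure that the nested path dictionary[keys[0]][keys[1]]... exists,
    # creating intermediate dictionaries as needed.
    for key in keys[:-1]:
        dictionary = dictionary.setdefault(key, {})
    dictionary.setdefault(keys[-1], {})

With helpers of this shape, a single initDict(self.dirs, [ key_dir, dir_type ]) call makes the subsequent self.dirs[key_dir][dir_type] = path assignment safe, which is exactly the access pattern both create() methods rely on.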
Example n. 25
  def create(self):
    """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system
    """

    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
        continue
      process_name = sample_info["process_name_specific"]
      for lepton_selection in self.lepton_selections:
        for lepton_frWeight in self.lepton_frWeights:
          if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
            continue
          lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)
          central_or_shifts_extended = [ "" ]
          central_or_shifts_extended.extend(self.central_or_shifts)
          central_or_shifts_extended.extend([ "hadd", "addBackgrounds" ])
          for central_or_shift_or_dummy in central_or_shifts_extended:
            process_name_extended = [ process_name, "hadd" ]
            for process_name_or_dummy in process_name_extended:
              key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, central_or_shift_or_dummy)
              for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_RLES, DKEY_SYNC ]:
                initDict(self.dirs, [ key_dir, dir_type ])
                if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                  self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                    "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy)
                else:
                  self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                    "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy)
    for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]:
      key_dir = getKey(subdirectory)
      for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_DCRD, DKEY_PLOT ]:
        initDict(self.dirs, [ key_dir, dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
          self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
        else:
          self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]:
      initDict(self.dirs, [ dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
        self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
      else:
        self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

    numDirectories = 0
    for key in self.dirs.keys():
      if isinstance(self.dirs[key], dict):
        numDirectories += len(self.dirs[key])
      else:
        numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
      if isinstance(self.dirs[key], dict):
        for dir_type in self.dirs[key].keys():
          create_if_not_exists(self.dirs[key][dir_type])
        numDirectories_created += len(self.dirs[key])
      else:
        create_if_not_exists(self.dirs[key])
        numDirectories_created += 1
      while 100*numDirectories_created >= frac*numDirectories:
        logging.info(" %i%% completed" % frac)
        frac += 1
    logging.info("Done.")

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
        continue
      logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
      inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

    mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights')
    for lepton_selection in self.lepton_selections:
      electron_selection = lepton_selection
      muon_selection = lepton_selection

      hadTauVeto_selection = "Tight"
      hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ])

      if lepton_selection == "Fakeable_mcClosure_e":
        electron_selection = "Fakeable"
        muon_selection = "Tight"
      elif lepton_selection == "Fakeable_mcClosure_m":
        electron_selection = "Tight"
        muon_selection = "Fakeable"

      for lepton_frWeight in self.lepton_frWeights:
        if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
          continue
        if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight" ]:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)

        for sample_name, sample_info in self.samples.items():
          if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
            continue
          process_name = sample_info["process_name_specific"]
          logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))

          sample_category = sample_info["sample_category"]
          is_mc = (sample_info["type"] == "mc")
          is_signal = (sample_category == "signal")

          for central_or_shift in self.central_or_shifts:

            inputFileList = inputFileLists[sample_name]
            for jobId in inputFileList.keys():
              if central_or_shift != "central":
                isFR_shape_shift = (central_or_shift in systematics.FR_all)
                if not ((lepton_selection == "Fakeable" and isFR_shape_shift) or lepton_selection == "Tight"):
                  continue
                if not is_mc and not isFR_shape_shift:
                  continue

              if central_or_shift in systematics.LHE().ttH and sample_category != "signal":
                continue
              if central_or_shift in systematics.LHE().ttW and sample_category != "TTW":
                continue
              if central_or_shift in systematics.LHE().ttZ and sample_category != "TTZ":
                continue
              if central_or_shift in systematics.DYMCReweighting and not is_dymc_reweighting(sample_name):
                continue

              logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift))

              # build config files for executing analysis code
              key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, central_or_shift)
              analyze_job_tuple = (process_name, lepton_selection_and_frWeight, central_or_shift, jobId)
              key_analyze_job = getKey(*analyze_job_tuple)
              ntupleFiles = inputFileList[jobId]
              if len(ntupleFiles) == 0:
                logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
                continue

              syncOutput = ''
              syncTree = ''
              syncRequireGenMatching = True
              if self.do_sync:
                mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight)
                if lepton_selection_and_frWeight == 'Tight':
                  syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift))
                  syncTree = 'syncTree_%s_SR' % self.channel
                  syncRequireGenMatching = True
                elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights':
                  syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift))
                  syncTree = 'syncTree_%s_Fake' % self.channel
                elif mcClosure_match:
                  mcClosure_type = mcClosure_match.group('type')
                  syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type))
                  syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel, mcClosure_type)
                else:
                  continue
              if syncTree and central_or_shift != "central":
                syncTree = os.path.join(central_or_shift, syncTree)
              syncRLE = ''
              if self.do_sync and self.rle_select:
                syncRLE = self.rle_select % syncTree
                if not os.path.isfile(syncRLE):
                  logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE)
                  continue
              if syncOutput:
                self.inputFiles_sync['sync'].append(syncOutput)

              cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
              logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple)
              rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                   if self.select_rle_output else ""
              histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple)

              self.jobOptions_analyze[key_analyze_job] = {
                'ntupleFiles'              : ntupleFiles,
                'cfgFile_modified'         : cfgFile_modified_path,
                'histogramFile'            : histogramFile_path,
                'logFile'                  : logFile_path,
                'selEventsFileName_output' : rleOutputFile_path,
                'electronSelection'        : electron_selection,
                'muonSelection'            : muon_selection,
                'apply_leptonGenMatching'  : self.apply_leptonGenMatching,
                'hadTauSelection_veto'     : hadTauVeto_selection,
                'applyFakeRateWeights'     : self.applyFakeRateWeights if lepton_selection != "Tight" else "disabled",
                'central_or_shift'         : central_or_shift,
                'syncOutput'               : syncOutput,
                'syncTree'                 : syncTree,
                'syncRLE'                  : syncRLE,
                'syncRequireGenMatching'   : syncRequireGenMatching,
                'useNonNominal'            : self.use_nonnominal,
                'apply_hlt_filter'         : self.hlt_filter,
              }
              self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection)

              # initialize input and output file names for hadd_stage1
              key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight)
              hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight)
              key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
              if key_hadd_stage1_job not in self.inputFiles_hadd_stage1:
                self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
              self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
              self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                                                                              "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple)

          if self.do_sync: continue

          if is_mc:
            logging.info("Creating configuration files to run 'addBackgrounds' for sample %s" % process_name)

            sample_categories = [ sample_category ]
            if is_signal:
              sample_categories = [ "signal", "ttH", "ttH_htt", "ttH_hww", "ttH_hzz", "ttH_hmm", "ttH_hzg" ]
            for sample_category in sample_categories:
              # sum non-fake and fake contributions for each MC sample separately
              genMatch_categories = [ "nonfake", "conversions", "fake" ]
              for genMatch_category in genMatch_categories:
                key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight)
                key_addBackgrounds_dir = getKey(process_name, lepton_selection_and_frWeight, "addBackgrounds")
                addBackgrounds_job_tuple = None
                processes_input = None
                process_output = None
                if genMatch_category == "nonfake":
                  # sum non-fake contributions for each MC sample separately
                  # input processes: TT3l0g0j,...
                  # output processes: TT; ...
                  if sample_category in [ "signal" ]:
                    lepton_genMatches = []
                    lepton_genMatches.extend(self.lepton_genMatches_nonfakes)
                    lepton_genMatches.extend(self.lepton_genMatches_conversions)
                    lepton_genMatches.extend(self.lepton_genMatches_fakes)
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in lepton_genMatches ]
                  elif sample_category in [ "ttH" ]:
                    lepton_genMatches = []
                    lepton_genMatches.extend(self.lepton_genMatches_nonfakes)
                    lepton_genMatches.extend(self.lepton_genMatches_conversions)
                    processes_input = []
                    processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in lepton_genMatches ])
                  else:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_nonfakes ]
                  process_output = sample_category
                  addBackgrounds_job_tuple = (process_name, sample_category, lepton_selection_and_frWeight)
                elif genMatch_category == "conversions":
                  # sum conversion contributions for each MC sample separately
                  # input processes: TT2l1g0j, TT1l2g0j, TT0l3g0j; ...
                  # output processes: TT_conversion; ...
                  if sample_category in [ "signal" ]:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ]
                  elif sample_category in [ "ttH" ]:
                    processes_input = []
                    processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                  else:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ]
                  process_output = "%s_conversion" % sample_category
                  addBackgrounds_job_tuple = (process_name, "%s_conversion" % sample_category, lepton_selection_and_frWeight)
                elif genMatch_category == "fake":
                  # sum fake contributions for each MC sample separately
                  # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l2g1j, TT0l1g2j, TT0l0g3j; ...
                  # output processes: TT_fake; ...
                  if sample_category in [ "signal" ]:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ]
                  elif sample_category in [ "ttH" ]:
                    processes_input = []
                    processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                  else:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ]
                  process_output = "%s_fake" % sample_category
                  addBackgrounds_job_tuple = (process_name, "%s_fake" % sample_category, lepton_selection_and_frWeight)
                if processes_input:
                  logging.info(" ...for genMatch option = '%s'" % genMatch_category)
                  key_addBackgrounds_job = getKey(*addBackgrounds_job_tuple)
                  cfgFile_modified = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_tuple)
                  outputFile = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_tuple)
                  self.jobOptions_addBackgrounds[key_addBackgrounds_job] = {
                    'inputFile' : self.outputFile_hadd_stage1[key_hadd_stage1_job],
                    'cfgFile_modified' : cfgFile_modified,
                    'outputFile' : outputFile,
                    'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(cfgFile_modified).replace("_cfg.py", ".log")),
                    'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
                    'processes_input' : processes_input,
                    'process_output' : process_output
                  }
                  self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds[key_addBackgrounds_job])

                  # initialize input and output file names for hadd_stage1_5
                  key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight)                  
                  key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
                  if key_hadd_stage1_5_job not in self.inputFiles_hadd_stage1_5:
                    self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
                  self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.jobOptions_addBackgrounds[key_addBackgrounds_job]['outputFile'])
                  self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST],
                                                                                      "hadd_stage1_5_%s.root" % lepton_selection_and_frWeight)

          # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5
          if not is_mc:
            key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight)
            key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
            if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5:
              self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
            self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])

        if self.do_sync: continue

        # sum fake background contributions for the total of all MC samples
        # input processes: TT_fake, TTW_fake, TTZ_fake, ...
        # output process: fakes_mc
        key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
        key_addBackgrounds_dir = getKey("addBackgrounds")
        addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight)
        key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple)
        sample_categories = []
        sample_categories.extend(self.nonfake_backgrounds)
        sample_categories.extend([ "signal" ])
        processes_input = []
        for sample_category in sample_categories:
          processes_input.append("%s_fake" % sample_category)
        self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = {
          'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple),
          'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_fakes_tuple),
          'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_fakes_tuple),
          'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
          'processes_input' : processes_input,
          'process_output' : "fakes_mc"
        }
        self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes])

        # sum conversion background contributions for the total of all MC samples
        # input processes: TT_conversion, TTW_conversion, TTZ_conversion, ...
        # output process: conversions
        addBackgrounds_job_conversions_tuple = ("conversions", lepton_selection_and_frWeight)
        key_addBackgrounds_job_conversions = getKey(*addBackgrounds_job_conversions_tuple)
        sample_categories = []
        sample_categories.extend(self.nonfake_backgrounds)
        sample_categories.extend([ "signal" ])
        processes_input = []
        for sample_category in sample_categories:
          processes_input.append("%s_conversion" % sample_category)
        self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions] = {
          'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_conversions_tuple),
          'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_conversions_tuple),
          'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_conversions_tuple),
          'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
          'processes_input' : processes_input,
          'process_output' : "conversions"
        }
        self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions])

        # initialize input and output file names for hadd_stage2
        key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
        key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight)
        key_hadd_stage2_job = getKey(lepton_selection_and_frWeight)
        if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
          self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
        if lepton_selection == "Tight":
          self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'])
          self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions]['outputFile'])
        self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
        self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                                                                        "hadd_stage2_%s.root" % lepton_selection_and_frWeight)

    if self.do_sync:
      if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating Makefile")
      lines_makefile = []
      self.addToMakefile_syncNtuple(lines_makefile)
      outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel)
      self.outputFile_sync['sync'] = outputFile_sync_path
      self.targets.append(outputFile_sync_path)
      self.addToMakefile_hadd_sync(lines_makefile)
      self.createMakefile(lines_makefile)
      logging.info("Done.")
      return self.num_jobs

    logging.info("Creating configuration files to run 'addBackgroundFakes'")
    key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"))
    key_addFakes_job = getKey("fakes_data")
    category_sideband = "ttZctrl_Fakeable_wFakeRateWeights"
    self.jobOptions_addFakes[key_addFakes_job] = {
      'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
      'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS], "addBackgroundLeptonFakes_cfg.py"),
      'outputFile' : os.path.join(self.dirs[DKEY_HIST], "addBackgroundLeptonFakes.root"),
      'logFile' : os.path.join(self.dirs[DKEY_LOGS], "addBackgroundLeptonFakes.log"),
      'category_signal' : "ttZctrl_Tight",
      'category_sideband' : category_sideband
    }
    self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job])
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])

    logging.info("Creating configuration files to run 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
      key_prep_dcard_dir = getKey("prepareDatacards")
      prep_dcard_job_tuple = (self.channel, histogramToFit)
      key_prep_dcard_job = getKey(histogramToFit)
      self.jobOptions_prep_dcard[key_prep_dcard_job] = {
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple),
        'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple),
        'histogramDir' : self.histogramDir_prep_dcard,
        'histogramToFit' : histogramToFit,
        'label' : None
      }
      self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

      # add shape templates for the following systematic uncertainties:
      #  - 'CMS_ttHl_Clos_norm_e'
      #  - 'CMS_ttHl_Clos_shape_e'
      #  - 'CMS_ttHl_Clos_norm_m'
      #  - 'CMS_ttHl_Clos_shape_m'
      key_prep_dcard_job = getKey(histogramToFit)
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
      key_add_syst_fakerate_dir = getKey("addSystFakeRates")
      add_syst_fakerate_job_tuple = (self.channel, histogramToFit)
      key_add_syst_fakerate_job = getKey(histogramToFit)
      self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = {
        'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'],
        'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_cfg.py" % add_syst_fakerate_job_tuple),
        'outputFile' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s.root" % add_syst_fakerate_job_tuple),
        'category' : self.channel,
        'histogramToFit' : histogramToFit,
        'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "addSystFakeRates.png")
      }
      histogramDir_nominal = self.histogramDir_prep_dcard
      for lepton_type in [ 'e', 'm' ]:
        lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type
        if lepton_mcClosure not in self.lepton_selections:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled")
        key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight)
        histogramDir_mcClosure = self.mcClosure_dir[lepton_mcClosure]
        self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({
          'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections,
          'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job],
          'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit),
          'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'],
          'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit)
        })
      self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job])

    logging.info("Creating configuration files to run 'makePlots'")
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
    key_makePlots_dir = getKey("makePlots")
    key_makePlots_job = getKey('')
    self.jobOptions_make_plots[key_makePlots_job] = {
      'executable' : self.executable_make_plots,
      'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
      'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
      'histogramDir' : self.histogramDir_prep_dcard,
      'label' : "t#bar{t}Z control region",
      'make_plots_backgrounds' : self.make_plots_backgrounds
    }
    self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])

    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
      self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds)
      self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds)
      self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes)
      self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel)
      self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_add_syst_fakerate(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done.")

    return self.num_jobs
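
The per-sample stage-1 outputs above are chained into stage-1.5 (per lepton selection) and stage-2 (final) hadd jobs purely through string keys. Below is a minimal, self-contained sketch of that keying pattern; the getKey stand-in and the sample names are illustrative assumptions, not the actual tthAnalysis implementation.

# Sketch of the hadd-stage chaining used above; getKey here is a
# hypothetical stand-in that joins its arguments into a string key.
def getKey(*parts):
    return '_'.join(str(part) for part in parts)

# per-sample stage-1 outputs (illustrative file names)
outputFile_hadd_stage1 = {
    getKey('TT', 'Tight_disabled'): 'hadd_stage1_TT.root',
    getKey('TTW', 'Tight_disabled'): 'hadd_stage1_TTW.root',
}

# stage 1.5 collects all stage-1 outputs for one lepton selection
inputFiles_hadd_stage1_5 = {}
key_stage1_5 = getKey('Tight_disabled')
inputFiles_hadd_stage1_5.setdefault(key_stage1_5, [])
for output in outputFile_hadd_stage1.values():
    inputFiles_hadd_stage1_5[key_stage1_5].append(output)

print(inputFiles_hadd_stage1_5)
# {'Tight_disabled': ['hadd_stage1_TT.root', 'hadd_stage1_TTW.root']}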
Example 26
process_name = args.process
category_name = args.category
binning_choice = args.binning
genweight_ref = args.gen_weight

use_genweight = genweight_ref != 0.

if args.verbose:
    logging.getLogger().setLevel(logging.DEBUG)

if not os.path.isfile(input_fn):
    raise ValueError("No such file: %s" % input_fn)

output_dir = os.path.dirname(output_fn)
if not os.path.isdir(output_dir):
    logging.warning("Creating directory: {}".format(output_dir))
    os.makedirs(output_dir)

input_file = ROOT.TFile.Open(input_fn, 'read')
assert input_file
input_tree = input_file.Get('Events')

denominator_process = Hist2D(BINNING_MHH[binning_choice],
                             BINNING_COSTHETASTAR,
                             name=process_name)
denominator_category = Hist2D(BINNING_MHH[binning_choice],
                              BINNING_COSTHETASTAR,
                              name=category_name)

nof_events = input_tree.GetEntries()
logging.debug("Input file {} has {} events".format(input_fn, nof_events))
Example 27
    raise ValueError("Invalid mode: %s" % mode)

for sample_name, sample_info in samples.items():
    if sample_name == 'sum_events': continue
    if sample_name.startswith(("/DoubleEG/", "/DoubleMuon/", "/MuonEG/")):
        sample_info["use_it"] = False
    elif sample_name.startswith("/Tau/"):
        sample_info["use_it"] = True

if __name__ == '__main__':
    logging.info(
      "Running the jobs with the following systematic uncertainties enabled: %s" % \
      ', '.join(central_or_shifts)
    )
    if not use_preselected:
        logging.warning('Running the analysis on fully inclusive samples!')

    if sample_filter:
        samples = filter_samples(samples, sample_filter)

    if args.tau_id_wp:
        logging.info("Changing tau ID working point: %s -> %s" %
                     (hadTau_selection, args.tau_id_wp))
        hadTau_selection = args.tau_id_wp

    analysis = analyzeConfig_hh_1l_3tau(
        configDir=os.path.join("/scratch-persistent", getpass.getuser(),
                               "hhAnalysis", era, version),
        localDir=os.path.join("/home", getpass.getuser(), "hhAnalysis", era,
                              version),
        outputDir=os.path.join("/hdfs/local", getpass.getuser(), "hhAnalysis",
Example 28
    def create(self):
        """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system
    """

        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            key_dir = getKey(process_name)
            for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]:
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.configDir, dir_type, self.channel, process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, self.channel, process_name)
        for dir_type in [
                DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD,
                DKEY_PLOT, DKEY_HADD_RT
        ]:
            initDict(self.dirs, [dir_type])
            if dir_type in [
                    DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT,
                    DKEY_HADD_RT
            ]:
                self.dirs[dir_type] = os.path.join(self.configDir, dir_type,
                                                   self.channel)
            else:
                self.dirs[dir_type] = os.path.join(self.outputDir, dir_type,
                                                   self.channel)

        numDirectories = 0
        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                numDirectories += len(self.dirs[key])
            else:
                numDirectories += 1
        logging.info("Creating directory structure (numDirectories = %i)" %
                     numDirectories)
        numDirectories_created = 0
        frac = 1
        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
                numDirectories_created += len(self.dirs[key])
            else:
                create_if_not_exists(self.dirs[key])
                numDirectories_created = numDirectories_created + 1
            while 100 * numDirectories_created >= frac * numDirectories:
                logging.info(" %i%% completed" % frac)
                frac = frac + 1
        logging.info("Done.")

        inputFileLists = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            logging.info("Checking input files for sample %s" %
                         sample_info["process_name_specific"])
            inputFileLists[sample_name] = generateInputFileList(
                sample_info, self.max_files_per_job)

        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable_analyze, process_name))

            inputFileList = inputFileLists[sample_name]
            for jobId in inputFileList.keys():
                ##print "processing sample %s: jobId = %i" % (process_name, jobId)

                # build config files for executing analysis code
                key_analyze_dir = getKey(process_name)
                analyze_job_tuple = (process_name, jobId)
                key_analyze_job = getKey(*analyze_job_tuple)
                ntupleFiles = inputFileList[jobId]
                if len(ntupleFiles) == 0:
                    logging.warning(
                        "No input ntuples for %s --> skipping job !!" %
                        (key_analyze_job))
                    continue

                cfgFile_modified_path = os.path.join(
                    self.dirs[key_analyze_dir][DKEY_CFGS],
                    "analyze_%s_%i_cfg.py" % analyze_job_tuple)
                logFile_path = os.path.join(
                    self.dirs[key_analyze_dir][DKEY_LOGS],
                    "analyze_%s_%i.log" % analyze_job_tuple)
                histogramFile_path = os.path.join(
                    self.dirs[key_analyze_dir][DKEY_HIST],
                    "analyze_%s_%i.root" % analyze_job_tuple)

                self.jobOptions_analyze[key_analyze_job] = {
                    'ntupleFiles': ntupleFiles,
                    'cfgFile_modified': cfgFile_modified_path,
                    'histogramFile': histogramFile_path,
                    'histogramDir': 'analyze_hadTopTagger',
                    'logFile': logFile_path,
                    'hadTauSelection': self.hadTau_selection,
                    'lumiScale': 1.,
                    'selectBDT': True,
                }
                self.createCfg_analyze(
                    self.jobOptions_analyze[key_analyze_job], sample_info)

                # initialize input and output file names for hadd_stage1
                key_hadd_stage1_dir = getKey(process_name)
                key_hadd_stage1_job = getKey(process_name)
                if not key_hadd_stage1_job in self.inputFiles_hadd_stage1:
                    self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(
                    self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                self.outputFile_hadd_stage1[
                    key_hadd_stage1_job] = os.path.join(
                        self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                        "hadd_stage1_%s.root" % process_name)
                self.targets.append(
                    self.outputFile_hadd_stage1[key_hadd_stage1_job])

        self.sbatchFile_analyze = os.path.join(
            self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable_analyze)
            self.createScript_sbatch_analyze(self.executable_analyze,
                                             self.sbatchFile_analyze,
                                             self.jobOptions_analyze)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_analyze(lines_makefile)
        self.addToMakefile_hadd_stage1(lines_makefile)
        self.createMakefile(lines_makefile)

        logging.info("Done.")

        return self.num_jobs
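
generateInputFileList is used as a black box here; judging by the loop over inputFileList.keys() and the max_files_per_job argument, it presumably chunks a sample's ntuple files into per-job lists keyed by job ID. A self-contained, hypothetical stand-in with that behaviour:

# Hypothetical stand-in for generateInputFileList: chunk a flat list of
# files into per-job lists of at most max_files_per_job entries.
def generateInputFileList_sketch(file_names, max_files_per_job):
    inputFileList = {}
    for idx in range(0, len(file_names), max_files_per_job):
        jobId = idx // max_files_per_job + 1
        inputFileList[jobId] = file_names[idx:idx + max_files_per_job]
    return inputFileList

files = ['tree_%i.root' % i for i in range(1, 6)]
print(generateInputFileList_sketch(files, 2))
# {1: ['tree_1.root', 'tree_2.root'], 2: ['tree_3.root', 'tree_4.root'], 3: ['tree_5.root']}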
Example 29
    def create(self):
        """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if not is_mc:
                continue

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList = generateInputFileList(sample_info,
                                                  self.max_files_per_job)
            key_dir = getKey(process_name)

            outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO],
                                      "%s.root" % process_name)
            self.outputFiles[process_name] = {
                'inputFiles': [],
                'outputFile': outputFile,
            }
            if os.path.isfile(outputFile) and tools_is_file_ok(
                    outputFile, min_file_size=2000):
                logging.info('File {} already exists --> skipping job'.format(
                    outputFile))
                continue

            for jobId in inputFileList.keys():

                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = inputFileList[jobId]
                if len(self.inputFiles[key_file]) == 0:
                    logging.warning("'%s' = %s --> skipping job !!" %
                                    (key_file, self.inputFiles[key_file]))
                    continue

                self.cfgFiles_projection[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "project_%s_%i_cfg.txt" % (process_name, jobId))
                self.outputFiles_tmp[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_HISTO_TMP],
                    "histogram_%i.root" % jobId)
                self.logFiles_projection[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS],
                    "project_%s_%i.log" % (process_name, jobId))
                self.scriptFiles_projection[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "project_%s_%i_cfg.sh" % (process_name, jobId))
                projection_module = self.projection_module
                if projection_module == "count":
                    projection_module = "countHistogramAll"
                    if sample_name.startswith('/TTTo'):
                        projection_module += "CompTopRwgt"
                    elif sample_info['sample_category'].startswith('ttH'):
                        projection_module += "CompHTXS"
                    elif isSplitByNlheJet(process_name):
                        projection_module += "SplitByLHENjet"
                    elif isSplitByNlheHT(process_name):
                        projection_module += "SplitByLHEHT"
                    elif isSplitByNlheJetHT(process_name, sample_name):
                        projection_module += "SplitByLHENjetHT"
                self.jobOptions_sbatch[key_file] = {
                    'histName': process_name,
                    'inputFiles': self.inputFiles[key_file],
                    'cfgFile_path': self.cfgFiles_projection[key_file],
                    'outputFile': self.outputFiles_tmp[key_file],
                    'logFile': self.logFiles_projection[key_file],
                    'scriptFile': self.scriptFiles_projection[key_file],
                    'projection_module': projection_module,
                }
                if self.projection_module != 'puHist':
                    # check for the reference weight before using it, so a
                    # missing entry raises the intended RuntimeError rather
                    # than a bare KeyError
                    if process_name not in self.ref_genWeights:
                        raise RuntimeError(
                            "Unable to find reference LHE weight for process %s"
                            % process_name)
                    self.jobOptions_sbatch[key_file][
                        'ref_genWeight'] = self.ref_genWeights[process_name]
                self.createCfg_project(self.jobOptions_sbatch[key_file])
                self.outputFiles[process_name]['inputFiles'].append(
                    self.outputFiles_tmp[key_file])

        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            self.num_jobs['project'] += self.createScript_sbatch(
                self.executable, self.sbatchFile_projection,
                self.jobOptions_sbatch)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_project(lines_makefile)
        self.addToMakefile_hadd(lines_makefile)
        if self.plot:
            self.addToMakefile_plot(lines_makefile)
        self.addToMakefile_finalHadd(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done")

        return self.num_jobs
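
The skip logic above delegates to tools_is_file_ok. A minimal sketch of such a validity check, assuming it only tests existence and a minimum size in bytes (the real helper may additionally open the ROOT file and inspect its contents), could look like:

import os

# Hypothetical sketch of a tools_is_file_ok-style check: the file must
# exist and be at least min_file_size bytes large.
def is_file_ok_sketch(path, min_file_size=2000):
    return os.path.isfile(path) and os.path.getsize(path) >= min_file_size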
Example 30
def run_brilcalc(hlt_paths_in, json, normtag, units, brilcalc_path, data_file,
                 output_dir):

    assert all(hlt_path.startswith('HLT') for hlt_path in hlt_paths_in)
    hlt_paths = {hlt_path: hlt_version(hlt_path) for hlt_path in hlt_paths_in}

    for input_file in (json, normtag):
        if input_file and not os.path.isfile(input_file):
            raise ValueError("No such file: %s" % input_file)
    if not os.path.isfile(brilcalc_path):
        raise ValueError("No such file: %s" % brilcalc_path)

    if data_file:
        data = parse_data(data_file)
        if data['normtag'] != os.path.basename(normtag):
            logging.warning(
                "File {} is generated with normtag '{}' but requested using normtag '{}'"
                .format(data_file, data['normtag'], normtag))
        if data['json'] != os.path.basename(json):
            logging.warning(
                "File {} is generated with JSON '{}' but requested using JSON '{}'"
                .format(data_file, data['json'], json))
    else:
        data = None

    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # prepare the jobs
    pool_size = 16
    pool = multiprocessing.Pool(pool_size, handle_worker)

    logging.debug("Constructing pool for {} HLT paths".format(len(hlt_paths)))
    for hlt_path in hlt_paths:
        pool.apply_async(
            process_hlt,
            args=(hlt_paths[hlt_path], json, brilcalc_path, normtag, units,
                  output_dir),
            callback=get_trigger_results,
        )
    pool.close()
    pool.join()
    logging.debug("Pool finished")

    # parse trigger_results
    for hlt_path in hlt_paths:
        dict_entry = trigger_results[hlt_paths[hlt_path]]

        if data_file:
            present_eras = []
            for run in dict_entry['runs']:
                for era in data['runs']:
                    if data['runs'][era]['run_start'] <= run <= data['runs'][
                            era]['run_end'] and era not in present_eras:
                        present_eras.append(era)
            all_eras = list(data['runs'])
            missing_eras = sorted(set(all_eras) - set(present_eras))

            expected_recording = data['totrecorded']
            expected_delivery = data['totdelivered']

            data_units = data['units']
            unit_factor = 1000**(LUMI_UNITS.index(units) -
                                 LUMI_UNITS.index(data_units))
            expected_recording *= unit_factor
            expected_delivery *= unit_factor

            prescale_recording = (expected_recording / dict_entry['recorded']
                                  ) if dict_entry['recorded'] != 0. else -1.
            prescale_delivery = (expected_delivery / dict_entry['delivered']
                                 ) if dict_entry['delivered'] != 0. else -1.

            if int(prescale_recording) == 1:
                prescale_msg = "NOT prescaled"
            elif int(prescale_recording) == -1:
                prescale_msg = "NOT recording anything?"
            else:
                prescale_msg = "prescale factor %.1f (%.1f from delivery)" % (
                    prescale_recording, prescale_delivery)
            prescale_msg += " (expected %.1f recorded, %.1f delivery; units = %s)" % (
                expected_recording, expected_delivery, units)

        print("{} nrun = {} totdelivered = {} totrecorded = {} (units = {})".
              format(
                  hlt_path,
                  len(dict_entry['runs']),
                  dict_entry['delivered'],
                  dict_entry['recorded'],
                  units,
              ))
        if data_file:
            print("{} present in eras: {} (missing in {} eras) => {}".format(
                hlt_path,
                ", ".join(present_eras),
                ", ".join(missing_eras) if missing_eras else "none of the",
                prescale_msg,
            ))
        for hlt_dict in dict_entry['paths']:
            print(
                "\t{} nfill = {} nrun = {} ncms = {} totdelivered = {} totrecorded = {}"
                .format(
                    hlt_dict['hltpath'],
                    hlt_dict['nfill'],
                    hlt_dict['nrun'],
                    hlt_dict['ncms'],
                    hlt_dict['totdelivered'],
                    hlt_dict['totrecorded'],
                ))
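
This function depends on module-level pieces that are not shown (handle_worker, process_hlt, get_trigger_results and the trigger_results dict). Below is a self-contained sketch of the same apply_async-with-callback pattern, with hypothetical worker and callback names. Callbacks run in the parent process, which is why a plain dict is safe as the collector, and close() must precede join().

import multiprocessing

results = {}

def collect(result):
    # runs in the parent process, so plain dict access is safe
    key, value = result
    results[key] = value

def work(name):
    # hypothetical stand-in for process_hlt: return a (key, value) pair
    return name, len(name)

if __name__ == '__main__':
    pool = multiprocessing.Pool(4)
    for name in ['HLT_IsoMu24', 'HLT_Ele32_WPTight_Gsf']:
        pool.apply_async(work, args=(name,), callback=collect)
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for all workers to finish before reading results
    print(results)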