def createCfg_addBackgrounds_Convs_LeptonFakeRate(self, jobOptions):
   lines = []
   lines.append("process.fwliteInput.fileNames = cms.vstring('%s')" % jobOptions['inputFile'])
   lines.append("process.fwliteOutput.fileName = cms.string('%s')" % os.path.basename(jobOptions['outputFile']))
   lines.append("process.addBackground_LeptonFakeRate.processData = cms.string('%s')" % self.processToSubtractConvsFrom)
   lines.append("process.addBackground_LeptonFakeRate.processLeptonFakes = cms.string('%s_NC')" % self.processToSubtractConvsFrom)
   lines.append("process.addBackground_LeptonFakeRate.processesToSubtract = cms.vstring('%sg')" % self.processToSubtractConvsFrom)
   # lines.append("process.addBackground_LeptonFakeRate.sysShifts = cms.vstring()" % self.central_or_shifts)
   logging.info("self.cfgFile_addBackgrounds_Convs_LeptonFakeRate => %s" % self.cfgFile_addBackgrounds_Convs_LeptonFakeRate)
   logging.info("jobOptions['cfgFile_modified'] => %s" % jobOptions['cfgFile_modified'])
   create_cfg(self.cfgFile_addBackgrounds_Convs_LeptonFakeRate, jobOptions['cfgFile_modified'], lines)
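
For reference, a minimal sketch of the jobOptions dictionary this method expects; the paths below are hypothetical placeholders, not values taken from the original code.

# Hypothetical jobOptions for createCfg_addBackgrounds_Convs_LeptonFakeRate (illustrative only)
jobOptions = {
    'inputFile'        : '/local/analysis/hadd/hadd_stage1_LeptonFakeRate.root',
    'outputFile'       : '/local/analysis/hadd/addBackground_Convs_LeptonFakeRate.root',
    'cfgFile_modified' : '/local/analysis/cfgs/addBackground_Convs_LeptonFakeRate_cfg.py',
}
# config.createCfg_addBackgrounds_Convs_LeptonFakeRate(jobOptions)  # 'config' would be the analysis-config instance
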
 def createMakefile(self, lines_makefile):
     """Creates Makefile that runs the PU profile production.
     """
     tools_createMakefile(makefileName=self.makefile,
                          targets=self.targets,
                          lines_makefile=lines_makefile,
                          filesToClean=self.filesToClean,
                          isSbatch=self.is_sbatch,
                          phoniesToAdd=self.phoniesToAdd)
     logging.info("Run it with:\tmake -f %s -j %i " %
                  (self.makefile, self.num_parallel_jobs))
 def createMakefile(self, lines_makefile):
     """Creates Makefile that runs the Ntuple production.
     """
     targets = None
     if self.is_sbatch:
         targets = [MAKEFILE_TARGET]
     else:
         targets = self.outputFiles.values()
     tools_createMakefile(self.makefile, targets, lines_makefile,
                          self.filesToClean)
     logging.info("Run it with:\tmake -f %s -j %i " %
                  (self.makefile, self.num_parallel_jobs))
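
Both createMakefile variants delegate the actual file writing to tools_createMakefile. A minimal sketch of what such a helper could write is shown below; it is purely illustrative, and the real helper in the repository may differ (for instance in how it uses isSbatch).

def tools_createMakefile_sketch(makefileName, targets, lines_makefile,
                                filesToClean=None, isSbatch=False, phoniesToAdd=None):
    # Illustrative only: writes an 'all' target depending on the job outputs,
    # the pre-built rule lines, and a 'clean' rule removing intermediate files.
    # isSbatch is accepted for signature compatibility; the real helper presumably uses it.
    with open(makefileName, 'w') as makefile:
        if phoniesToAdd:
            makefile.write(".PHONY: %s\n\n" % " ".join(phoniesToAdd))
        makefile.write("all: %s\n\n" % " ".join(targets))
        makefile.write("\n".join(lines_makefile) + "\n\n")
        makefile.write("clean:\n")
        for fileToClean in (filesToClean or []):
            makefile.write("\trm -f %s\n" % fileToClean)
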
Example #4
    def create(self):
        create_if_not_exists(self.hadd_log_dir_path)

        if self.running_method.lower() == 'sbatch':
            create_if_not_exists(self.hadd_script_dir_path)

            createScript_sbatch_hadd(
                sbatch_script_file_name=self.hadd_script_path,
                input_file_names=list(self.channel_info.keys()),
                output_file_name=self.final_output_file,
                script_file_name=self.hadd_script_path.replace('.py', '.sh'),
                log_file_name=self.hadd_log_executable_path,  # the basename will be replaced anyway
                working_dir=None,
                waitForJobs=True,
                auxDirName='',
                pool_id=uuid.uuid4(),
                verbose=False,
                max_input_files_per_job=len(self.channel_info),
                dry_run=self.dry_run,
                use_home=self.use_home,
                min_file_size=-1,
            )

            logging.info("Generated hadd config file: %s" %
                         self.hadd_script_path)
            self.hadd_script_path = 'python %s' % self.hadd_script_path
            additional_cmds = ''

        else:
            self.hadd_script_path = 'hadd -f {} {}'.format(
                os.path.basename(self.final_output_file),
                ' '.join(list(self.channel_info.keys())))
            additional_cmds = 'mv {} {}'.format(
                os.path.basename(self.final_output_file),
                self.final_output_file)

        with open(self.makefile_path, 'w') as makefile:
            hadd_script_cmd = '{}{}'.format(
                'sleep 60\n\t' if self.running_method.lower() == 'makefile'
                else '', self.hadd_script_path)
            makeFileContents = jinja2.Template(makeFileTemplate).render(
                output_file=self.final_output_file,
                channel_info=self.channel_info,
                hadd_script=hadd_script_cmd,
                hadd_wrapper_log=self.hadd_log_wrapper_path,
                additional_cmds=additional_cmds,
                validate_channels=' '.join(self.channels_to_validate),
                output_dir=self.output_dir,
            )
            makefile.write(makeFileContents)
        logging.info("Created the makefile: %s" % self.makefile_path)
def get_paths(input_paths, whitelist, blacklist):
    valid_paths = {}
    for input_path in input_paths:
        input_path_split = [
            subpath for subpath in input_path.split(os.path.sep)
            if subpath != ''
        ]
        nof_levels = len(input_path_split)
        if nof_levels == 6:
            input_path_subdir = os.path.join(input_path, OUTPUT_RLE)
            if not hdfs.isdir(input_path_subdir):
                raise ValueError("No such directory: %s" % input_path_subdir)
            for channel_dir in sorted(hdfs.listdir(input_path_subdir)):
                channel_name = os.path.basename(channel_dir)
                if whitelist and channel_name not in whitelist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in blacklist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in valid_paths:
                    raise ValueError(
                        "Found duplicate paths for the same channel: %s and %s"
                        % (valid_paths[channel_name], input_path))
                logging.debug('Found channel {} at path {}'.format(
                    channel_name, channel_dir))
                valid_paths[channel_name] = channel_dir
        elif nof_levels == 8:
            if input_path_split[-2] != OUTPUT_RLE:
                raise ValueError("Invalid path: %s" % input_path)
            channel_name = input_path_split[-1]
            if whitelist and channel_name not in whitelist:
                raise ValueError("Path %s conflicting with whitelist: %s" %
                                 (input_path, ', '.join(whitelist)))
            if channel_name in blacklist:
                raise ValueError("Path %s conflicting with blacklist: %s" %
                                 (input_path, ', '.join(blacklist)))
            if channel_name in valid_paths:
                raise ValueError(
                    "Found duplicate paths for the same channel: %s and %s" %
                    (valid_paths[channel_name], input_path))
            logging.debug('Found channel {} at path {}'.format(
                channel_name, input_path))
            valid_paths[channel_name] = input_path
        else:
            raise ValueError("Invalid path: %s" % input_path)
    assert (len(set(valid_paths.values())) == len(valid_paths))
    return valid_paths
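
A hypothetical call, assuming OUTPUT_RLE == 'output_rle'; the paths are placeholders that illustrate the two accepted depths (6 path components to scan all channels, 8 to pick a single channel directory):

input_paths = [
    '/hdfs/local/user/ttHAnalysis/2017/default',                        # 6 levels: scan <path>/output_rle/*
    '/hdfs/local/user/ttHAnalysis/2017/default/output_rle/2lss_1tau',   # 8 levels: single channel directory
]
channel_paths = get_paths(input_paths, whitelist=[], blacklist=['1l_2tau'])
# -> dict mapping each channel name to its output_rle directory
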
def validate_channels(rles):
    validation_set = collections.OrderedDict()
    for channel in rles:
        for region in rles[channel]:
            if 'Tight' not in region:
                continue
            for sample_name in rles[channel][region]:
                if sample_name not in validation_set:
                    validation_set[sample_name] = collections.OrderedDict()
                for central_or_shift in rles[channel][region][sample_name]:
                    if central_or_shift not in validation_set[sample_name]:
                        validation_set[sample_name][
                            central_or_shift] = collections.OrderedDict()
                    for rle in rles[channel][region][sample_name][
                            central_or_shift]:
                        if rle not in validation_set[sample_name][
                                central_or_shift]:
                            validation_set[sample_name][central_or_shift][
                                rle] = collections.OrderedDict()
                        validation_set[sample_name][central_or_shift][rle][
                            channel] = region
    has_errors = False
    for sample_name in validation_set:
        for central_or_shift in validation_set[sample_name]:
            for rle in validation_set[sample_name][central_or_shift]:
                if len(validation_set[sample_name][central_or_shift][rle]) > 1:
                    if '2los_1tau' in validation_set[sample_name][central_or_shift][rle]           and \
                        validation_set[sample_name][central_or_shift][rle]['2los_1tau'] == 'Tight' and \
                       '2lss_1tau' in validation_set[sample_name][central_or_shift][rle]           and \
                        validation_set[sample_name][central_or_shift][rle]['2lss_1tau'] == 'Tight_OS_OS':
                        continue
                    logging.error(
                        "Found the same event {} from sample {} in multiple channels: {}"
                        .format(
                            rle, sample_name, ', '.join([
                                '%s (region %s, systematics %s)' %
                                (channel, validation_set[sample_name]
                                 [central_or_shift][rle][channel],
                                 central_or_shift)
                                for channel in validation_set[sample_name]
                                [central_or_shift][rle]
                            ])))
                    has_errors = True
    if not has_errors:
        logging.info(
            "No overlaps found between the signal regions of channels: {}".
            format(', '.join(rles.keys())))
    return has_errors
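
The rles argument consumed here (and by the other validators further below) is a nested dictionary keyed by channel, region, sample name and systematic shift, mapping to lists of run:lumi:event strings; a minimal hypothetical instance:

rles = {
    '2lss_1tau': {
        'Tight_OS_OS': {
            'TTZJets': {
                'central': ['1:2345:678901', '1:2345:678902'],
            },
        },
    },
}
has_errors = validate_channels(rles)
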
Example #7
def is_file_ok(output_file_name, validate_outputs=True, min_file_size=20000):
    if not (output_file_name and os.path.exists(output_file_name)):
        return False

    logging.info("Output file %s already exists" % output_file_name)

    if not output_file_name.lower().endswith('.root'):
        return True

    command = "rm %s" % output_file_name
    ret_value = False
    if min_file_size > 0:
        output_file_size = os.stat(output_file_name).st_size
        if output_file_size > min_file_size:
            if not validate_outputs:
                ret_value = True
        else:
            logging.info(
                "Deleting output file and resubmitting job because it has size smaller than %d bytes"
                % min_file_size)

    if validate_outputs:
        root_tfile = ROOT.TFile(output_file_name, "read")
        if not root_tfile:
            logging.info("Not a valid ROOT file, deleting it")
        else:
            if root_tfile.IsZombie():
                logging.info(
                    "Output file is corrupted, deleting file and resubmitting job"
                )
            else:
                # Let's open the file via bash as well to see if ROOT tries to recover the file
                open_cmd = "root -b -l -q %s 2>&1 > /dev/null | grep 'trying to recover' | wc -l" % output_file_name
                open_out = run_cmd(open_cmd)
                if open_out.rstrip('\n') != '0':
                    logging.info(
                        "Output file is probably corrupted, deleting file and resubmitting job"
                    )
                else:
                    ret_value = True
            root_tfile.Close()

    if not ret_value:
        run_cmd(command)

    return ret_value
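
A hypothetical usage: accept a previously produced output only if it exceeds the minimum size and opens cleanly in ROOT; otherwise the (deleted) file is meant to be regenerated by resubmitting the job.

output_file = 'analyze_2lss_1tau_TTZJets_central_1.root'  # placeholder file name
if not is_file_ok(output_file, validate_outputs=True, min_file_size=20000):
    logging.info("Output %s needs to be (re)produced" % output_file)
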
Example #8
    def __init__(self, fn, sf):
        self.fn = fn
        logging.info("Scaling histograms in file {} ({}) by SF {}".format(
            self.fn, md5(self.fn), sf))

        fptr = ROOT.TFile.Open(self.fn, 'read')
        keys = [key.GetName() for key in fptr.GetListOfKeys()]
        self.histograms = {}
        for key in keys:
            histogram = fptr.Get(key).Clone()
            histogram.SetDirectory(0)
            assert (type(histogram) == ROOT.TH2D)
            assert (key not in self.histograms)
            histogram.Scale(sf)
            logging.info("Found histogram {} in file {}".format(key, self.fn))
            self.histograms[key] = histogram

        fptr.Close()
  def run(self, clean):
    record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES)
    target = 'all'
    if clean:
      if not os.path.isfile(self.makefile_path):
        logging.error(
          "The makefile %s is missing and therefore it's not possible to clean anything; "
          "run sync Ntuple production first!" % self.makefile_path
        )
        sys.exit(1)
      target = 'clean'

    nof_parallel_jobs = len(self.channel_info)
    make_cmd          = "make -f %s -j %d %s 2>%s 1>%s" % \
      (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
    logging.info("Running the make command: %s" % make_cmd)
    run_cmd(make_cmd)
    logging.info("All done")
Example #10
def copy_dirs(fn, fo):
    root_keys = get_evt_subdir_names(fn)

    for root_key in root_keys:
        fptr = ROOT.TFile.Open(fn, 'read')
        logging.info('Opened file {} to copy {}'.format(
            fptr.GetName(), root_key))
        for evt_subdir_name in root_keys[root_key]:
            evt_dir_key = os.path.basename(evt_subdir_name)
            evt_subdir = fptr.Get(evt_subdir_name)
            evt_subdir.ReadAll()
            out_key = os.path.join(root_key, evt_dir_key)
            fo.mkdir(out_key)
            fo.cd(out_key)
            evt_subdir.GetList().Write()
        logging.info('Closing file {} after copying {}'.format(
            fptr.GetName(), root_key))
        fptr.Close()
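
A hypothetical usage, where the destination file is opened by the caller and closed afterwards (copy_dirs only creates and fills the event subdirectories):

fn = 'sync_2lss_1tau.root'                        # placeholder input file
fo = ROOT.TFile.Open('sync_combined.root', 'recreate')
copy_dirs(fn, fo)
fo.Close()
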
 def createCfg_addBackgrounds_LeptonFakeRate(self, jobOptions):
   """Create python configuration file for the addBackgrounds executable (sum either all "fake" or all "non-fake" contributions)
   Args:
     inputFile: input file (the ROOT file produced by hadd_stage1)
     outputFile: output file of the job
   """
   lines = []
   lines.append("process.fwliteInput.fileNames = cms.vstring('%s')" % jobOptions['inputFile'])
   lines.append("process.fwliteOutput.fileName = cms.string('%s')" % os.path.basename(jobOptions['outputFile']))
   # if self.use_QCD_fromMC:
   #   lines.append("process.addBackground_LeptonFakeRate.processData = cms.string('%s')" % "QCD")
   #   lines.append("process.addBackground_LeptonFakeRate.processLeptonFakes = cms.string('%s')" % "QCD")
   #   lines.append("process.addBackground_LeptonFakeRate.processesToSubtract = cms.vstring()")
   # lines.append("process.addBackgrounds.categories = cms.vstring(%s)" % jobOptions['categories'])
   # lines.append("process.addBackgrounds.processes_input = cms.vstring(%s)" % jobOptions['processes_input'])
   # lines.append("process.addBackgrounds.process_output = cms.string('%s')" % jobOptions['process_output'])
   logging.info("self.cfgFile_addBackgrounds_LeptonFakeRate => %s" % self.cfgFile_addBackgrounds_LeptonFakeRate)
   logging.info("jobOptions['cfgFile_modified'] => %s" % jobOptions['cfgFile_modified'])
   create_cfg(self.cfgFile_addBackgrounds_LeptonFakeRate, jobOptions['cfgFile_modified'], lines)
Example #12
def filter_samples(sample, condition, force = False):
  key = condition[0]
  regex = condition[1]

  sample_key = ALLOWED_CONDITION_KEYS[key]
  for sample_name, sample_entry in sample.items():
    if sample_name == 'sum_events': continue
    if sample_key == 'path':
      use_it = bool(regex.match(sample_entry['local_paths'][0]['path']))
    else:
      use_it = bool(regex.match(sample_entry[sample_key]))
    if force:
      sample_entry['use_it'] = use_it
    else:
      sample_entry['use_it'] &= use_it
    logging_str = 'Enabling' if sample_entry['use_it'] else 'Disabling'
    logging.info('%s sample %s' % (logging_str, sample_entry[ALLOWED_CONDITION_KEYS['name']]))

  return sample
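
A hypothetical condition for filter_samples: a (key, compiled regex) pair whose key must appear in ALLOWED_CONDITION_KEYS; with force=True the match decides use_it outright instead of AND-ing it with the previous value.

import re

condition = ('name', re.compile('TTZ.*'))   # hypothetical: keep only samples whose name matches TTZ*
samples = filter_samples(samples, condition, force=True)
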
Example #13
def validate_regions(rles):
    has_errors = False
    for channel in rles:
        validation_set = collections.OrderedDict()
        for region in rles[channel]:
            if 'Fakeable_mcClosure' in region:
                continue
            for sample_name in rles[channel][region]:
                if sample_name not in validation_set:
                    validation_set[sample_name] = collections.OrderedDict()
                for central_or_shift in rles[channel][region][sample_name]:
                    if central_or_shift not in validation_set[sample_name]:
                        validation_set[sample_name][
                            central_or_shift] = collections.OrderedDict()
                    for rle in rles[channel][region][sample_name][
                            central_or_shift]:
                        if rle not in validation_set[sample_name][
                                central_or_shift]:
                            validation_set[sample_name][central_or_shift][
                                rle] = []
                        if region not in validation_set[sample_name][
                                central_or_shift][rle]:
                            validation_set[sample_name][central_or_shift][
                                rle].append(region)
        for sample_name in validation_set:
            has_errors_sample = False
            for central_or_shift in validation_set[sample_name]:
                for rle in validation_set[sample_name][central_or_shift]:
                    if len(validation_set[sample_name][central_or_shift]
                           [rle]) > 1:
                        logging.error(
                            "Found duplicates in channel {} and sample {} for event {}: regions {}"
                            .format(
                                channel, sample_name, rle,
                                ', '.join(validation_set[sample_name]
                                          [central_or_shift][rle])))
                        has_errors_sample = True
            if not has_errors_sample:
                logging.info(
                    'No overlapping events found between regions for sample {} in channel {}'
                    .format(sample_name, channel))
            has_errors = has_errors or has_errors_sample
    return has_errors
Example #14
def record_weights(file_name):
    fptr = ROOT.TFile.Open(file_name, 'read')
    tree = fptr.Get('Events')

    genWeight = array.array('f', [0.])
    tree.SetBranchAddress(GENWEIGHT_NAME, genWeight)

    tree.SetBranchStatus("*", 0)
    tree.SetBranchStatus(GENWEIGHT_NAME, 1)

    nof_events = tree.GetEntries()
    logging.info("Processing {} events from file {}".format(
        nof_events, file_name))
    for event_idx in range(nof_events):
        tree.GetEntry(event_idx)
        genWeight_val = genWeight[0]
        if genWeight_val not in weights_map:
            weights_map[genWeight_val] = 0
        weights_map[genWeight_val] += 1
    fptr.Close()
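
record_weights relies on module-level state that is not part of the snippet; under the assumption that it looks roughly like the following, a call fills weights_map with the multiplicity of each distinct generator weight.

# Assumed module-level definitions (not shown in the example above):
# GENWEIGHT_NAME = 'genWeight'
# weights_map = {}
record_weights('tree_1.root')   # hypothetical input Ntuple
# weights_map now maps each distinct genWeight value to its number of occurrences
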
Example #15
    def run(self, clean):
        record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out,
                              DEPENDENCIES)
        target = 'all'
        if clean:
            if not os.path.isfile(self.makefile_path):
                logging.error(
                    "The makefile %s is missing and therefore it's not possible to clean anything; "
                    "run sync Ntuple production first!" % self.makefile_path)
                sys.exit(1)
            target = 'clean'

        nof_parallel_jobs = len(self.channel_info)
        make_cmd = "make -f %s -j %d %s 2>%s 1>%s" % \
          (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
        if self.running_method.lower() == "makefile":
            run_dir = re.sub('^/home', '/scratch', self.config_dir)
            create_if_not_exists(run_dir)
            make_cmd = re.sub('^make', 'make -C {}'.format(run_dir), make_cmd)
        logging.info("Running the make command: %s" % make_cmd)
        run_cmd(make_cmd)
        logging.info("All done")
Example #16
def validate_data(rles):
    has_errors = False
    for channel in rles:
        validation_set = collections.OrderedDict()
        has_errors_channel = False
        for region in rles[channel]:
            validation_set[region] = collections.OrderedDict()
            for sample_name in rles[channel][region]:
                if 'Run201' not in sample_name:
                    continue
                for central_or_shift in rles[channel][region][sample_name]:
                    for rle in rles[channel][region][sample_name][
                            central_or_shift]:
                        if rle not in validation_set[region]:
                            validation_set[region][
                                rle] = collections.OrderedDict()
                        if sample_name in validation_set[region][rle]:
                            validation_set[region][rle][sample_name].append(
                                central_or_shift)
                        else:
                            if validation_set[region][rle]:
                                logging.error(
                                    "Found duplicates in channel {} and region {} for event {}: samples {} and {}"
                                    .format(
                                        channel, region, rle, sample_name,
                                        ', '.join(validation_set[region]
                                                  [rle].keys())))
                                has_errors_channel = True
                            validation_set[region][rle][sample_name] = [
                                central_or_shift
                            ]
            if not has_errors_channel:
                logging.info(
                    'No overlapping data events found in channel {} and region {}'
                    .format(channel, region))
            has_errors = has_errors or has_errors_channel
    return has_errors
Example #17
def validate_pu(output_file, samples):
    error_code = 0
    if not os.path.isfile(output_file):
        logging.error('File {} does not exist'.format(output_file))
        return 1
    histogram_file = ROOT.TFile.Open(output_file, 'read')
    if not histogram_file:
        logging.error('Not a valid ROOT file: {}'.format(output_file))
        return 2
    for sample_name, sample_info in samples.items():
        is_mc = (sample_info["type"] == "mc")
        if not is_mc:
            continue
        process_name = sample_info["process_name_specific"]
        expected_nof_events = sample_info["nof_tree_events"]
        logging.info('Validating {} (expecting {} events)'.format(
            process_name, expected_nof_events))
        histogram = histogram_file.Get(process_name)
        if not histogram:
            logging.error("Could not find histogram '{}' in file {}".format(
                process_name, output_file))
            error_code = 3
            continue
        nof_events = int(histogram.GetEntries())
        if nof_events != expected_nof_events:
            logging.error(
                'Histogram {} in file {} has {} events, but expected {} events'
                .format(
                    process_name,
                    output_file,
                    nof_events,
                    expected_nof_events,
                ))
            error_code = 4
        else:
            logging.info(
                'Validation successful for sample {}'.format(process_name))
    histogram_file.Close()
    if error_code == 0:
        logging.info("Validation successful!")
    else:
        logging.error("Validation failed!")
    return error_code
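
A hypothetical driver for validate_pu, propagating the returned error code as the exit status:

import sys

exit_code = validate_pu('pileup_2017.root', samples)   # placeholder file name; samples is the usual sample dict
sys.exit(exit_code)
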
Example #18
    def __init__(
        self,
        configDir,
        outputDir,
        cfgFile_prodNtuple,
        samples,
        max_files_per_job,
        era,
        preselection_cuts,
        leptonSelection,
        hadTauWP,
        check_output_files,
        running_method,
        version,
        num_parallel_jobs,
        pileup,
        golden_json,
        dry_run,
        isDebug,
        gen_matching_by_index,
        use_nonnominal,
        use_home,
        skip_tools_step,
        verbose=False,
        pool_id='',
    ):

        self.configDir = configDir
        self.outputDir = outputDir
        self.max_num_jobs = 200000
        self.samples = samples
        self.max_files_per_job = max_files_per_job
        self.era = era
        self.preselection_cuts = preselection_cuts
        self.leptonSelection = leptonSelection
        self.hadTauWP = hadTauWP
        self.check_output_files = check_output_files
        self.verbose = verbose
        self.dry_run = dry_run
        self.isDebug = isDebug
        self.gen_matching_by_index = gen_matching_by_index
        self.use_nonnominal = use_nonnominal
        self.use_home = use_home
        self.pileup = pileup
        self.golden_json = golden_json
        if running_method.lower() not in ["sbatch", "makefile"]:
            raise ValueError("Invalid running method: %s" % running_method)

        if not os.path.isfile(self.pileup):
            raise ValueError('No such file: %s' % self.pileup)
        self.pileup_histograms = get_pileup_histograms(self.pileup)

        if not os.path.isfile(self.golden_json):
            raise ValueError('No such file: %s' % self.golden_json)

        self.running_method = running_method
        self.is_sbatch = self.running_method.lower() == "sbatch"
        self.is_makefile = not self.is_sbatch
        self.makefile = os.path.join(self.configDir, "Makefile_prodNtuple")
        self.num_parallel_jobs = num_parallel_jobs
        self.skip_tools_step = skip_tools_step
        self.pool_id = pool_id if pool_id else uuid.uuid4()

        self.workingDir = os.getcwd()
        logging.info("Working directory is: %s" % self.workingDir)
        self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src',
                                         'tthAnalysis', 'HiggsToTauTau',
                                         'test', 'templates')
        logging.info("Templates directory is: %s" % self.template_dir)

        self.version = version
        self.samples = samples

        create_if_not_exists(self.configDir)
        create_if_not_exists(self.outputDir)
        self.stdout_file_path = os.path.join(self.configDir,
                                             "stdout_prodNtuple.log")
        self.stderr_file_path = os.path.join(self.configDir,
                                             "stderr_prodNtuple.log")
        self.sw_ver_file_cfg = os.path.join(self.configDir,
                                            "VERSION_prodNtuple.log")
        self.sw_ver_file_out = os.path.join(self.outputDir,
                                            "VERSION_prodNtuple.log")
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out = get_log_version(
            (self.stdout_file_path, self.stderr_file_path,
             self.sw_ver_file_cfg, self.sw_ver_file_out))

        self.cfgFile_prodNtuple_original = os.path.join(
            self.template_dir, cfgFile_prodNtuple)
        self.sbatchFile_prodNtuple = os.path.join(self.configDir,
                                                  "sbatch_prodNtuple.py")
        self.cfgFiles_prodNtuple_modified = {}
        self.logFiles_prodNtuple = {}

        self.inputFiles = {}
        self.outputFiles = {}
        self.filesToClean = []
        self.dirs = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            key_dir = getKey(sample_name)
            for dir_type in [DKEY_CFGS, DKEY_NTUPLES, DKEY_LOGS]:
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.configDir, dir_type, process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, process_name)
        for dir_type in [DKEY_CFGS, DKEY_LOGS]:
            initDict(self.dirs, [dir_type])
            if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                self.dirs[dir_type] = os.path.join(self.configDir, dir_type)
            else:
                self.dirs[dir_type] = os.path.join(self.outputDir, dir_type)

        self.cvmfs_error_log = {}
        self.executable = "produceNtuple.sh"
Example #19
    def create(self):
        """Creates all necessary config files and runs the Ntuple production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if is_mc and process_name not in self.pileup_histograms:
                raise ValueError("Missing PU distribution for %s in file %s" %
                                 (process_name, self.pileup))

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList = generateInputFileList(sample_info,
                                                  self.max_files_per_job)
            key_dir = getKey(sample_name)
            subDirs = list(
                map(
                    lambda y: os.path.join(self.dirs[key_dir][DKEY_NTUPLES],
                                           '%04d' % y),
                    set(map(lambda x: x // 1000, inputFileList.keys()))))
            for subDir in subDirs:
                create_if_not_exists(subDir)
            for jobId in inputFileList.keys():

                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = inputFileList[jobId]
                if len(self.inputFiles[key_file]) == 0:
                    logging.warning(
                        "ntupleFiles['%s'] = %s --> skipping job !!" %
                        (key_file, self.inputFiles[key_file]))
                    continue
                self.cfgFiles_prodNtuple_modified[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "produceNtuple_%s_%i_cfg.py" % (process_name, jobId))
                self.outputFiles[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_NTUPLES], "%04d" % (jobId // 1000),
                    "tree_%i.root" % jobId)
                self.logFiles_prodNtuple[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS],
                    "produceNtuple_%s_%i.log" % (process_name, jobId))
                hlt_paths = sample_info["hlt_paths"] if not is_mc else []
                hlt_cuts = list(
                    Triggers(self.era).triggers_flat
                ) if self.preselection_cuts["applyHLTcut"] else []
                jobOptions = {
                    'inputFiles': self.inputFiles[key_file],
                    'cfgFile_modified':
                    self.cfgFiles_prodNtuple_modified[key_file],
                    'outputFile': self.outputFiles[key_file],
                    'is_mc': is_mc,
                    'random_seed': jobId,
                    'process_name': process_name,
                    'category_name': sample_info["sample_category"],
                    'triggers': hlt_paths,
                    'HLTcuts': hlt_cuts,
                }
                self.createCfg_prodNtuple(jobOptions)

        num_jobs = 0
        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            num_jobs = self.createScript_sbatch()
            logging.info("Generated %i job(s)" % num_jobs)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_prodNtuple(lines_makefile)
        self.createMakefile(lines_makefile)

        logging.info("Done")
        return num_jobs
Example #20
    sample_suffix = "sync" if use_nonnominal else "sync_nom"
    if use_preselected:
        sample_suffix = "preselected_{}".format(sample_suffix)
    samples = load_samples(era, suffix=sample_suffix)
    leptonSelection = "Fakeable"
    hadTauWP_map = {
        'dR03mva': 'Loose',
        'deepVSj': 'VLoose',
    }
    hadTauWP = tau_id + hadTauWP_map[tau_id]
else:
    raise ValueError("Invalid mode: %s" % mode)

if __name__ == '__main__':
    logging.info(
      "Running the jobs with the following systematic uncertainties enabled: %s" % \
      ', '.join(central_or_shifts)
    )

    if sample_filter:
        samples = filter_samples(samples, sample_filter)

    if args.tau_id_wp:
        logging.info("Changing tau ID WP: %s -> %s" %
                     (hadTauWP, args.tau_id_wp))
        hadTauWP = args.tau_id_wp
    hadTauSelectionAndWP = '%s|%s' % (hadTauSelection, hadTauWP)

    addMEMProduction = addMEMConfig_3l_1tau(
        treeName='Events',
        outputDir=os.path.join("/hdfs/local", getpass.getuser(), "addMEM", era,
                               version),
Example #21
    if args.tau_id_wp:
        tau_id = args.tau_id[:7]
    hadTau_selection_relaxed = tau_id + hadTauWP_map_relaxed[tau_id]
else:
    raise ValueError("Invalid mode: %s" % mode)

for sample_name, sample_info in samples.items():
    if sample_name == 'sum_events': continue
    if sample_name.startswith(("/DoubleEG/", "/DoubleMuon/", "/MuonEG/")):
        sample_info["use_it"] = False
    elif sample_name.startswith("/Tau/"):
        sample_info["use_it"] = True

if __name__ == '__main__':
    logging.info(
      "Running the jobs with the following systematic uncertainties enabled: %s" % \
      ', '.join(central_or_shifts)
    )
    if not use_preselected:
        logging.warning('Running the analysis on fully inclusive samples!')

    if sample_filter:
        samples = filter_samples(samples, sample_filter)

    if args.tau_id_wp:
        logging.info("Changing tau ID working point: %s -> %s" %
                     (hadTau_selection, args.tau_id_wp))
        hadTau_selection = args.tau_id_wp

    analysis = analyzeConfig_hh_1l_3tau(
        configDir=os.path.join("/scratch-persistent", getpass.getuser(),
                               "hhAnalysis", era, version),
Example #22
  def create(self):
    """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system
    """

    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
        continue
      process_name = sample_info["process_name_specific"]
      for lepton_selection in self.lepton_selections:
        for lepton_frWeight in self.lepton_frWeights:
          if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
            continue
          lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)
          central_or_shifts_extended = [ "" ]
          central_or_shifts_extended.extend(self.central_or_shifts)
          central_or_shifts_extended.extend([ "hadd", "addBackgrounds" ])
          for central_or_shift_or_dummy in central_or_shifts_extended:
            process_name_extended = [ process_name, "hadd" ]
            for process_name_or_dummy in process_name_extended:
              key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, central_or_shift_or_dummy)
              for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_RLES, DKEY_SYNC ]:
                initDict(self.dirs, [ key_dir, dir_type ])
                if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                  self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                    "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy)
                else:
                  self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                    "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy)
    for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]:
      key_dir = getKey(subdirectory)
      for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_DCRD, DKEY_PLOT ]:
        initDict(self.dirs, [ key_dir, dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
          self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
        else:
          self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]:
      initDict(self.dirs, [ dir_type ])
      if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
        self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
      else:
        self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

    numDirectories = 0
    for key in self.dirs.keys():
      if type(self.dirs[key]) == dict:
        numDirectories += len(self.dirs[key])
      else:
        numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
      if type(self.dirs[key]) == dict:
        for dir_type in self.dirs[key].keys():
          create_if_not_exists(self.dirs[key][dir_type])
        numDirectories_created += len(self.dirs[key])
      else:
        create_if_not_exists(self.dirs[key])
        numDirectories_created = numDirectories_created + 1
      while 100*numDirectories_created >= frac*numDirectories:
        logging.info(" %i%% completed" % frac)
        frac = frac + 1
    logging.info("Done.")

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
      if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
        continue
      logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
      inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

    mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights')
    for lepton_selection in self.lepton_selections:
      electron_selection = lepton_selection
      muon_selection = lepton_selection

      hadTauVeto_selection = "Tight"
      hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ])

      if lepton_selection == "Fakeable_mcClosure_e":
        electron_selection = "Fakeable"
        muon_selection = "Tight"
      elif lepton_selection == "Fakeable_mcClosure_m":
        electron_selection = "Tight"
        muon_selection = "Fakeable"

      for lepton_frWeight in self.lepton_frWeights:
        if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
          continue
        if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight" ]:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)

        for sample_name, sample_info in self.samples.items():
          if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
            continue
          process_name = sample_info["process_name_specific"]
          logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))

          sample_category = sample_info["sample_category"]
          is_mc = (sample_info["type"] == "mc")
          is_signal = (sample_category == "signal")

          for central_or_shift in self.central_or_shifts:

            inputFileList = inputFileLists[sample_name]
            for jobId in inputFileList.keys():
              if central_or_shift != "central":
                isFR_shape_shift = (central_or_shift in systematics.FR_all)
                if not ((lepton_selection == "Fakeable" and isFR_shape_shift) or lepton_selection == "Tight"):
                  continue
                if not is_mc and not isFR_shape_shift:
                  continue

              if central_or_shift in systematics.LHE().ttH and sample_category != "signal":
                continue
              if central_or_shift in systematics.LHE().ttW and sample_category != "TTW":
                continue
              if central_or_shift in systematics.LHE().ttZ and sample_category != "TTZ":
                continue
              if central_or_shift in systematics.DYMCReweighting and not is_dymc_reweighting(sample_name):
                continue

              logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift))

              # build config files for executing analysis code
              key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, central_or_shift)
              analyze_job_tuple = (process_name, lepton_selection_and_frWeight, central_or_shift, jobId)
              key_analyze_job = getKey(*analyze_job_tuple)
              ntupleFiles = inputFileList[jobId]
              if len(ntupleFiles) == 0:
                logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
                continue

              syncOutput = ''
              syncTree = ''
              syncRequireGenMatching = True
              if self.do_sync:
                mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight)
                if lepton_selection_and_frWeight == 'Tight':
                  syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift))
                  syncTree = 'syncTree_%s_SR' % self.channel
                  syncRequireGenMatching = True
                elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights':
                  syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift))
                  syncTree = 'syncTree_%s_Fake' % self.channel
                elif mcClosure_match:
                  mcClosure_type = mcClosure_match.group('type')
                  syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type))
                  syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel, mcClosure_type)
                else:
                  continue
              if syncTree and central_or_shift != "central":
                syncTree = os.path.join(central_or_shift, syncTree)
              syncRLE = ''
              if self.do_sync and self.rle_select:
                syncRLE = self.rle_select % syncTree
                if not os.path.isfile(syncRLE):
                  logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE)
                  continue
              if syncOutput:
                self.inputFiles_sync['sync'].append(syncOutput)

              cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
              logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple)
              rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                   if self.select_rle_output else ""
              histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple)

              self.jobOptions_analyze[key_analyze_job] = {
                'ntupleFiles'              : ntupleFiles,
                'cfgFile_modified'         : cfgFile_modified_path,
                'histogramFile'            : histogramFile_path,
                'logFile'                  : logFile_path,
                'selEventsFileName_output' : rleOutputFile_path,
                'electronSelection'        : electron_selection,
                'muonSelection'            : muon_selection,
                'apply_leptonGenMatching'  : self.apply_leptonGenMatching,
                'hadTauSelection_veto'     : hadTauVeto_selection,
                'applyFakeRateWeights'     : self.applyFakeRateWeights if not lepton_selection == "Tight" else "disabled",
                'central_or_shift'         : central_or_shift,
                'syncOutput'               : syncOutput,
                'syncTree'                 : syncTree,
                'syncRLE'                  : syncRLE,
                'syncRequireGenMatching'   : syncRequireGenMatching,
                'useNonNominal'            : self.use_nonnominal,
                'apply_hlt_filter'         : self.hlt_filter,
              }
              self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection)

              # initialize input and output file names for hadd_stage1
              key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight)
              hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight)
              key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
              if not key_hadd_stage1_job in self.inputFiles_hadd_stage1:
                self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
              self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
              self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                                                                              "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple)

          if self.do_sync: continue

          if is_mc:
            logging.info("Creating configuration files to run 'addBackgrounds' for sample %s" % process_name)

            sample_categories = [ sample_category ]
            if is_signal:
              sample_categories = [ "signal", "ttH", "ttH_htt", "ttH_hww", "ttH_hzz", "ttH_hmm", "ttH_hzg" ]
            for sample_category in sample_categories:
              # sum non-fake and fake contributions for each MC sample separately
              genMatch_categories = [ "nonfake", "conversions", "fake" ]
              for genMatch_category in genMatch_categories:
                key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight)
                key_addBackgrounds_dir = getKey(process_name, lepton_selection_and_frWeight, "addBackgrounds")
                addBackgrounds_job_tuple = None
                processes_input = None
                process_output = None
                if genMatch_category == "nonfake":
                  # sum non-fake contributions for each MC sample separately
                  # input processes: TT3l0g0j,...
                  # output processes: TT; ...
                  if sample_category in [ "signal" ]:
                    lepton_genMatches = []
                    lepton_genMatches.extend(self.lepton_genMatches_nonfakes)
                    lepton_genMatches.extend(self.lepton_genMatches_conversions)
                    lepton_genMatches.extend(self.lepton_genMatches_fakes)
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in lepton_genMatches ]
                  elif sample_category in [ "ttH" ]:
                    lepton_genMatches = []
                    lepton_genMatches.extend(self.lepton_genMatches_nonfakes)
                    lepton_genMatches.extend(self.lepton_genMatches_conversions)
                    processes_input = []
                    processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in lepton_genMatches ])
                    processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in lepton_genMatches ])
                  else:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_nonfakes ]
                  process_output = sample_category
                  addBackgrounds_job_tuple = (process_name, sample_category, lepton_selection_and_frWeight)
                elif genMatch_category == "conversions":
                  # sum conversion contributions for each MC sample separately
                  # input processes: TT2l1g0j, TT1l2g0j, TT0l3g0j; ...
                  # output processes: TT_conversion; ...
                  if sample_category in [ "signal" ]:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ]
                  elif sample_category in [ "ttH" ]:
                    processes_input = []
                    processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                    processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_conversions ])
                  else:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ]
                  process_output = "%s_conversion" % sample_category
                  addBackgrounds_job_tuple = (process_name, "%s_conversion" % sample_category, lepton_selection_and_frWeight)
                elif genMatch_category == "fake":
                  # sum fake contributions for each MC sample separately
                  # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l2g1j, TT0l1g2j, TT0l0g3j; ...
                  # output processes: TT_fake; ...
                  if sample_category in [ "signal" ]:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ]
                  elif sample_category in [ "ttH" ]:
                    processes_input = []
                    processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                    processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_fakes ])
                  else:
                    processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ]
                  process_output = "%s_fake" % sample_category
                  addBackgrounds_job_tuple = (process_name, "%s_fake" % sample_category, lepton_selection_and_frWeight)
                if processes_input:
                  logging.info(" ...for genMatch option = '%s'" % genMatch_category)
                  key_addBackgrounds_job = getKey(*addBackgrounds_job_tuple)
                  cfgFile_modified = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_tuple)
                  outputFile = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_tuple)
                  self.jobOptions_addBackgrounds[key_addBackgrounds_job] = {
                    'inputFile' : self.outputFile_hadd_stage1[key_hadd_stage1_job],
                    'cfgFile_modified' : cfgFile_modified,
                    'outputFile' : outputFile,
                    'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(cfgFile_modified).replace("_cfg.py", ".log")),
                    'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
                    'processes_input' : processes_input,
                    'process_output' : process_output
                  }
                  self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds[key_addBackgrounds_job])

                  # initialize input and output file names for hadd_stage1_5
                  key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight)                  
                  key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
                  if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5:
                    self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
                  self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.jobOptions_addBackgrounds[key_addBackgrounds_job]['outputFile'])
                  self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST],
                                                                                      "hadd_stage1_5_%s.root" % lepton_selection_and_frWeight)

          # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5
          if not is_mc:
            key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight)
            key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
            if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5:
              self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
            self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])

        if self.do_sync: continue

        # sum fake background contributions for the total of all MC samples
        # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l3j, TT0l3j, TT0l3j, TT0l3j; ...
        # output process: fakes_mc
        key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
        key_addBackgrounds_dir = getKey("addBackgrounds")
        addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight)
        key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple)
        sample_categories = []
        sample_categories.extend(self.nonfake_backgrounds)
        sample_categories.extend([ "signal" ])
        processes_input = []
        for sample_category in sample_categories:
          processes_input.append("%s_fake" % sample_category)
        self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = {
          'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple),
          'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_fakes_tuple),
          'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_fakes_tuple),
          'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
          'processes_input' : processes_input,
          'process_output' : "fakes_mc"
        }
        self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes])

        # sum conversion background contributions for the total of all MC samples
        # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l3j, TT0l3j, TT0l3j, TT0l3j; ...
        # output process: conversions
        addBackgrounds_job_conversions_tuple = ("conversions", lepton_selection_and_frWeight)
        key_addBackgrounds_job_conversions = getKey(*addBackgrounds_job_conversions_tuple)
        sample_categories = []
        sample_categories.extend(self.nonfake_backgrounds)
        sample_categories.extend([ "signal" ])
        processes_input = []
        for sample_category in sample_categories:
          processes_input.append("%s_conversion" % sample_category)
        self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions] = {
          'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
          'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_conversions_tuple),
          'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_conversions_tuple),
          'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_conversions_tuple),
          'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ],
          'processes_input' : processes_input,
          'process_output' : "conversions"
        }
        self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions])

        # initialize input and output file names for hadd_stage2
        key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight)
        key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight)        
        key_hadd_stage2_job = getKey(lepton_selection_and_frWeight)
        if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
          self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
        if lepton_selection == "Tight":
          self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'])
          self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions]['outputFile'])        
        self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
        self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                                                                        "hadd_stage2_%s.root" % lepton_selection_and_frWeight)

    if self.do_sync:
      if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating Makefile")
      lines_makefile = []
      self.addToMakefile_syncNtuple(lines_makefile)
      outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel)
      self.outputFile_sync['sync'] = outputFile_sync_path
      self.targets.append(outputFile_sync_path)
      self.addToMakefile_hadd_sync(lines_makefile)
      self.createMakefile(lines_makefile)
      logging.info("Done.")
      return self.num_jobs

    logging.info("Creating configuration files to run 'addBackgroundFakes'")
    key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"))
    key_addFakes_job = getKey("fakes_data")
    category_sideband = "ttZctrl_Fakeable_wFakeRateWeights"
    self.jobOptions_addFakes[key_addFakes_job] = {
      'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
      'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS], "addBackgroundLeptonFakes_cfg.py"),
      'outputFile' : os.path.join(self.dirs[DKEY_HIST], "addBackgroundLeptonFakes.root"),
      'logFile' : os.path.join(self.dirs[DKEY_LOGS], "addBackgroundLeptonFakes.log"),
      'category_signal' : "ttZctrl_Tight",
      'category_sideband' : category_sideband
    }
    self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job])
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])

    logging.info("Creating configuration files to run 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
      key_prep_dcard_dir = getKey("prepareDatacards")
      prep_dcard_job_tuple = (self.channel, histogramToFit)
      key_prep_dcard_job = getKey(histogramToFit)      
      self.jobOptions_prep_dcard[key_prep_dcard_job] = {
        'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
        'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple),
        'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple),
        'histogramDir' : self.histogramDir_prep_dcard,
        'histogramToFit' : histogramToFit,
        'label' : None
      }
      self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

      # add shape templates for the following systematic uncertainties:
      #  - 'CMS_ttHl_Clos_norm_e'
      #  - 'CMS_ttHl_Clos_shape_e'
      #  - 'CMS_ttHl_Clos_norm_m'
      #  - 'CMS_ttHl_Clos_shape_m'
      key_prep_dcard_job = getKey(histogramToFit)
      key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
      key_add_syst_fakerate_dir = getKey("addSystFakeRates")                                    
      add_syst_fakerate_job_tuple = (self.channel, histogramToFit) 
      key_add_syst_fakerate_job = getKey(histogramToFit)      
      self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = {
        'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'],
        'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_cfg.py" % add_syst_fakerate_job_tuple),
        'outputFile' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s.root" % add_syst_fakerate_job_tuple),
        'category' : self.channel,
        'histogramToFit' : histogramToFit,
        'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "addSystFakeRates.png")
      }
      histogramDir_nominal = self.histogramDir_prep_dcard
      for lepton_type in [ 'e', 'm' ]:
        lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type
        if lepton_mcClosure not in self.lepton_selections:
          continue
        lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled")
        key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight)
        histogramDir_mcClosure = self.mcClosure_dir[lepton_mcClosure]
        self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({
          'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections,
          'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job],
          'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit),
          'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'],
          'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit)
        })
      self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job])

    logging.info("Creating configuration files to run 'makePlots'")
    key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"))
    key_makePlots_dir = getKey("makePlots")                                       
    key_makePlots_job = getKey('')
    self.jobOptions_make_plots[key_makePlots_job] = {
      'executable' : self.executable_make_plots,
      'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job],
      'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
      'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
      'histogramDir' : self.histogramDir_prep_dcard,
      'label' : "t#bar{t}Z control region",
      'make_plots_backgrounds' : self.make_plots_backgrounds
    }
    self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])

    if self.is_sbatch:
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
      self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
      self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds)
      self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds)
      self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel)
      self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
      logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes)
      self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel)
      self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_add_syst_fakerate(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done.")

    return self.num_jobs
Example #23
        if sample_name == 'sum_events':
            continue
        if re.match(r"(^WZTo3LNu$|^WZTo3LNu_ext(\d)?$)",
                    sample_info["process_name_specific"]):
            sample_info["use_it"] = True

if rle_filter_file:
    rle_filter_file = os.path.join(os.environ['CMSSW_BASE'], 'src',
                                   'tthAnalysis', 'HiggsToTauTau', 'data',
                                   'mem', rle_filter_file)
    if not os.path.isfile(rle_filter_file):
        raise ValueError("No such file: %s" % rle_filter_file)

if __name__ == '__main__':
    logging.info(
      "Running the jobs with the following systematic uncertainties enabled: %s" % \
      ', '.join(central_or_shifts)
    )

    if sample_filter:
        samples = filter_samples(samples, sample_filter)

    addMEMProduction = addMEMConfig_3l(
        treeName='Events',
        outputDir=os.path.join("/hdfs/local", getpass.getuser(), "addMEM", era,
                               version),
        cfgDir=os.path.join("/home", getpass.getuser(), "addMEM", era,
                            version),
        executable_addMEM="addMEM_3l",
        samples=samples,
        era=era,
        check_output_files=check_output_files,
Example #24
    def create(self):
        """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if not is_mc:
                continue

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList = generateInputFileList(sample_info,
                                                  self.max_files_per_job)
            key_dir = getKey(process_name)

            outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO],
                                      "%s.root" % process_name)
            if os.path.isfile(outputFile) and tools_is_file_ok(
                    outputFile, min_file_size=2000):
                logging.info('File {} already exists --> skipping job'.format(
                    outputFile))
                continue

            self.outputFiles[process_name] = {
                'inputFiles': [],
                'outputFile': outputFile
            }

            for jobId in inputFileList.keys():

                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = inputFileList[jobId]
                if len(self.inputFiles[key_file]) == 0:
                    logging.warning(
                        "ntupleFiles['%s'] = %s --> skipping job !!" %
                        (key_file, self.inputFiles[key_file]))
                    continue

                self.cfgFiles_puProfile[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "puProfile_%s_%i_cfg.txt" % (process_name, jobId))
                self.outputFiles_tmp[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_HISTO_TMP],
                    "histogram_%i.root" % jobId)
                self.logFiles_puProfile[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS],
                    "puProfile_%s_%i.log" % (process_name, jobId))
                self.scriptFiles_puProfile[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "puProfile_%s_%i_cfg.sh" % (process_name, jobId))
                self.jobOptions_sbatch[key_file] = {
                    'histName': process_name,
                    'inputFiles': self.inputFiles[key_file],
                    'cfgFile_path': self.cfgFiles_puProfile[key_file],
                    'outputFile': self.outputFiles_tmp[key_file],
                    'logFile': self.logFiles_puProfile[key_file],
                    'scriptFile': self.scriptFiles_puProfile[key_file],
                }
                self.createCfg_puProfile(self.jobOptions_sbatch[key_file])
                self.outputFiles[process_name]['inputFiles'].append(
                    self.outputFiles_tmp[key_file])

        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            self.num_jobs['puProfile'] += self.createScript_sbatch(
                self.executable, self.sbatchFile_puProfile,
                self.jobOptions_sbatch)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_puProfile(lines_makefile)
        self.addToMakefile_hadd(lines_makefile)
        self.addToMakefile_plot(lines_makefile)
        self.addToMakefile_finalHadd(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done")

        return self.num_jobs
Example #25
    def create(self):
        """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if not is_mc:
                continue

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList = generateInputFileList(sample_info,
                                                  self.max_files_per_job)
            key_dir = getKey(process_name)

            outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO],
                                      "%s.root" % process_name)
            self.outputFiles[process_name] = {
                'inputFiles': [],
                'outputFile': outputFile,
            }
            if os.path.isfile(outputFile) and tools_is_file_ok(
                    outputFile, min_file_size=2000):
                logging.info('File {} already exists --> skipping job'.format(
                    outputFile))
                continue

            for jobId in inputFileList.keys():

                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = inputFileList[jobId]
                if len(self.inputFiles[key_file]) == 0:
                    logging.warning("'%s' = %s --> skipping job !!" %
                                    (key_file, self.inputFiles[key_file]))
                    continue

                self.cfgFiles_projection[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "project_%s_%i_cfg.txt" % (process_name, jobId))
                self.outputFiles_tmp[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_HISTO_TMP],
                    "histogram_%i.root" % jobId)
                self.logFiles_projection[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS],
                    "project_%s_%i.log" % (process_name, jobId))
                self.scriptFiles_projection[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS],
                    "project_%s_%i_cfg.sh" % (process_name, jobId))
                projection_module = self.projection_module
                if projection_module == "count":
                    projection_module = "countHistogramAll"
                    if sample_name.startswith('/TTTo'):
                        projection_module += "CompTopRwgt"
                    elif sample_info['sample_category'].startswith('ttH'):
                        projection_module += "CompHTXS"
                    elif isSplitByNlheJet(process_name):
                        projection_module += "SplitByLHENjet"
                    elif isSplitByNlheHT(process_name):
                        projection_module += "SplitByLHEHT"
                    elif isSplitByNlheJetHT(process_name, sample_name):
                        projection_module += "SplitByLHENjetHT"
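                # e.g. when self.projection_module == "count" and sample_name starts with '/TTTo'
                # (hypothetical sample), the module name resolves to 'countHistogramAllCompTopRwgt'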
                self.jobOptions_sbatch[key_file] = {
                    'histName': process_name,
                    'inputFiles': self.inputFiles[key_file],
                    'cfgFile_path': self.cfgFiles_projection[key_file],
                    'outputFile': self.outputFiles_tmp[key_file],
                    'logFile': self.logFiles_projection[key_file],
                    'scriptFile': self.scriptFiles_projection[key_file],
                    'projection_module': projection_module,
                }
                if self.projection_module != 'puHist':
                    # validate before the lookup so that a missing entry raises an informative error
                    if process_name not in self.ref_genWeights:
                        raise RuntimeError(
                            "Unable to find reference LHE weight for process %s"
                            % process_name)
                    self.jobOptions_sbatch[key_file][
                        'ref_genWeight'] = self.ref_genWeights[process_name]
                self.createCfg_project(self.jobOptions_sbatch[key_file])
                self.outputFiles[process_name]['inputFiles'].append(
                    self.outputFiles_tmp[key_file])

        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            self.num_jobs['project'] += self.createScript_sbatch(
                self.executable, self.sbatchFile_projection,
                self.jobOptions_sbatch)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_project(lines_makefile)
        self.addToMakefile_hadd(lines_makefile)
        if self.plot:
            self.addToMakefile_plot(lines_makefile)
        self.addToMakefile_finalHadd(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done")

        return self.num_jobs
Example #26
    def poll(self, nonBlocking):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """
        text_line = '-' * 120

        # Set a delimiter, which distinguishes entries b/w different jobs
        delimiter = ','
        # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
        # 1) squeue -h -u {{user}} -o '%i %256k'
        #      Collects the list of running jobs
        #        a) -h omits header
        #        b) -u {{user}} looks only for jobs submitted by {{user}}
        #        c) -o '%i %256k' specifies the output format
        #           i)  %i -- job ID (1st column)
        #           ii) %256k -- comment with width of 256 characters (2nd column)
        #               If the job has no comments, the entry simply reads (null)
        # 2) grep {{comment}}
        #       Filter the jobs by the comment which must be unique per sbatchManager instance at all times
        # 3) awk '{print $1}'
        #       Filter only the jobIds out
        # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
        #       Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
        command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                           "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
        command = jinja2.Template(command_template).render(
            user=self.user,
            pool_id_length=self.max_pool_id_length,
            comment=self.pool_id,
            delimiter=delimiter)
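        # Illustrative rendering (hypothetical user and pool ID), assuming max_pool_id_length = 256:
        #   squeue -h -u jdoe -o '%i %256k' | grep deadbeef | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'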

        # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
        # even if some of them have already finished
        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        while nofJobs_left > 0:
            # Get the list of jobs submitted to batch system and convert their jobIds to a set
            poll_result, poll_result_err = '', ''
            while True:
                poll_result, poll_result_err = run_cmd(command,
                                                       do_not_log=False,
                                                       return_stderr=True)
                if not poll_result and poll_result_err:
                    logging.warning(
                        'squeue caught an error: {squeue_error}'.format(
                            squeue_error=poll_result_err))
                else:
                    break
                # sleep a minute and then try again
                # in principle we could limit the number of retries, but hopefully that's not necessary
                logging.debug("sleeping for %i seconds." % 60)
                time.sleep(60)
            polled_ids = set()
            if poll_result != '':
                polled_ids = set(poll_result.split(delimiter))

            # Check if number of jobs submitted to batch system is below maxSubmittedJobs;
            # if it is, take jobs from queuedJobs list and submit them,
            # until a total of maxSubmittedJobs is submitted to batch system
            nofJobs_toSubmit = min(len(self.queuedJobs),
                                   self.maxSubmittedJobs - len(polled_ids))
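            # e.g. with maxSubmittedJobs = 1000 (hypothetical), 800 jobs currently polled and 500 queued,
            # min(500, 1000 - 800) = 200 more jobs would be submitted in this iteration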
            if nofJobs_toSubmit > 0:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs."
                    .format(len(polled_ids), len(self.queuedJobs),
                            nofJobs_toSubmit))
            else:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing."
                    .format(len(polled_ids), len(self.queuedJobs)))
            for i in range(0, nofJobs_toSubmit):
                # randomly submit a job from the queue
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_idx = len(self.queuedJobs) - 1
                random_idx = random.randint(0, max_idx)
                job = self.queuedJobs.pop(random_idx)
                job['status'] = Status.submitted
                job_id = self.submit(job['sbatch_command'])
                self.submittedJobs[job_id] = job

            # Now check status of jobs submitted to batch system:
            # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
            # jobs that have finished already
            finished_ids = list(jobIds_set - polled_ids)

            # Do not poll anything if currently there are no finished jobs
            if finished_ids:
                # Based on the job's exit code, check whether the job has failed or completed successfully
                # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
                # Therefore, we want to restrict the output by grepping specific job IDs
                # There's another problem with that: the length of a bash command is limited by ARG_MAX kernel variable,
                # which is of order 2e6
                # This means that we have to split the job IDs into chunks each of which we have to check separately
                finished_ids_chunks = [
                    finished_ids[i:i + self.max_nof_greps]
                    for i in range(0, len(finished_ids), self.max_nof_greps)
                ]
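                # e.g. with max_nof_greps = 2, finished_ids = ['101', '102', '103'] is split into
                # chunks [['101', '102'], ['103']], each of which is checked separately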
                for finished_ids_chunk in finished_ids_chunks:
                    completion = self.check_job_completion(finished_ids_chunk)
                    completed_jobs, running_jobs, failed_jobs = [], [], []
                    for job_id, details in completion.items():
                        if details.status == Status.completed:
                            completed_jobs.append(job_id)
                        elif details.status == Status.running:
                            running_jobs.append(job_id)
                        else:
                            failed_jobs.append(job_id)
                    # If there are any failed jobs, throw
                    if failed_jobs:

                        failed_jobs_str = ','.join(failed_jobs)
                        errors = [
                            completion[job_id].status for job_id in failed_jobs
                        ]
                        logging.error(
                            "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}"
                            .format(
                                jobIds=failed_jobs_str,
                                reasons=', '.join(map(Status.toString,
                                                      errors)),
                            ))

                        # Let's print a table where the first column corresponds to the job ID
                        # and the second column lists the exit code, the derived exit code, the status
                        # and the classification of the failed job
                        logging.error("Error table:")
                        for job_id in failed_jobs:
                            sys.stderr.write(
                                "{jobId} {exitCode} {derivedExitCode} {state} {status}\n"
                                .format(
                                    jobId=job_id,
                                    exitCode=completion[job_id].exit_code,
                                    derivedExitCode=completion[job_id].
                                    derived_exit_code,
                                    state=completion[job_id].state,
                                    status=Status.toString(
                                        completion[job_id].status),
                                ))

                        sys.stderr.write('%s\n' % text_line)
                        for failed_job in failed_jobs:
                            for log in zip(['wrapper', 'executable'],
                                           ['log_wrap', 'log_exec']):
                                logfile = self.submittedJobs[failed_job][
                                    log[1]]
                                if os.path.isfile(logfile):
                                    logfile_contents = open(logfile,
                                                            'r').read()
                                else:
                                    logfile_contents = '<file is missing>'
                                sys.stderr.write(
                                    'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'
                                    .format(
                                        id=failed_job,
                                        description=log[0],
                                        path=logfile,
                                        log=logfile_contents,
                                        line=text_line,
                                    ))

                            if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                               completion[failed_job].status == Status.io_error:
                                # The job is eligible for resubmission if the job hasn't been resubmitted more
                                # than a preset limit of resubmissions AND if the job failed due to I/O errors
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=failed_job,
                                        args=self.submittedJobs[failed_job]
                                        ['args'],
                                        reason=Status.toString(
                                            completion[failed_job].status),
                                        attempt=self.submittedJobs[failed_job]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[failed_job]['args'])
                                # The old ID must be deleted, b/c otherwise it would be used to compare against
                                # squeue output and we would resubmit the failed job ad infinitum
                                del self.submittedJobs[failed_job]
                            else:
                                # We've exceeded the maximum number of resubmissions -> fail the workflow
                                raise Status.raiseError(
                                    completion[failed_job].status)
                    else:
                        logging.debug(
                            "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}"
                            .format(
                                completedIds=','.join(completed_jobs),
                                runningInfo='(%s still running)' %
                                ','.join(running_jobs) if running_jobs else '',
                            ))
                    # Mark successfully finished jobs as completed so that we won't request their status code again
                    # Otherwise they would still remain in the 'submitted' state
                    for job_id in completed_jobs:
                        if not all(
                                map(
                                    lambda outputFile: is_file_ok(
                                        outputFile,
                                        validate_outputs=True,
                                        min_file_size=self.min_file_size), self
                                    .submittedJobs[job_id]['outputFiles'])):
                            if self.submittedJobs[job_id][
                                    'nof_submissions'] < self.max_resubmissions:
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=job_id,
                                        args=self.submittedJobs[job_id]
                                        ['args'],
                                        attempt=self.submittedJobs[job_id]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[job_id]['args'])
                                del self.submittedJobs[job_id]
                            else:
                                raise ValueError(
                                    "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                    "file {output} yet the job still exits w/o any errors"
                                    .format(
                                        id=job_id,
                                        output=', '.join(
                                            self.submittedJobs[job_id]
                                            ['outputFiles']),
                                    ))
                        else:
                            # Job completed just fine
                            self.submittedJobs[job_id][
                                'status'] = Status.completed

            jobIds_set = set([
                job_id for job_id in self.submittedJobs
                if self.submittedJobs[job_id]['status'] == Status.submitted
            ])
            nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
            logging.info(
                "Waiting for sbatch to finish (%d job(s) still left) ..." %
                nofJobs_left)
            if nofJobs_left > 0:
                if nonBlocking:
                    return False
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_delay = 300
                random_delay = random.randint(0, max_delay)
                logging.debug("sleeping for %i seconds." % random_delay)
                time.sleep(self.poll_interval + random_delay)
            else:
                break

        return True
Example #27
elif mode == "sync":
    sample_suffix = "sync" if use_nonnominal else "sync_nom"
    if use_preselected:
        sample_suffix = "preselected_{}".format(sample_suffix)
    samples = load_samples(era, suffix=sample_suffix)
else:
    raise ValueError("Invalid mode: %s" % mode)

for sample_name, sample_info in samples.items():
    if sample_name == 'sum_events': continue
    if sample_name.startswith('/Tau/Run'):
        sample_info["use_it"] = False

if __name__ == '__main__':
    logging.info(
      "Running the jobs with the following systematic uncertainties enabled: %s" % \
      ', '.join(central_or_shifts)
    )

    if sample_filter:
        samples = filter_samples(samples, sample_filter)

    configDir = os.path.join("/home", getpass.getuser(), "ttHAnalysis", era,
                             version)
    outputDir = os.path.join("/hdfs/local", getpass.getuser(), "ttHAnalysis",
                             era, version)

    analysis = analyzeConfig_2lss(
        configDir=configDir,
        outputDir=outputDir,
        executable_analyze="analyze_2lss",
        cfgFile_analyze="analyze_2lss_cfg.py",
Example #28
    def create(self):
        """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        self.inputFileIds = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue

            process_name = sample_info["process_name_specific"]
            is_mc = (sample_info["type"] == "mc")

            if not is_mc:
                continue

            logging.info(
                "Creating configuration files to run '%s' for sample %s" %
                (self.executable, process_name))

            inputFileList_map = generateInputFileList(sample_info, 1)
            key_dir = getKey(process_name)
            key_file = getKey(process_name)

            self.inputFiles[key_file] = list(
                itertools.chain(*inputFileList_map.values()))
            if len(self.inputFiles[key_file]) == 0:
                logging.warning("'%s' = %s --> skipping job !!" %
                                (key_file, self.inputFiles[key_file]))
                continue

            outputFile = os.path.join(self.dirs[key_dir][DKEY_RESULTS],
                                      "%s.txt" % process_name)
            self.outputFiles[key_file] = outputFile
            if os.path.isfile(outputFile):
                logging.info('File {} already exists --> skipping job'.format(
                    outputFile))
                continue

            self.cfgFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_CFGS],
                "refGenWeight_%s_cfg.txt" % (process_name))
            self.logFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_LOGS],
                "refGenWeight_%s.log" % (process_name))
            self.scriptFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_CFGS],
                "refGenWeight_%s_cfg.sh" % (process_name))
            self.plotFiles[key_file] = ' '.join([
                os.path.join(self.dirs[key_dir][DKEY_PLOTS],
                             "refGenWeight_%s.%s" % (process_name, extension))
                for extension in ['pdf', 'png']
            ])

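            # The command-line flags passed via 'cmdParams' below are assumed to mean:
            # -i <cfg file listing the inputs>, -o <output text file>, -p <plot file(s)>, -v (verbose)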
            self.jobOptions_sbatch[key_file] = {
                'inputFiles': self.inputFiles[key_file],
                'cfgFile_path': self.cfgFiles[key_file],
                'cmdParams': "-i {} -o {} -p {} -v".format(
                    self.cfgFiles[key_file],
                    self.outputFiles[key_file],
                    self.plotFiles[key_file],
                ),
                'outputFile': self.outputFiles[key_file],
                'logFile': self.logFiles[key_file],
                'scriptFile': self.scriptFiles[key_file],
            }
            self.createCfg(self.jobOptions_sbatch[key_file])

        if self.is_sbatch:
            logging.info(
                "Creating script for submitting '%s' jobs to batch system" %
                self.executable)
            self.num_jobs['refGenWeight'] += self.createScript_sbatch(
                self.executable, self.sbatchFile, self.jobOptions_sbatch)

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile(lines_makefile)
        self.addToMakefile_final(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done")

        return self.num_jobs
Example #29
    def __init__(
        self,
        configDir,
        outputDir,
        output_file,
        executable,
        projection_module,
        samples,
        max_files_per_job,
        era,
        plot,
        check_output_files,
        running_method,
        num_parallel_jobs,
        pool_id='',
        verbose=False,
        dry_run=False,
        use_home=False,
        submission_cmd=None,
    ):

        self.configDir = configDir
        self.outputDir = outputDir
        self.executable = executable
        self.projection_module = projection_module
        self.max_num_jobs = 200000
        self.samples = samples
        self.max_files_per_job = max_files_per_job
        self.era = era
        self.plot = plot
        self.check_output_files = check_output_files
        self.verbose = verbose
        self.dry_run = dry_run
        self.use_home = use_home
        if running_method.lower() not in ["sbatch", "makefile"]:
            raise ValueError("Invalid running method: %s" % running_method)

        self.running_method = running_method
        self.is_sbatch = self.running_method.lower() == "sbatch"
        self.is_makefile = not self.is_sbatch
        self.makefile = os.path.join(
            self.configDir, "Makefile_{}".format(self.projection_module))
        self.num_parallel_jobs = num_parallel_jobs
        self.pool_id = pool_id if pool_id else uuid.uuid4()

        self.workingDir = os.getcwd()
        logging.info("Working directory is: %s" % self.workingDir)
        self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src',
                                         'tthAnalysis', 'HiggsToTauTau',
                                         'test', 'templates')
        logging.info("Templates directory is: %s" % self.template_dir)

        create_if_not_exists(self.configDir)
        create_if_not_exists(self.outputDir)
        self.output_file = os.path.join(self.outputDir, output_file)
        self.stdout_file_path = os.path.join(
            self.configDir, "stdout_{}.log".format(self.projection_module))
        self.stderr_file_path = os.path.join(
            self.configDir, "stderr_{}.log".format(self.projection_module))
        self.sw_ver_file_cfg = os.path.join(
            self.configDir, "VERSION_{}.log".format(self.projection_module))
        self.sw_ver_file_out = os.path.join(
            self.outputDir, "VERSION_{}.log".format(self.projection_module))
        self.submission_out = os.path.join(self.configDir, "SUBMISSION.log")
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out = get_log_version(
            (self.stdout_file_path, self.stderr_file_path,
             self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out))
        check_submission_cmd(self.submission_out, submission_cmd)

        self.sbatchFile_projection = os.path.join(
            self.configDir, "sbatch_{}.py".format(self.projection_module))
        self.cfgFiles_projection = {}
        self.logFiles_projection = {}
        self.scriptFiles_projection = {}
        self.jobOptions_sbatch = {}

        self.inputFiles = {}
        self.outputFiles_tmp = {}
        self.outputFiles = {}

        self.phoniesToAdd = []
        self.filesToClean = []
        self.targets = []

        self.makefile_target = "sbatch_{}".format(self.projection_module)

        self.dirs = {}
        all_dirs = [
            DKEY_CFGS, DKEY_HISTO_TMP, DKEY_HISTO, DKEY_PLOTS, DKEY_LOGS,
            DKEY_SCRIPTS, DKEY_HADD_RT
        ]
        cfg_dirs = [
            DKEY_CFGS, DKEY_LOGS, DKEY_PLOTS, DKEY_SCRIPTS, DKEY_HADD_RT
        ]

        ref_genWeightsFile = os.path.join(
            os.environ['CMSSW_BASE'], 'src', 'tthAnalysis', 'HiggsToTauTau',
            'data', 'refGenWeight_{}.txt'.format(self.era))
        self.ref_genWeights = load_refGenWeightsFromFile(
            ref_genWeightsFile) if projection_module != 'puHist' else {}

        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue
            process_name = sample_info["process_name_specific"]
            key_dir = getKey(process_name)
            for dir_type in all_dirs:
                if dir_type == DKEY_PLOTS:
                    continue
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in cfg_dirs:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.configDir, dir_type, process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, process_name)
        for dir_type in cfg_dirs:
            initDict(self.dirs, [dir_type])
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type)

        self.cvmfs_error_log = {}
        self.num_jobs = {
            'hadd': 0,
            'project': 0,
            'plot': 0,
        }
Example #30
    def check_job_completion(self,
                             jobsId_list,
                             default_completion=Status.completed):
        completion = {
            k: JobCompletion(status=default_completion)
            for k in jobsId_list
        }

        # If the input list is empty, just return here (we don't want to mess up the subprocess commands here)
        if not completion:
            return completion

        # Set a delimiter, which distinguishes entries b/w different jobs
        delimiter = ','

        # First, let's try with sacct; explanation:
        # 1) sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State
        #      Shows job IDs, exit codes and comments of all submitted, running and finished jobs, one line per job
        #        a) -X -- shows cumulative statistics of each job (has no effect here, though)
        #        b) -P -- output will be '|' delimited without a '|' at the end
        #        c) -n -- omit header
        #        d) -o JobID,ExitCode,DerivedExitCode -- output format
        #        e) -S {datetime} -- look only for jobs submitted after {datetime}
        #        f) -j {jobs} -- filter out only the relevant jobs by their job ID (comma-separated list)
        # 2) sed ':a;N;$!ba;s/\\n/{delimiter}/g'
        #      Place all entries to one line, delimited by {delimiter} (otherwise the logs are hard to read)
        sacct_cmd = "sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State -S {datetime} -j {jobs} | " \
                    "sed ':a;N;$!ba;s/\\n/{delimiter}/g'".format(
          datetime  = self.datetime,
          jobs      = ','.join(jobsId_list),
          delimiter = delimiter,
        )
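        # Illustrative rendering for hypothetical job IDs and start time:
        #   sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State -S 2021-01-01T00:00:00 -j 101,102 | sed ':a;N;$!ba;s/\n/,/g'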
        sacct_out, sacct_err = run_cmd(sacct_cmd,
                                       do_not_log=not self.log_completion,
                                       return_stderr=True)
        if not sacct_err and sacct_out:
            # The output of sacct contains one line per job; each line has pipe-separated fields whose order
            # is defined by the command that produced the output
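            # e.g. a single (hypothetical) record could read: 101|0:0|0:0|COMPLETED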
            lines = sacct_out.split(delimiter)
            for line in lines:
                JobID, ExitCode, DerivedExitCode, State = line.split('|')
                if JobID in completion:
                    completion[JobID] = JobCompletion(
                        status=Status.classify_error(ExitCode, DerivedExitCode,
                                                     State),
                        exit_code=ExitCode,
                        derived_exit_code=DerivedExitCode,
                        state=State,
                    )
            return completion
        else:
            # Likely returned along the lines of (due to heavy load on the cluster since SQL DB is overloaded):
            # sacct: error: Problem talking to the database: Connection refused
            logging.info('sacct currently unavailable: %s' % sacct_err)

        # Let's try with scontrol if the sacct commands failed
        # scontrol doesn't have an option to take a list of Job IDs as an argument; thus, we have to grep the job IDs
        # Explanation:
        # 1) scontrol show -od job
        #      Prints out everything about running or recently finished jobs
        #        a) -o -- prints information one line per record
        #        b) -d -- includes more detailed information about the job
        #        c) job -- prints all jobs (it's possible to get information about other units like nodes and clusters)
        # 2) grep '{jobs}'
        #      Filter out jobs by their job ID (by concatenating the list with escaped regex OR operator '|')
        # 3) sed ':a;N;$!ba;s/\\n/{delimiter}/g'
        #      Put all the result on one line, where each record is delimited by {delimiter}
        scontrol_cmd = "scontrol show -od job | grep '{jobs}' | sed ':a;N;$!ba;s/\\n/{delimiter}/g'".format(
            jobs='\\|'.join(jobsId_list),
            delimiter=delimiter,
        )
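        # Illustrative rendering for hypothetical job IDs 101 and 102:
        #   scontrol show -od job | grep '101\|102' | sed ':a;N;$!ba;s/\n/,/g'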
        scontrol_out, scontrol_err = run_cmd(
            scontrol_cmd,
            do_not_log=not self.log_completion,
            return_stderr=True)
        if not scontrol_err and scontrol_out:
            # The output of scontrol contains one entry per line; each line contains space-delimited key-value pairs,
            # where the keys and values are separated by an equals sign
            # Although the keys do not contain any spaces, the values might, so we have to take care of that
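            # e.g. a single (hypothetical) record could contain entries such as:
            #   JobId=101 JobName=wrapper.sh ... JobState=COMPLETED ... ExitCode=0:0 DerivedExitCode=0:0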
            lines = scontrol_out.split(delimiter)
            for line in lines:
                line_dict = {}
                # use a list comprehension (rather than map) so that the result can be indexed and len()'d
                line_split_eq_spaces = [x.split() for x in line.split('=')]
                for i in range(len(line_split_eq_spaces) - 1):
                    k = line_split_eq_spaces[i]
                    v = line_split_eq_spaces[i + 1]
                    line_dict[k[-1]] = ' '.join(
                        v[:-1] if i != len(line_split_eq_spaces) - 2 else v)
                if 'JobId' not in line_dict:
                    print("Skipping line = '%s'" % line)
                    continue
                JobId = line_dict['JobId']
                if JobId in completion:
                    completion[JobId] = JobCompletion(
                        status=Status.classify_error(
                            line_dict['ExitCode'],
                            line_dict['DerivedExitCode'],
                            line_dict['JobState'],
                        ),
                        exit_code=line_dict['ExitCode'],
                        derived_exit_code=line_dict['DerivedExitCode'],
                        state=line_dict['JobState'])
            return completion
        else:
            # scontrol probably returned something like:
            # slurm_load_jobs error: Invalid job id specified
            # Probably because too much time has passed since the job completion and checking the exit status here
            logging.info('scontrol has errors: %s' % scontrol_err)

        # scontrol still might fail if too much time has passed since the jobs completion (the metadata about each
        # job is cached for a certain period of time, the length of which I don't know at the moment)
        # None of the SLURM commands work; let's just say that the job completed successfully
        logging.error(
            "Cannot tell if the job has completed successfully or not!")
        return completion