Example #1
    def addToMakefile_plot(self, lines_makefile):
        cmd_string = "plot_from_histogram.py -i %s -j %s -o %s -x '# PU interactions' " \
                     "-y '# events' -t '%s' -g"
        cmd_log_string = cmd_string + " -l"

        jobOptions = {}
        for key, cfg in self.outputFiles.items():
            plot_linear = os.path.join(self.dirs[DKEY_PLOTS], '%s.png' % key)
            plot_log = os.path.join(self.dirs[DKEY_PLOTS], '%s_log.png' % key)
            logFile_linear = os.path.join(self.dirs[DKEY_LOGS],
                                          'plot_linear_%s.log' % key)
            logFile_log = os.path.join(self.dirs[DKEY_LOGS],
                                       'plot_log_%s.log' % key)
            logFile_linear, logFile_log = get_log_version(
                (logFile_linear, logFile_log))
            jobOptions[key] = {
                'inputFile': cfg['outputFile'],
                'jobs': {
                    'linear': {
                        'outputFile': plot_linear,
                        'cmd': cmd_string % (cfg['outputFile'], key, plot_linear, key),
                        'logFile': logFile_linear,
                    },
                    'log': {
                        'outputFile': plot_log,
                        'cmd': cmd_log_string % (cfg['outputFile'], key, plot_log, key),
                        'logFile': logFile_log,
                    },
                }
            }
            plot_files = [
                jobOptions[key]['jobs'][plot_type]['outputFile']
                for plot_type in jobOptions[key]['jobs']
            ]
            self.filesToClean.extend(plot_files)
            self.targets.extend(plot_files)

        for cfg in jobOptions.values():
            for plot_cfg in cfg['jobs'].values():
                lines_makefile.extend([
                    "%s: %s" % (plot_cfg['outputFile'], cfg['inputFile']),
                    "\t%s &> %s" % (plot_cfg['cmd'], plot_cfg['logFile']),
                    "",
                ])
                self.num_jobs['plot'] += 1
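The helper get_log_version used above (and throughout the later examples) is not part of this snippet. A minimal sketch of its assumed behaviour, for illustration only, is that it returns a versioned name for every log path that already exists on disk, so earlier logs are never overwritten:

import os

def get_log_version(log_file_names):
    # Hypothetical re-implementation, for illustration only: append an
    # incrementing "_<n>" suffix to each path that already exists so that
    # previous log files are preserved.
    versioned = []
    for log_file_name in log_file_names:
        candidate = log_file_name
        base, ext = os.path.splitext(log_file_name)
        version = 1
        while os.path.isfile(candidate):
            candidate = "%s_%d%s" % (base, version, ext)
            version += 1
        versioned.append(candidate)
    return tuple(versioned)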
Example #2
    def addToMakefile_hadd(self, lines_makefile):
        """Add hadd targets to the Makefile
        """
        for hadd_out, hadd_in in self.hadd_records.iteritems():
            hadd_in_files = hadd_in['output_files']
            hadd_fileset_id = hadd_in['fileset_id']
            process_name = hadd_in['process_name']
            sbatch_hadd_file = os.path.join(
                self.cfgDir, DKEY_HADD, self.channel, process_name,
                "sbatch_hadd_cat_%s_%d.py" % (process_name, hadd_fileset_id))
            sbatch_hadd_shFile = os.path.join(
                self.cfgDir, DKEY_HADD, self.channel, process_name,
                "sbatch_hadd_cat_%s_%d.sh" % (process_name, hadd_fileset_id))
            sbatch_hadd_logFile = os.path.join(
                self.cfgDir, DKEY_HADD, self.channel, process_name,
                "sbatch_hadd_cat_%s_%d.log" % (process_name, hadd_fileset_id))
            sbatch_hadd_dir = os.path.join(
                self.cfgDir,
                DKEY_HADD_RT,
                self.channel,
                process_name,
            )
            sbatch_hadd_logFile = get_log_version((sbatch_hadd_logFile, ))
            tools_createScript_sbatch_hadd(
                sbatch_script_file_name=sbatch_hadd_file,
                input_file_names=hadd_in_files,
                output_file_name=hadd_out,
                script_file_name=sbatch_hadd_shFile,
                log_file_name=sbatch_hadd_logFile[0],
                working_dir=self.workingDir,
                waitForJobs=False,
                auxDirName=sbatch_hadd_dir,
                pool_id=self.pool_id,
                use_home=self.use_home,
            )

            lines_makefile.append("%s: %s" %
                                  (hadd_out, " ".join(hadd_in_files)))
            lines_makefile.append("\t%s %s" % ("rm -f", hadd_out))
            lines_makefile.append("\t%s %s" % ("python", sbatch_hadd_file))
            lines_makefile.append("")
Example #3
    def __init__(
        self,
        configDir,
        outputDir,
        output_file,
        executable,
        projection_module,
        samples,
        max_files_per_job,
        era,
        plot,
        check_output_files,
        running_method,
        num_parallel_jobs,
        pool_id='',
        verbose=False,
        dry_run=False,
        use_home=False,
        submission_cmd=None,
    ):

        self.configDir = configDir
        self.outputDir = outputDir
        self.executable = executable
        self.projection_module = projection_module
        self.max_num_jobs = 200000
        self.samples = samples
        self.max_files_per_job = max_files_per_job
        self.era = era
        self.plot = plot
        self.check_output_files = check_output_files
        self.verbose = verbose
        self.dry_run = dry_run
        self.use_home = use_home
        if running_method.lower() not in ["sbatch", "makefile"]:
            raise ValueError("Invalid running method: %s" % running_method)

        self.running_method = running_method
        self.is_sbatch = self.running_method.lower() == "sbatch"
        self.is_makefile = not self.is_sbatch
        self.makefile = os.path.join(
            self.configDir, "Makefile_{}".format(self.projection_module))
        self.num_parallel_jobs = num_parallel_jobs
        self.pool_id = pool_id if pool_id else uuid.uuid4()

        self.workingDir = os.getcwd()
        logging.info("Working directory is: %s" % self.workingDir)
        self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src',
                                         'tthAnalysis', 'HiggsToTauTau',
                                         'test', 'templates')
        logging.info("Templates directory is: %s" % self.template_dir)

        create_if_not_exists(self.configDir)
        create_if_not_exists(self.outputDir)
        self.output_file = os.path.join(self.outputDir, output_file)
        self.stdout_file_path = os.path.join(
            self.configDir, "stdout_{}.log".format(self.projection_module))
        self.stderr_file_path = os.path.join(
            self.configDir, "stderr_{}.log".format(self.projection_module))
        self.sw_ver_file_cfg = os.path.join(
            self.configDir, "VERSION_{}.log".format(self.projection_module))
        self.sw_ver_file_out = os.path.join(
            self.outputDir, "VERSION_{}.log".format(self.projection_module))
        self.submission_out = os.path.join(self.configDir, "SUBMISSION.log")
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out = get_log_version(
            (self.stdout_file_path, self.stderr_file_path,
             self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out))
        check_submission_cmd(self.submission_out, submission_cmd)

        self.sbatchFile_projection = os.path.join(
            self.configDir, "sbatch_{}.py".format(self.projection_module))
        self.cfgFiles_projection = {}
        self.logFiles_projection = {}
        self.scriptFiles_projection = {}
        self.jobOptions_sbatch = {}

        self.inputFiles = {}
        self.outputFiles_tmp = {}
        self.outputFiles = {}

        self.phoniesToAdd = []
        self.filesToClean = []
        self.targets = []

        self.makefile_target = "sbatch_{}".format(self.projection_module)

        self.dirs = {}
        all_dirs = [
            DKEY_CFGS, DKEY_HISTO_TMP, DKEY_HISTO, DKEY_PLOTS, DKEY_LOGS,
            DKEY_SCRIPTS, DKEY_HADD_RT
        ]
        cfg_dirs = [
            DKEY_CFGS, DKEY_LOGS, DKEY_PLOTS, DKEY_SCRIPTS, DKEY_HADD_RT
        ]

        ref_genWeightsFile = os.path.join(
            os.environ['CMSSW_BASE'], 'src', 'tthAnalysis', 'HiggsToTauTau',
            'data', 'refGenWeight_{}.txt'.format(self.era))
        self.ref_genWeights = load_refGenWeightsFromFile(
            ref_genWeightsFile) if projection_module != 'puHist' else {}

        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue
            process_name = sample_info["process_name_specific"]
            key_dir = getKey(process_name)
            for dir_type in all_dirs:
                if dir_type == DKEY_PLOTS:
                    continue
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in cfg_dirs:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.configDir, dir_type, process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, process_name)
        for dir_type in cfg_dirs:
            initDict(self.dirs, [dir_type])
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type)

        self.cvmfs_error_log = {}
        self.num_jobs = {
            'hadd': 0,
            'project': 0,
            'plot': 0,
        }
Example #4
    def __init__(
        self,
        configDir,
        outputDir,
        cfgFile_prodNtuple,
        samples,
        max_files_per_job,
        era,
        preselection_cuts,
        leptonSelection,
        hadTauWP,
        check_output_files,
        running_method,
        version,
        num_parallel_jobs,
        pileup,
        golden_json,
        dry_run,
        isDebug,
        gen_matching_by_index,
        use_nonnominal,
        use_home,
        skip_tools_step,
        verbose=False,
        pool_id='',
    ):

        self.configDir = configDir
        self.outputDir = outputDir
        self.max_num_jobs = 200000
        self.samples = samples
        self.max_files_per_job = max_files_per_job
        self.era = era
        self.preselection_cuts = preselection_cuts
        self.leptonSelection = leptonSelection
        self.hadTauWP = hadTauWP
        self.check_output_files = check_output_files
        self.verbose = verbose
        self.dry_run = dry_run
        self.isDebug = isDebug
        self.gen_matching_by_index = gen_matching_by_index
        self.use_nonnominal = use_nonnominal
        self.use_home = use_home
        self.pileup = pileup
        self.golden_json = golden_json
        if running_method.lower() not in ["sbatch", "makefile"]:
            raise ValueError("Invalid running method: %s" % running_method)

        if not os.path.isfile(self.pileup):
            raise ValueError('No such file: %s' % self.pileup)
        self.pileup_histograms = get_pileup_histograms(self.pileup)

        if not os.path.isfile(self.golden_json):
            raise ValueError('No such file: %s' % self.golden_json)

        self.running_method = running_method
        self.is_sbatch = self.running_method.lower() == "sbatch"
        self.is_makefile = not self.is_sbatch
        self.makefile = os.path.join(self.configDir, "Makefile_prodNtuple")
        self.num_parallel_jobs = num_parallel_jobs
        self.skip_tools_step = skip_tools_step
        self.pool_id = pool_id if pool_id else uuid.uuid4()

        self.workingDir = os.getcwd()
        logging.info("Working directory is: %s" % self.workingDir)
        self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src',
                                         'tthAnalysis', 'HiggsToTauTau',
                                         'test', 'templates')
        logging.info("Templates directory is: %s" % self.template_dir)

        self.version = version
        self.samples = samples

        create_if_not_exists(self.configDir)
        create_if_not_exists(self.outputDir)
        self.stdout_file_path = os.path.join(self.configDir,
                                             "stdout_prodNtuple.log")
        self.stderr_file_path = os.path.join(self.configDir,
                                             "stderr_prodNtuple.log")
        self.sw_ver_file_cfg = os.path.join(self.configDir,
                                            "VERSION_prodNtuple.log")
        self.sw_ver_file_out = os.path.join(self.outputDir,
                                            "VERSION_prodNtuple.log")
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out = get_log_version(
            (self.stdout_file_path, self.stderr_file_path,
             self.sw_ver_file_cfg, self.sw_ver_file_out))

        self.cfgFile_prodNtuple_original = os.path.join(
            self.template_dir, cfgFile_prodNtuple)
        self.sbatchFile_prodNtuple = os.path.join(self.configDir,
                                                  "sbatch_prodNtuple.py")
        self.cfgFiles_prodNtuple_modified = {}
        self.logFiles_prodNtuple = {}

        self.inputFiles = {}
        self.outputFiles = {}
        self.filesToClean = []
        self.dirs = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            key_dir = getKey(sample_name)
            for dir_type in [DKEY_CFGS, DKEY_NTUPLES, DKEY_LOGS]:
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.configDir, dir_type, process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, process_name)
        for dir_type in [DKEY_CFGS, DKEY_LOGS]:
            initDict(self.dirs, [dir_type])
            if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                self.dirs[dir_type] = os.path.join(self.configDir, dir_type)
            else:
                self.dirs[dir_type] = os.path.join(self.outputDir, dir_type)

        self.cvmfs_error_log = {}
        self.executable = "produceNtuple.sh"
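Both constructors rely on getKey and initDict to build the nested self.dirs mapping; neither helper is shown here. A plausible sketch, assuming getKey simply joins its arguments into a composite key and initDict creates the missing nesting levels, is:

def getKey(*args):
    # Hypothetical helper, for illustration only: build a composite dictionary key.
    return "_".join(str(arg) for arg in args)

def initDict(dictionary, keys):
    # Hypothetical helper, for illustration only: make sure every nesting level
    # exists so that dictionary[keys[0]]...[keys[-1]] can be assigned afterwards.
    level = dictionary
    for key in keys[:-1]:
        level = level.setdefault(key, {})
    level.setdefault(keys[-1], None)

dirs = {}
initDict(dirs, [getKey("ttHToNonbb"), "cfgs"])   # process name and dir type are invented
dirs[getKey("ttHToNonbb")]["cfgs"] = "/cfg/cfgs/ttHToNonbb"
print(dirs)   # {'ttHToNonbb': {'cfgs': '/cfg/cfgs/ttHToNonbb'}}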
Example #5
    def submitJob(
        self,
        inputFiles,
        executable,
        command_line_parameter,
        outputFilePath,
        outputFiles,
        scriptFile,
        logFile=None,
        skipIfOutputFileExists=False,
        job_template_file='sbatch-node.sh.template',
        copy_output_file=True,
        nof_submissions=0,
    ):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """

        logging.debug("<sbatchManager::submitJob>: job_template_file = '%s'" %
                      job_template_file)

        job_template_file = os.path.join(jinja_template_dir, job_template_file)
        with open(job_template_file, 'r') as job_template_fptr:
            job_template = job_template_fptr.read()

        # derive a default log file name if none was given (requires logFileDir to be set)
        if not logFile:
            if not self.logFileDir:
                raise ValueError(
                    "Please call 'setLogFileDir' before calling 'submitJob' !!"
                )
            logFile = os.path.join(
                self.logFileDir,
                os.path.basename(scriptFile).replace(".sh", ".log"))

        # skip only if none of the output files are missing in the file system
        outputFiles_fullpath = [
            os.path.join(outputFilePath, outputFile) for outputFile in outputFiles
        ]
        if skipIfOutputFileExists:
            outputFiles_missing = [
                outputFile for outputFile in outputFiles_fullpath \
                if not is_file_ok(outputFile, validate_outputs = True, min_file_size = self.min_file_size)
            ]
            if not outputFiles_missing:
                logging.debug(
                  "output file(s) = %s exist(s) --> skipping !!" % \
                  '; '.join(map(lambda x: "'%s'" % x, outputFiles_fullpath))
                )
                return

        if not self.workingDir:
            raise ValueError(
                "Please call 'setWorkingDir' before calling 'submitJob' !!")

        if not self.cmssw_base_dir:
            logging.warning("cmssw_base_dir not set, setting it to '%s'" %
                            os.environ.get('CMSSW_BASE'))
            self.cmssw_base_dir = os.environ.get('CMSSW_BASE')

        job_dir = self.get_job_dir()

        # create script for executing jobs
        wrapper_log_file = logFile.replace('.log', '_wrapper.log')
        executable_log_file = logFile.replace('.log', '_executable.log')
        wrapper_log_file, executable_log_file = get_log_version(
            (wrapper_log_file, executable_log_file))

        sbatch_command = "sbatch --partition={partition} --output={output} --comment='{comment}' " \
                         "{max_mem} {args} {cmd}".format(
          partition = self.queue,
          output    = wrapper_log_file,
          comment   = self.pool_id,
          args      = self.sbatchArgs,
          cmd       = scriptFile,
          max_mem   = '--mem={}'.format(self.max_mem) if self.max_mem else '',
        )

        two_pow_sixteen = 65536
        random.seed((abs(hash(command_line_parameter))) % two_pow_sixteen)
        max_delay = 60
        random_delay = random.randint(0, max_delay)

        script = jinja2.Template(job_template).render(
            working_dir=self.workingDir,
            cmssw_base_dir=self.cmssw_base_dir,
            job_dir=job_dir,
            job_template_file=job_template_file,
            exec_name=executable,
            command_line_parameter=command_line_parameter,
            inputFiles=" ".join(inputFiles),
            outputDir=outputFilePath,
            outputFiles=" ".join(outputFiles),
            wrapper_log_file=wrapper_log_file,
            executable_log_file=executable_log_file,
            script_file=scriptFile,
            RUNNING_COMMAND=sbatch_command,
            random_sleep=random_delay,
            copy_output_file=copy_output_file,
        )
        logging.debug("writing sbatch script file = '%s'" % scriptFile)
        with codecs.open(scriptFile, "w", "utf-8") as f:
            f.write(script)
            f.flush()
            os.fsync(f.fileno())

        if self.dry_run:
            return

        nof_submissions += 1
        job = {
            'sbatch_command': sbatch_command,
            'status': Status.in_queue,
            'log_wrap': wrapper_log_file,
            'log_exec': executable_log_file,
            'args': (
                inputFiles,
                executable,
                command_line_parameter,
                outputFilePath,
                outputFiles,
                scriptFile,
                logFile,
                skipIfOutputFileExists,
                job_template_file,
                nof_submissions,
            ),
            'nof_submissions': nof_submissions,
            'outputFiles': outputFiles_fullpath,
        }
        self.queuedJobs.append(job)
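A call to this method might look roughly as follows; the manager instance is assumed to be configured elsewhere (setWorkingDir, setLogFileDir, queue, pool id), and every path below is invented for illustration:

def queue_example_job(manager):
    # 'manager' is an already-configured sbatchManager instance (assumed);
    # all file names and paths are hypothetical.
    manager.submitJob(
        inputFiles=["/store/ntuples/tree_1.root"],
        executable="produceNtuple.sh",
        command_line_parameter="produceNtuple_1_cfg.py",
        outputFilePath="/output/ntuples/ttHToNonbb",
        outputFiles=["tree_1.root"],
        scriptFile="/cfg/scripts/produceNtuple_1.sh",
        logFile="/cfg/logs/produceNtuple_1.log",
        skipIfOutputFileExists=True,
    )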
Example #6
    def create(self):
        """Creates all necessary config files and runs the MEM -- either locally or on the batch system
        """

        for key in self.dirs.keys():
            if type(self.dirs[key]) == dict:
                for dir_type in self.dirs[key].keys():
                    create_if_not_exists(self.dirs[key][dir_type])
            else:
                create_if_not_exists(self.dirs[key])

        # read the file in, sample-by-sample
        # build the dictionary recursively
        # add rle file also to generated cfg files
        # print integrations per job as well!
        # consider more than 1 file per job -- the jobs are split by MEM integration anyway

        rle_filters = self.get_filter() if self.rle_filter_file else {}
        statistics = {}
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue

            if not os.path.exists(sample_info['local_paths'][0]['path']):
                logging.warning("Skipping sample {sample_name}".format(sample_name = sample_name))
                continue

            process_name = sample_info["process_name_specific"]
            logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_addMEM, process_name))
            is_mc = (sample_info["type"] == "mc")
            if self.rle_filter_file:
                assert(process_name in rle_filters)

            inputFileList = generateInputFileList(sample_info, self.max_files_per_job)
            # typically, an analysis step would simply start looping over files here, because its smallest
            # unit of work processes at least one file; here, however, we need to split the files into event
            # ranges in such a way that each job performs up to mem_integrations_per_job MEM integrations

            # so what we are going to do is to open each set of files in inputFileList, read the variable
            # requestMEM_*l_*tau and try to gather the event ranges such that each event range
            # performs up to mem_integrations_per_job integrations per job
            memEvtRangeDict = self.memJobList(inputFileList, rle_filters[process_name] if self.rle_filter_file else [])

            for jobId in memEvtRangeDict.keys():

                key_dir = getKey(sample_name)
                key_file = getKey(sample_name, jobId)

                self.inputFiles[key_file] = memEvtRangeDict[jobId]['input_fileset']

                # there should always be at least one input file per job
                assert(len(self.inputFiles[key_file]) > 0), "No input files for job %s !!" % key_file

                #assert(len(self.inputFiles[key_file]) == 1), "There is more than one input file!"
                self.cfgFiles_addMEM_modified[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i_cfg.py" % (self.channel, process_name, jobId)
                )
                self.shFiles_addMEM_modified[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i.sh" % (self.channel, process_name, jobId)
                )
                self.outputFiles[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_NTUPLES], "%s_%i.root" % (process_name, jobId)
                )
                self.logFiles_addMEM[key_file] = os.path.join(
                    self.dirs[key_dir][DKEY_LOGS], "addMEM_%s_%s_%i.log" % (self.channel, process_name, jobId)
                )
                self.logFiles_addMEM[key_file] = get_log_version((self.logFiles_addMEM[key_file],))[0]
                self.createCfg_addMEM(
                    self.inputFiles[key_file],
                    memEvtRangeDict[jobId]['event_range'][0],
                    memEvtRangeDict[jobId]['event_range'][1],
                    self.outputFiles[key_file],
                    self.era,
                    sample_info["sample_category"],
                    is_mc,
                    self.cfgFiles_addMEM_modified[key_file],
                    memEvtRangeDict[jobId]['whitelist'],
                )

                # associate the output file with the fileset_id
                #UPDATE: ONE OUTPUT FILE PER SAMPLE!
                fileset_id = memEvtRangeDict[jobId]['fileset_id']
                hadd_output_dir = os.path.join(
                    self.dirs[key_dir][DKEY_FINAL_NTUPLES],
                    '%04d' % (fileset_id // 1000)
                )
                if not os.path.exists(hadd_output_dir):
                    os.makedirs(hadd_output_dir)
                hadd_output = os.path.join(
                    hadd_output_dir, '%s_%i.root' % ('tree', fileset_id) # UPDATE: ADDED
                    #hadd_output_dir, "tree.root" # UPDATE: REMOVED
                )
                if hadd_output not in self.hadd_records:
                    self.hadd_records[hadd_output] = {}
                    self.hadd_records[hadd_output]['output_files'] = []
                self.hadd_records[hadd_output]['fileset_id'] = fileset_id
                self.hadd_records[hadd_output]['output_files'].append(self.outputFiles[key_file])
                self.hadd_records[hadd_output]['process_name'] = process_name
                #self.filesToClean.append(self.outputFiles[key_file])

            # let's sum the number of integrations per sample
            nofEntriesMap = {}
            for v in memEvtRangeDict.values():
                if v['fileset_id'] not in nofEntriesMap:
                    nofEntriesMap[v['fileset_id']] = {
                        'nof_entries' : v['nof_entries'],
                    }
            statistics[process_name] = {
                'nof_int'         : sum([entry['nof_int']         for entry in memEvtRangeDict.values()]),
                'nof_entries'     : sum([entry['nof_entries']     for entry in nofEntriesMap.values()]),
                'nof_events_pass' : sum([entry['nof_events_pass'] for entry in memEvtRangeDict.values()]),
                'nof_int_pass'    : sum([entry['nof_int_pass']    for entry in memEvtRangeDict.values()]),
                'nof_zero'        : sum([entry['nof_zero']        for entry in memEvtRangeDict.values()]),
                'nof_jobs'        : len(memEvtRangeDict),
            }

        if self.is_sbatch:
            logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addMEM)
            self.createScript_sbatch()

        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_addMEM(lines_makefile)
        self.addToMakefile_hadd(lines_makefile)
        self.createMakefile(lines_makefile)

        ws_len = max([len(kk) + 1 for kk in statistics.keys()])
        total_nof_integrations_sum = sum(x['nof_int']            for x in statistics.values())
        total_nof_entries          = sum(x['nof_entries']        for x in statistics.values())
        total_nof_zero_int         = sum(x['nof_zero']           for x in statistics.values())
        total_nof_jobs             = sum(x['nof_jobs']           for x in statistics.values())
        total_nof_pass             = sum(x['nof_events_pass']    for x in statistics.values())
        total_nof_int_pass_avg     = float(sum(x['nof_int_pass'] for x in statistics.values())) / total_nof_pass
        total_nof_integrations_avg = float(total_nof_integrations_sum) / total_nof_entries
        total_nof_int_per_job = float(total_nof_integrations_sum) / total_nof_jobs
        for k, v in statistics.iteritems():
            if v['nof_entries'] == 0:
                int_per_event = 0.
                evt_pass = 0.
            else:
                int_per_event = float(v['nof_int']) / v['nof_entries']
                evt_pass = (100 * float(v['nof_events_pass']) / v['nof_entries'])
            if v['nof_events_pass'] == 0:
                nof_int_pass = 0.
            else:
                nof_int_pass = float(v['nof_int_pass']) / v['nof_events_pass']
            print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d (%.2f%%) evt pass; %.2f int/evt pass; %d evt 0int)' %
              (k,
               ' ' * (ws_len - len(k)),
               v['nof_int'],
               v['nof_entries'],
               v['nof_jobs'],
               int_per_event,
               v['nof_events_pass'],
               evt_pass,
               nof_int_pass,
               v['nof_zero'],
              )
            )
        print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d evt pass; %.2f int/evt pass; '
              '%.2f int/job pass; %d evt 0int)' %
          ('total',
           ' ' * (ws_len - len('total')),
           total_nof_integrations_sum,
           total_nof_entries,
           total_nof_jobs,
           total_nof_integrations_avg,
           total_nof_pass,
           total_nof_int_pass_avg,
           total_nof_int_per_job,
           total_nof_zero_int,
          )
        )

        if self.max_mem_integrations > 0 and total_nof_integrations_sum > self.max_mem_integrations:
            logging.error("Will not start the jobs (max nof integrations exceeded)!")
            return False
        else:
            logging.info("Done")
            return True
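memJobList is not part of this excerpt; judging only from how its return value is consumed above, each job id presumably maps to a dictionary of roughly this shape (all values invented):

# Illustration only: the per-job dictionary layout that create() appears to
# expect from memJobList(), with made-up numbers.
memEvtRangeDict = {
    1: {
        'input_fileset'  : ["/store/ntuples/tree_1.root"],  # input files of the job
        'event_range'    : (0, 5000),                        # first and last event to process
        'whitelist'      : [],                               # optional explicit event selection
        'fileset_id'     : 0,                                # groups jobs into one hadd output
        'nof_entries'    : 5000,
        'nof_int'        : 120000,                           # MEM integrations in this range
        'nof_int_pass'   : 90000,
        'nof_events_pass': 1500,
        'nof_zero'       : 10,
    },
}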
Example #7
    def __init__(self,
            treeName,
            outputDir,
            cfgDir,
            executable_addMEM,
            samples,
            era,
            check_output_files,
            running_method,
            max_files_per_job,
            mem_integrations_per_job,
            max_mem_integrations,
            num_parallel_jobs,
            leptonSelection,
            hadTauSelection,
            integration_choice,
            jet_cleaning_by_index,
            dry_run,
            use_nonnominal,
            use_home,
            channel,
            rle_filter_file = '',
            submission_cmd = None,
            pool_id = '',
            max_jobs_per_sample = -1,
          ):

        self.treeName = treeName
        self.outputDir = outputDir
        self.cfgDir = cfgDir
        self.executable_addMEM = executable_addMEM
        self.mem_integrations_per_job = mem_integrations_per_job
        self.max_files_per_job = max_files_per_job
        self.max_mem_integrations = max_mem_integrations
        self.max_jobs_per_sample = max_jobs_per_sample
        self.samples = samples
        self.era = era
        self.check_output_files = check_output_files
        self.channel = channel
        self.rle_filter_file = rle_filter_file
        self.leptonSelection = leptonSelection
        self.hadTauSelection = hadTauSelection
        if self.hadTauSelection:
            self.hadTauDefinition = self.hadTauSelection.split('|')[0]
            self.hadTauWorkingPoint = self.hadTauSelection.split('|')[1]
        else:
            self.hadTauDefinition = None
            self.hadTauWorkingPoint = None
        self.maxPermutations_branchName = None
        self.integration_choice = integration_choice
        self.jet_cleaning_by_index = jet_cleaning_by_index
        logging.info(
            "Number of integration points: %s" % self.integration_choice
        )
        if running_method.lower() not in ["sbatch", "makefile"]:
            raise ValueError("Invalid running method: %s" % running_method)
        self.running_method = running_method
        self.is_sbatch = False
        self.is_makefile = False
        if self.running_method.lower() == "sbatch":
            self.is_sbatch = True
        else:
            self.is_makefile = True
        self.makefile = os.path.join(
          self.cfgDir, "Makefile_%s" % self.channel)
        self.num_parallel_jobs = num_parallel_jobs
        self.dry_run = dry_run
        self.use_nonnominal = use_nonnominal
        self.use_home = use_home
        self.pool_id = pool_id if pool_id else uuid.uuid4()

        self.workingDir = os.getcwd()
        logging.info("Working directory is: {workingDir}".format(workingDir = self.workingDir))

        for dirPath in [self.outputDir, self.cfgDir]:
          create_if_not_exists(dirPath)

        self.stdout_file_path = os.path.join(self.cfgDir, "stdout_%s.log" % self.channel)
        self.stderr_file_path = os.path.join(self.cfgDir, "stderr_%s.log" % self.channel)
        self.sw_ver_file_cfg  = os.path.join(self.cfgDir, "VERSION_%s.log" % self.channel)
        self.sw_ver_file_out  = os.path.join(self.outputDir, "VERSION_%s.log" % self.channel)
        self.submission_out   = os.path.join(self.cfgDir, "SUBMISSION_%s.log" % self.channel)
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out = get_log_version((
            self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out
        ))
        check_submission_cmd(self.submission_out, submission_cmd)

        self.dirs = {}
        self.samples = samples
        self.cfgFiles_addMEM_modified = {}
        self.shFiles_addMEM_modified = {}
        self.logFiles_addMEM = {}
        self.sbatchFile_addMEM = os.path.join(self.cfgDir, "sbatch_addMEM_%s.py" % self.channel)
        self.inputFiles = {}
        self.outputFiles = {}
        self.hadd_records = {}
        self.filesToClean = []

        del self.samples['sum_events']
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            key_dir = getKey(sample_name)
            for dir_type in [DKEY_NTUPLES, DKEY_FINAL_NTUPLES]:
                initDict(self.dirs, [key_dir, dir_type])
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name)
            for dir_type in [DKEY_CFGS, DKEY_LOGS, DKEY_HADD, DKEY_HADD_RT]:
                initDict(self.dirs, [key_dir, dir_type])
                self.dirs[key_dir][dir_type] = os.path.join(self.cfgDir, dir_type, self.channel, process_name)

        self.cvmfs_error_log = {}
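The hadTauSelection argument is split on '|' into a definition and a working point; a hypothetical value illustrating the expected format:

hadTauSelection = "dR03mvaLoose|Loose"   # invented value; format is "<definition>|<working point>"
hadTauDefinition, hadTauWorkingPoint = hadTauSelection.split('|')
print("%s / %s" % (hadTauDefinition, hadTauWorkingPoint))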
Example #8
    def __init__(
        self,
        configDir,
        localDir,
        outputDir,
        output_file,
        executable,
        samples,
        max_files_per_job,
        era,
        binning,
        use_gen_weight,
        check_output_files,
        running_method,
        num_parallel_jobs,
        pool_id='',
        verbose=False,
        dry_run=False,
        use_home=False,
        keep_logs=False,
        submission_cmd=None,
    ):

        self.configDir = configDir
        self.localDir = localDir
        self.outputDir = outputDir
        self.executable = executable
        self.max_num_jobs = 200000
        self.samples = samples
        self.max_files_per_job = max_files_per_job
        self.era = era
        self.binning = binning
        self.use_gen_weight = use_gen_weight
        self.check_output_files = check_output_files
        self.verbose = verbose
        self.dry_run = dry_run
        self.use_home = use_home
        self.keep_logs = keep_logs
        if running_method.lower() not in ["sbatch", "makefile"]:
            raise ValueError("Invalid running method: %s" % running_method)

        self.running_method = running_method
        self.is_sbatch = self.running_method.lower() == "sbatch"
        self.is_makefile = not self.is_sbatch
        self.makefile = os.path.join(self.localDir, "Makefile_nonResDenom")
        self.num_parallel_jobs = num_parallel_jobs
        self.pool_id = pool_id if pool_id else uuid.uuid4()

        self.workingDir = os.getcwd()
        logging.info("Working directory is: %s" % self.workingDir)
        self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src',
                                         'tthAnalysis', 'HiggsToTauTau',
                                         'test', 'templates')
        logging.info("Templates directory is: %s" % self.template_dir)

        create_if_not_exists(self.configDir)
        create_if_not_exists(self.localDir)
        create_if_not_exists(self.outputDir)
        self.output_file = os.path.join(self.outputDir, output_file)
        self.stdout_file_path = os.path.join(self.localDir,
                                             "stdout_nonResDenom.log")
        self.stderr_file_path = os.path.join(self.localDir,
                                             "stderr_nonResDenom.log")
        self.sw_ver_file_cfg = os.path.join(self.localDir,
                                            "VERSION_nonResDenom.log")
        self.sw_ver_file_out = os.path.join(self.outputDir,
                                            "VERSION_nonResDenom.log")
        self.submission_out = os.path.join(self.localDir,
                                           "SUBMISSION_nonResDenom.log")
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out = get_log_version(
            (self.stdout_file_path, self.stderr_file_path,
             self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out))
        check_submission_cmd(self.submission_out, submission_cmd)

        self.sbatchFile_nonResDenom = os.path.join(self.localDir,
                                                   "sbatch_nonResDenom.py")
        self.cfgFiles_nonResDenom = {}
        self.logFiles_nonResDenom = {}
        self.scriptFiles_nonResDenom = {}
        self.jobOptions_sbatch = {}

        self.inputFiles = {}
        self.outputFiles_tmp = {}
        self.outputFiles = {}

        self.phoniesToAdd = []
        self.filesToClean = [self.configDir]
        self.targets = []

        self.dirs = {}
        all_dirs = [
            DKEY_CFGS, DKEY_HISTO_TMP, DKEY_HISTO, DKEY_PLOTS, DKEY_LOGS,
            DKEY_SCRIPTS, DKEY_HADD_RT
        ]
        cfg_dirs = [
            DKEY_CFGS, DKEY_LOGS, DKEY_PLOTS, DKEY_SCRIPTS, DKEY_HADD_RT
        ]

        self.gen_weights = {}
        if self.use_gen_weight:
            ref_genweights = os.path.join(os.environ['CMSSW_BASE'], 'src',
                                          'tthAnalysis', 'HiggsToTauTau',
                                          'data',
                                          'refGenWeight_{}.txt'.format(era))
            with open(ref_genweights, 'r') as f:
                for line in f:
                    line_split = line.strip().split()
                    assert (len(line_split) == 2)
                    sample_name = line_split[0]
                    ref_genweight = float(line_split[1])
                    assert (sample_name not in self.gen_weights)
                    self.gen_weights[sample_name] = ref_genweight

        for sample_name, sample_info in self.samples.items():
            if not sample_info['use_it']:
                continue
            process_name = sample_info["process_name_specific"]
            if self.use_gen_weight:
                assert (re.sub('_duplicate$', '', process_name)
                        in self.gen_weights)
            key_dir = getKey(process_name)
            for dir_type in all_dirs:
                if dir_type == DKEY_PLOTS:
                    continue
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in cfg_dirs:
                    dir_choice = self.configDir if dir_type == DKEY_CFGS else self.localDir
                    self.dirs[key_dir][dir_type] = os.path.join(
                        dir_choice, dir_type, process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(
                        self.outputDir, dir_type, process_name)
        for dir_type in cfg_dirs:
            initDict(self.dirs, [dir_type])
            dir_choice = self.configDir if dir_type == DKEY_CFGS else self.localDir
            self.dirs[dir_type] = os.path.join(dir_choice, dir_type)
            if dir_choice != self.configDir:
                self.filesToClean.append(self.dirs[dir_type])

        self.cvmfs_error_log = {}
        self.num_jobs = {
            'hadd': 0,
            'nonResDenom': 0,
            'plot': 0,
        }
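The refGenWeight_{era}.txt file parsed above is expected to hold one sample name and one reference weight per line; a made-up excerpt matching what the parser accepts:

# Illustration only: parse two invented lines in the whitespace-separated
# "<sample name> <reference weight>" format the loop above expects.
ref_genweights_txt = """\
ttHToNonbb_M125    0.3827
TTZToLLNuNu_M-10   0.5012
"""
gen_weights = {}
for line in ref_genweights_txt.splitlines():
    if not line.strip():
        continue
    line_split = line.strip().split()
    assert len(line_split) == 2
    gen_weights[line_split[0]] = float(line_split[1])
print(gen_weights)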
Example #9
    def __init__(
        self,
        config_dir,
        output_dir,
        output_filename,
        version,
        era,
        channels,
        dry_run,
        check_output_files,
        running_method,
        isDebug,
        rle_select,
        with_mem,
        use_nonnominal,
        hlt_filter,
        tau_id_wp,
        tau_id,
        use_home,
        systematics_label,
        use_preselected,
        jet_cleaning,
        gen_matching,
        regroup_jerc=False,
        project_dir=os.path.join(os.getenv('CMSSW_BASE'), 'src', 'tthAnalysis',
                                 'HiggsToTauTau'),
        file_pattern='tthAnalyzeRun_%s.py',
        suffix='',
        submission_cmd=None,
        mode=None,
    ):

        self.running_method = running_method
        self.dry_run = dry_run
        self.check_output_files = check_output_files
        self.use_home = use_home
        executable_pattern = os.path.join(project_dir, 'test', file_pattern)

        self.config_dir = config_dir
        self.hadd_script_dir_path = os.path.join(self.config_dir, DKEY_SCRIPTS,
                                                 DKEY_SYNC)
        self.hadd_log_dir_path = os.path.join(
            self.config_dir,
            DKEY_LOGS,
            DKEY_SYNC,
        )
        self.hadd_script_path = os.path.join(self.hadd_script_dir_path,
                                             'hadd_sync.py')
        self.hadd_log_wrapper_path = os.path.join(self.hadd_log_dir_path,
                                                  'hadd_sync_wrapper.log')
        self.hadd_log_executable_path = os.path.join(
            self.hadd_log_dir_path, 'hadd_sync_executable.log')

        self.output_dir = output_dir
        final_output_dir = os.path.join(self.output_dir, DKEY_SYNC)
        self.final_output_file = os.path.join(final_output_dir,
                                              output_filename)

        create_if_not_exists(self.config_dir)
        create_if_not_exists(self.output_dir)

        submission_out = os.path.join(self.config_dir, "SUBMISSION_sync.log")
        submission_out, = get_log_version((submission_out, ))
        check_submission_cmd(submission_out, submission_cmd)

        systematic_labels = ' '.join(systematics_label)
        common_args = "-v %s -e %s -s %s -y %s " % (
            version, era, systematic_labels, use_home)
        if jet_cleaning:
            common_args += " -q %s " % jet_cleaning
        if gen_matching:
            common_args += " -g %s " % gen_matching
        additional_args = " -E"
        if self.dry_run:
            additional_args += " -d"
        if check_output_files:
            additional_args += " -C"
        if isDebug:
            additional_args += " -D"
        if rle_select:
            additional_args += " -S '%s'" % rle_select
        if use_nonnominal:
            additional_args += " -O"
        if hlt_filter:
            additional_args += " -H"
        if tau_id:
            additional_args += " -t %s" % tau_id
        if tau_id_wp:
            additional_args += " -w %s" % tau_id_wp
        if self.running_method:
            additional_args += " -R %s" % self.running_method
        if regroup_jerc:
            additional_args += " -G"

        mem_channels = ['2lss_1tau', '3l', 'hh_bb2l']
        cr_channels = ['3l', '4l']

        inclusive_args = '-v %s -e %s' % (version, era)
        if systematic_labels != 'internal':
            inclusive_args += ' -s %s' % systematic_labels

        inclusive_args += additional_args
        common_args += additional_args

        channels_extended = collections.OrderedDict()
        for channel in channels:
            channels_extended[channel] = ''
            if channel in cr_channels:
                channels_extended[channel + 'ctrl'] = ' -c'

        self.channels_to_validate = []
        self.channel_info = {}
        for channel in channels_extended:
            if channel not in ['ttWctrl', 'ttZctrl', 'WZctrl', 'ZZctrl'] \
                    and 'inclusive' not in channel:
                self.channels_to_validate.append(channel)

            input_file = os.path.join(final_output_dir, '%s.root' % channel)
            executable_channel = channel
            if channel.replace('ctrl', '') in cr_channels:
                executable_channel = channel.replace('ctrl', '')
            channel_script = executable_pattern % executable_channel

            channel_makefile = os.path.join(self.config_dir,
                                            'Makefile_%s' % channel)
            channel_outlog = os.path.join(self.config_dir,
                                          'stdout_sync_%s.log' % channel)
            channel_errlog = os.path.join(self.config_dir,
                                          'stderr_sync_%s.log' % channel)
            channel_outlog_create = os.path.join(
                self.config_dir, 'stdout_sync_create_%s.log' % channel)
            channel_errlog_create = os.path.join(
                self.config_dir, 'stderr_sync_create_%s.log' % channel)
            channel_outlog, channel_errlog, channel_outlog_create, channel_errlog_create = get_log_version(
                (
                    channel_outlog,
                    channel_errlog,
                    channel_outlog_create,
                    channel_errlog_create,
                ))

            cmd_args = common_args if 'inclusive' not in channel else inclusive_args
            if 'inclusive' not in channel:
                cmd_args += " -p %s" % use_preselected

            mode_str = ''
            if mode:
                mode_str = '{}_sync'.format(mode)
            elif 'inclusive' not in channel:
                mode_str = 'sync'
            if mode_str and with_mem and channel in mem_channels:
                mode_str = '{}_wMEM'.format(mode_str)
            if mode_str:
                cmd_args += ' -m %s' % mode_str
            cmd_args += channels_extended[channel]

            channel_cmd_create = '%s %s 2>%s 1>%s' % \
                                 (channel_script, cmd_args, channel_errlog_create, channel_outlog_create)
            channel_cmd_run = '$(MAKE) -j 5 -f %s all 2>%s 1>%s' % (
                channel_makefile, channel_errlog, channel_outlog)
            channel_cmd_clean = '$(MAKE) -f %s clean' % channel_makefile
            if self.running_method.lower() == "makefile":
                channel_cmd_run = "\n\t".join([
                    "mkdir -p {}".format(channel),
                    channel_cmd_run.replace('$(MAKE)',
                                            '$(MAKE) -C {}'.format(channel)),
                    "rm -r {}".format(channel),
                ])
            self.channel_info[input_file] = {
                'create': channel_cmd_create,
                'run': channel_cmd_run,
                'clean': channel_cmd_clean,
            }

        self.stdout_file_path = os.path.join(self.config_dir,
                                             "stdout_sync.log")
        self.stderr_file_path = os.path.join(self.config_dir,
                                             "stderr_sync.log")
        self.sw_ver_file_cfg = os.path.join(self.config_dir,
                                            "VERSION_sync.log")
        self.sw_ver_file_out = os.path.join(self.output_dir,
                                            "VERSION_sync.log")
        self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out = get_log_version(
            (self.stdout_file_path, self.stderr_file_path,
             self.sw_ver_file_cfg, self.sw_ver_file_out))
        self.makefile_path = os.path.join(self.config_dir, 'Makefile_sync')
        if suffix:
            self.makefile_path += "_{}".format(suffix)
Example #10
  def __init__(self,
        config_dir,
        output_dir,
        output_filename,
        version,
        era,
        channels,
        dry_run,
        check_output_files,
        running_method,
        isDebug,
        rle_select,
        with_mem,
        use_nonnominal,
        hlt_filter,
        tau_id_wp,
        tau_id,
        use_home,
        systematics_label,
        use_preselected,
        jet_cleaning,
        gen_matching,
        project_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src', 'tthAnalysis', 'HiggsToTauTau'),
        file_pattern = 'tthAnalyzeRun_%s.py',
        suffix = '',
      ):

    self.running_method     = running_method
    self.dry_run            = dry_run
    self.check_output_files = check_output_files
    self.use_home           = use_home
    executable_pattern = os.path.join(project_dir, 'test', file_pattern)

    self.hadd_script_dir_path = os.path.join(config_dir, DKEY_SCRIPTS, DKEY_SYNC)
    self.hadd_log_dir_path    = os.path.join(config_dir, DKEY_LOGS,    DKEY_SYNC,)
    self.hadd_script_path         = os.path.join(self.hadd_script_dir_path, 'hadd_sync.py')
    self.hadd_log_wrapper_path    = os.path.join(self.hadd_log_dir_path,    'hadd_sync_wrapper.log')
    self.hadd_log_executable_path = os.path.join(self.hadd_log_dir_path,    'hadd_sync_executable.log')

    final_output_dir = os.path.join(output_dir, DKEY_SYNC)
    self.final_output_file = os.path.join(final_output_dir, output_filename)

    common_args = "-m %s -v %s -e %s -s %s -y %s " % \
      ('sync_wMEM' if with_mem else 'sync',  version, era, ' '.join(systematics_label), use_home)
    if jet_cleaning:
      common_args += " -q %s " % jet_cleaning
    if gen_matching:
      common_args += " -g %s " % gen_matching
    additional_args = " -E"
    if self.dry_run:
      additional_args += " -d"
    if check_output_files:
      additional_args += " -C"
    if isDebug:
      additional_args += " -D"
    if rle_select:
      additional_args += " -S '%s'" % rle_select
    if use_nonnominal:
      additional_args += " -O"
    if hlt_filter:
      additional_args += " -H"
    if tau_id:
      additional_args += " -t %s" % tau_id
    if tau_id_wp:
      additional_args += " -w %s" % tau_id_wp
    if self.running_method:
      additional_args += " -R %s" % self.running_method

    cr_channels = { channel : False for channel in [ '3l', '4l' ] }

    inclusive_args = '-v %s -e %s' % (version, era)

    inclusive_args += additional_args
    common_args    += additional_args

    create_if_not_exists(config_dir)
    create_if_not_exists(output_dir)

    channels_extended = collections.OrderedDict()
    cr_channels = [ '3l', '4l' ]
    for channel in channels:
      channels_extended[channel] = ''
      if channel in cr_channels:
        channels_extended[channel + 'ctrl'] = ' -c'

    self.channel_info = {}
    for channel in channels_extended:
      input_file = os.path.join(final_output_dir, '%s.root' % channel)
      executable_channel = channel
      if channel.replace('ctrl', '') in cr_channels:
        executable_channel = channel.replace('ctrl', '')
      channel_script = executable_pattern % executable_channel

      channel_makefile = os.path.join(config_dir, 'Makefile_%s' % channel)
      channel_outlog   = os.path.join(config_dir, 'stdout_sync_%s.log' % channel)
      channel_errlog   = os.path.join(config_dir, 'stderr_sync_%s.log' % channel)
      channel_outlog_create = os.path.join(config_dir, 'stdout_sync_create_%s.log' % channel)
      channel_errlog_create = os.path.join(config_dir, 'stderr_sync_create_%s.log' % channel)
      channel_outlog, channel_errlog, channel_outlog_create, channel_errlog_create = get_log_version((
        channel_outlog, channel_errlog, channel_outlog_create, channel_errlog_create
      ))

      cmd_args = common_args if 'inclusive' not in channel else inclusive_args
      if 'inclusive' not in channel:
        cmd_args += " -p %s" % use_preselected
      cmd_args += channels_extended[channel]

      channel_cmd_create = '%s %s 2>%s 1>%s' % \
                           (channel_script, cmd_args, channel_errlog_create, channel_outlog_create)
      channel_cmd_run   = '$(MAKE) -j 5 -f %s all 2>%s 1>%s' % (channel_makefile, channel_errlog, channel_outlog)
      channel_cmd_clean = '$(MAKE)      -f %s clean' % channel_makefile
      self.channel_info[input_file] = {
        'create' : channel_cmd_create,
        'run'    : channel_cmd_run,
        'clean'  : channel_cmd_clean,
      }

    self.stdout_file_path = os.path.join(config_dir, "stdout_sync.log")
    self.stderr_file_path = os.path.join(config_dir, "stderr_sync.log")
    self.sw_ver_file_cfg  = os.path.join(config_dir, "VERSION_sync.log")
    self.sw_ver_file_out  = os.path.join(output_dir, "VERSION_sync.log")
    self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out = get_log_version((
      self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out
    ))
    self.makefile_path = os.path.join(config_dir, 'Makefile_sync')
    if suffix:
      self.makefile_path += "_{}".format(suffix)