def load_lists():
    files = [
        bucoffea_path(f"data/datasets/datasets_2017.txt"),
        bucoffea_path(f"data/datasets/datasets_2018.txt")
    ]
    lines = []
    for fpath in files:
        with open(fpath,"r") as f:
            lines.extend(f.readlines())

    # Keep only NANOAOD dataset entries and drop commented-out lines; build a
    # list (rather than a lazy filter object) so the result can be iterated
    # more than once.
    lines = [line for line in lines if "NANOAOD" in line and not line.startswith("#")]
    return lines
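A minimal usage sketch for load_lists, assuming the bundled dataset list files exist in the installed package; the stripping and printing are illustrative only.

for line in load_lists():
    dataset_name = line.strip()  # readlines() keeps the trailing newline
    print(dataset_name)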
Example #2
    def _configure(self, df):
        dataset = df['dataset']
        self._year = extract_year(dataset)

        # Reload config based on year
        cfg.DYNACONF_WORKS = "merge_configs"
        cfg.MERGE_ENABLED_FOR_DYNACONF = True
        cfg.SETTINGS_FILE_FOR_DYNACONF = bucoffea_path("config/monojet.yaml")
        cfg.ENV_FOR_DYNACONF = f"era{self._year}"
        cfg.reload()
Example #3
    def _configure(self, df=None):
        cfg.DYNACONF_WORKS = "merge_configs"
        cfg.MERGE_ENABLED_FOR_DYNACONF = True
        cfg.SETTINGS_FILE_FOR_DYNACONF = bucoffea_path("config/vbfhinv.yaml")

        # Reload config based on year
        if df is not None:
            dataset = df['dataset']
            self._year = extract_year(dataset)
            cfg.ENV_FOR_DYNACONF = f"era{self._year}"
        else:
            cfg.ENV_FOR_DYNACONF = f"default"
        cfg.reload()
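Both _configure variants call extract_year to infer the data-taking year from the dataset name and then reload the dynaconf settings for the matching era section. The helper below is a hypothetical stand-in shown only to illustrate the idea; it is not the actual bucoffea implementation.

import re

def extract_year_sketch(dataset):
    # Hypothetical: look for a 2016/2017/2018 tag anywhere in the dataset name.
    match = re.search(r"(2016|2017|2018)", dataset)
    if match:
        return int(match.group(1))
    raise RuntimeError(f"Could not determine year for dataset: {dataset}")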
Example #4
def files_from_ac(regex):
    """Generate file list per dataset from T2_DE_RWTH

    :param regex: Regular expression to match datasets
    :type regex: string
    :return: Mapping of dataset : [files]
    :rtype: dict
    """
    path = bucoffea_path('data/datasets/crabfiles.yml')

    with open(path, 'r') as stream:
        try:
            fileset = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Re-raise after reporting: without a parsed fileset there is
            # nothing meaningful to return below.
            print(exc)
            raise

    for dataset, files in fileset.items():
        if not re.match(regex, dataset):
            continue
        # Drop empty file entries for the matched datasets.
        fileset[dataset] = [ifile for ifile in files if len(ifile)]
    return fileset
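A short usage sketch for files_from_ac, assuming data/datasets/crabfiles.yml ships with the installed package; the regex is only an example pattern.

fileset = files_from_ac("SinglePhoton.*2017")
for dataset, files in fileset.items():
    print(dataset, len(files))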
Example #5
    def process(self, df):
        self._configure(df)
        output = self.accumulator.identity()
        dataset = df['dataset']

        # Lumi mask
        year = extract_year(dataset)
        if is_data(dataset):
            if year == 2016:
                json = bucoffea_path(
                    'data/json/Cert_271036-284044_13TeV_ReReco_07Aug2017_Collisions16_JSON.txt'
                )
            elif year == 2017:
                json = bucoffea_path(
                    'data/json/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON_v1.txt'
                )
            elif year == 2018:
                json = bucoffea_path(
                    'data/json/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt'
                )
            else:
                raise RuntimeError(f"No luminosity JSON defined for year: {year}")
            lumi_mask = LumiMask(json)(df['run'], df['luminosityBlock'])
        else:
            lumi_mask = np.ones(df.size) == 1

        # MET filters
        if is_data(dataset):
            filt_met = mask_and(df, cfg.FILTERS.DATA)
        else:
            filt_met = mask_and(df, cfg.FILTERS.MC)

        if year == 2016:
            trigger = 'HLT_Photon175'
        else:
            trigger = 'HLT_Photon200'

        photons = setup_photons(df)

        ak4 = setup_jets(df)
        ak4 = ak4[
            object_overlap(ak4, photons)
            & ak4.tightId
            & (ak4.pt > 100)
            & (ak4.abseta < 2.4)
        ]

        event_mask = filt_met \
                     & lumi_mask \
                     & (ak4.counts > 0) \
                     & df[trigger] \
                     & (df['MET_pt'] < 60)

        # Generator weight
        weights = processor.Weights(size=df.size, storeIndividual=True)

        if is_data(dataset):
            weights.add('gen', np.ones(df.size))
        else:
            weights.add('gen', df['Generator_weight'])

        photon_kinematics = (photons.pt > 200) & (photons.barrel)

        # Medium
        vals = photons[photon_kinematics & photons.mediumId].sieie[event_mask]
        pt = photons[photon_kinematics & photons.mediumId].pt[event_mask]
        output['sieie'].fill(dataset=dataset,
                             cat='medium',
                             sieie=vals.flatten(),
                             pt=pt.flatten(),
                             weights=weight_shape(
                                 vals,
                                 weights.weight()[event_mask]))

        # No Sieie
        vals = photons[photon_kinematics
                       & medium_id_no_sieie(photons)].sieie[event_mask]
        pt = photons[photon_kinematics
                     & medium_id_no_sieie(photons)].pt[event_mask]
        output['sieie'].fill(dataset=dataset,
                             cat='medium_nosieie',
                             sieie=vals.flatten(),
                             pt=pt.flatten(),
                             weights=weight_shape(
                                 vals,
                                 weights.weight()[event_mask]))

        # No Sieie, inverted isolation
        vals = photons[photon_kinematics
                       & medium_id_no_sieie_inv_iso(photons)].sieie[event_mask]
        pt = photons[photon_kinematics
                     & medium_id_no_sieie_inv_iso(photons)].pt[event_mask]
        output['sieie'].fill(dataset=dataset,
                             cat='medium_nosieie_invertiso',
                             sieie=vals.flatten(),
                             pt=pt.flatten(),
                             weights=weight_shape(
                                 vals,
                                 weights.weight()[event_mask]))

        # Keep track of weight sum
        if not is_data(dataset):
            output['sumw'][dataset] += df['genEventSumw']
            output['sumw2'][dataset] += df['genEventSumw2']
        return output
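The process method above follows coffea's classic processor interface. Below is a sketch of driving such a processor locally; PhotonPurityProcessor is a placeholder name for the class these methods belong to, the fileset is illustrative, and the executor options assume the coffea 0.6-era API that this column-style (df['...']) code targets.

from coffea import processor

# Placeholder fileset and class name: neither is defined in the snippets above.
fileset = {
    "SinglePhoton_2017": ["file1.root", "file2.root"],
}
output = processor.run_uproot_job(
    fileset,
    treename="Events",
    processor_instance=PhotonPurityProcessor(),  # hypothetical class name
    executor=processor.futures_executor,
    executor_args={"workers": 4, "flatten": True},
    chunksize=500000,
)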
Example #6
def do_submit(args):
    """Submit the analysis to HTCondor."""
    import htcondor

    if args.datasrc == 'das':
        dataset_files = files_from_das(regex=args.dataset)
    elif args.datasrc == 'ac':
        dataset_files = files_from_ac(regex=args.dataset)
    else:
        dataset_files = files_from_eos(regex=args.dataset)

    # Test mode: One file per data set
    if args.test:
        tmp = {}
        for k, v in dataset_files.items():
            tmp[k] = v[:1]
        dataset_files = tmp

    # Time tagged submission directory
    timetag = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    foldername = timetag + (f"_{args.name}" if args.name else "")
    subdir = os.path.abspath(pjoin("./submission/", foldername))
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    # Sub-directory to store submission files
    filedir = 'files'
    if not os.path.exists(pjoin(subdir, filedir)):
        os.makedirs(pjoin(subdir, filedir))

    # Get proxy and copy to a safe location on AFS
    proxy = vo_proxy_path()
    proxydir = os.path.expanduser("~/.voms/")
    if not os.path.exists(proxydir):
        os.makedirs(proxydir)
    shutil.copy2(proxy, proxydir)

    for dataset, files in dataset_files.items():
        print(f"Submitting dataset: {dataset}.")

        chunks = chunkify(files, int(len(files) / args.filesperjob + 1))
        for ichunk, chunk in enumerate(chunks):
            # Save input files to a txt file and send to job
            tmpfile = pjoin(
                subdir, filedir,
                f"input_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt")
            with open(tmpfile, "w") as f:
                for file in chunk:
                    f.write(f"{file}\n")

            arguments = [
                # pjoin(proxydir, os.path.basename(proxy)),
                "$(Proxy_path)",
                str(Path(__file__).absolute()),
                args.processor,
                f'--outpath {pjoin(subdir, "output")}',
                f'--jobs {args.jobs}',
                'worker',
                f'--dataset {dataset}',
                f'--filelist {os.path.basename(tmpfile)}',
                f'--chunk {ichunk}'
            ]
            input_files = [
                os.path.abspath(tmpfile),
            ]
            environment = {"NOPREFETCH": str(args.no_prefetch).lower()}
            sub = htcondor.Submit({
                "Proxy_path": pjoin(proxydir, os.path.basename(proxy)),
                "Initialdir": subdir,
                "executable": bucoffea_path("execute/htcondor_wrap.sh"),
                "should_transfer_files": "YES",
                "when_to_transfer_output": "ON_EXIT",
                "transfer_input_files": ", ".join(input_files),
                "getenv": "true",
                "environment":
                    '"' + ' '.join([f"{k}={v}" for k, v in environment.items()]) + '"',
                "arguments": " ".join(arguments),
                "Output": f"{filedir}/out_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt",
                "Error": f"{filedir}/err_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt",
                "log": f"{filedir}/log_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt",
                # "log": "/dev/null",
                "request_cpus": str(args.jobs),
                "+MaxRuntime": f"{60*60*8}",
            })

            jdl = pjoin(subdir, filedir, f'job_{dataset}_{ichunk}.jdl')
            with open(jdl, "w") as f:
                f.write(str(sub))
                f.write("\nqueue 1\n")
            if args.dry:
                jobid = -1
            else:
                jobid = condor_submit(jdl)
            print(f"Submitted job {jobid}")
            with open("submission_history.txt", "a") as f:
                f.write(f"{datetime.now()} {jobid}\n")