def load_lists():
    """Load the 2017 and 2018 dataset lists and keep non-commented NANOAOD entries."""
    files = [
        bucoffea_path("data/datasets/datasets_2017.txt"),
        bucoffea_path("data/datasets/datasets_2018.txt")
    ]
    lines = []
    for fpath in files:
        with open(fpath, "r") as f:
            lines.extend(f.readlines())

    lines = filter(lambda x: "NANOAOD" in x and not x.startswith("#"), lines)
    return lines
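# Usage sketch (illustration only, not part of the original module): load_lists()
# returns a lazy filter object whose entries keep the trailing newline from
# readlines(), so materialize the result and strip it before further use.
def example_load_lists():
    return [line.strip() for line in load_lists()]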
def _configure(self, df):
    dataset = df['dataset']
    self._year = extract_year(dataset)

    # Reload config based on year
    cfg.DYNACONF_WORKS = "merge_configs"
    cfg.MERGE_ENABLED_FOR_DYNACONF = True
    cfg.SETTINGS_FILE_FOR_DYNACONF = bucoffea_path("config/monojet.yaml")
    cfg.ENV_FOR_DYNACONF = f"era{self._year}"
    cfg.reload()
def _configure(self, df=None):
    cfg.DYNACONF_WORKS = "merge_configs"
    cfg.MERGE_ENABLED_FOR_DYNACONF = True
    cfg.SETTINGS_FILE_FOR_DYNACONF = bucoffea_path("config/vbfhinv.yaml")

    # Reload config based on year
    if df:
        dataset = df['dataset']
        self._year = extract_year(dataset)
        cfg.ENV_FOR_DYNACONF = f"era{self._year}"
    else:
        cfg.ENV_FOR_DYNACONF = "default"
    cfg.reload()
def files_from_ac(regex):
    """Generate file list per dataset from T2_DE_RWTH

    :param regex: Regular expression to match datasets
    :type regex: string
    :return: Mapping of dataset : [files]
    :rtype: dict
    """
    path = bucoffea_path('data/datasets/crabfiles.yml')
    with open(path, 'r') as stream:
        try:
            fileset = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

    for dataset, files in fileset.items():
        if not re.match(regex, dataset):
            continue
        # Drop empty file names for the datasets matching the regex
        for ifile in reversed(files):
            if not len(ifile):
                files.remove(ifile)
        fileset[dataset] = files
    return fileset
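# Usage sketch (illustration only; the regex is a hypothetical example): note that
# files_from_ac() returns the full mapping from crabfiles.yml and only cleans the
# empty entries of the matching datasets, so filter the keys again if only the
# matching datasets are wanted.
def example_files_from_ac():
    regex = ".*DYJetsToLL.*"
    fileset = files_from_ac(regex=regex)
    return {name: files for name, files in fileset.items() if re.match(regex, name)}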
def process(self, df):
    self._configure(df)
    output = self.accumulator.identity()
    dataset = df['dataset']

    # Lumi mask
    year = extract_year(dataset)
    if is_data(dataset):
        if year == 2016:
            json = bucoffea_path(
                'data/json/Cert_271036-284044_13TeV_ReReco_07Aug2017_Collisions16_JSON.txt'
            )
        elif year == 2017:
            json = bucoffea_path(
                'data/json/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON_v1.txt'
            )
        elif year == 2018:
            json = bucoffea_path(
                'data/json/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt'
            )
        lumi_mask = LumiMask(json)(df['run'], df['luminosityBlock'])
    else:
        lumi_mask = np.ones(df.size) == 1

    # MET filters
    if is_data(dataset):
        filt_met = mask_and(df, cfg.FILTERS.DATA)
    else:
        filt_met = mask_and(df, cfg.FILTERS.MC)

    if year == 2016:
        trigger = 'HLT_Photon175'
    else:
        trigger = 'HLT_Photon200'

    photons = setup_photons(df)

    ak4 = setup_jets(df)
    ak4 = ak4[
        object_overlap(ak4, photons)
        & ak4.tightId
        & (ak4.pt > 100)
        & (ak4.abseta < 2.4)
    ]

    event_mask = (
        filt_met
        & lumi_mask
        & (ak4.counts > 0)
        & df[trigger]
        & (df['MET_pt'] < 60)
    )

    # Generator weight
    weights = processor.Weights(size=df.size, storeIndividual=True)
    if is_data(dataset):
        weights.add('gen', np.ones(df.size))
    else:
        weights.add('gen', df['Generator_weight'])

    photon_kinematics = (photons.pt > 200) & (photons.barrel)

    # Medium
    vals = photons[photon_kinematics & photons.mediumId].sieie[event_mask]
    pt = photons[photon_kinematics & photons.mediumId].pt[event_mask]
    output['sieie'].fill(
        dataset=dataset,
        cat='medium',
        sieie=vals.flatten(),
        pt=pt.flatten(),
        weights=weight_shape(vals, weights.weight()[event_mask])
    )

    # No Sieie
    vals = photons[photon_kinematics & medium_id_no_sieie(photons)].sieie[event_mask]
    pt = photons[photon_kinematics & medium_id_no_sieie(photons)].pt[event_mask]
    output['sieie'].fill(
        dataset=dataset,
        cat='medium_nosieie',
        sieie=vals.flatten(),
        pt=pt.flatten(),
        weights=weight_shape(vals, weights.weight()[event_mask])
    )

    # No Sieie, inverted isolation
    vals = photons[photon_kinematics & medium_id_no_sieie_inv_iso(photons)].sieie[event_mask]
    pt = photons[photon_kinematics & medium_id_no_sieie_inv_iso(photons)].pt[event_mask]
    output['sieie'].fill(
        dataset=dataset,
        cat='medium_nosieie_invertiso',
        sieie=vals.flatten(),
        pt=pt.flatten(),
        weights=weight_shape(vals, weights.weight()[event_mask])
    )

    # Keep track of weight sum
    if not is_data(dataset):
        output['sumw'][dataset] += df['genEventSumw']
        output['sumw2'][dataset] += df['genEventSumw2']
    return output
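# Execution sketch (illustration only; assumes a coffea 0.6-style workflow and a
# processor instance built from the class that owns process() above, both of which
# are assumptions here): the method is typically driven by coffea's run_uproot_job
# over a {dataset: [files]} mapping such as the one files_from_ac() returns.
def example_run_processor(fileset, processor_instance):
    from coffea import processor as coffea_processor
    return coffea_processor.run_uproot_job(
        fileset,
        treename="Events",
        processor_instance=processor_instance,
        executor=coffea_processor.futures_executor,
        executor_args={"workers": 4},
    )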
def do_submit(args):
    """Submit the analysis to HTCondor."""
    import htcondor

    if args.datasrc == 'das':
        dataset_files = files_from_das(regex=args.dataset)
    elif args.datasrc == 'ac':
        dataset_files = files_from_ac(regex=args.dataset)
    else:
        dataset_files = files_from_eos(regex=args.dataset)

    # Test mode: One file per data set
    if args.test:
        tmp = {}
        for k, v in dataset_files.items():
            tmp[k] = v[:1]
        dataset_files = tmp

    # Time-tagged submission directory
    timetag = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    foldername = timetag + (f"_{args.name}" if args.name else "")
    subdir = os.path.abspath(pjoin("./submission/", foldername))
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    # Sub-directory to store submission files
    filedir = 'files'
    if not os.path.exists(pjoin(subdir, filedir)):
        os.makedirs(pjoin(subdir, filedir))

    # Get proxy and copy to a safe location on AFS
    proxy = vo_proxy_path()
    proxydir = os.path.expanduser("~/.voms/")
    if not os.path.exists(proxydir):
        os.makedirs(proxydir)
    shutil.copy2(proxy, proxydir)

    for dataset, files in dataset_files.items():
        print(f"Submitting dataset: {dataset}.")

        chunks = chunkify(files, int(len(files) / args.filesperjob + 1))
        for ichunk, chunk in enumerate(chunks):
            # Save input files to a txt file and send to job
            tmpfile = pjoin(
                subdir, filedir,
                f"input_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt"
            )
            with open(tmpfile, "w") as f:
                for file in chunk:
                    f.write(f"{file}\n")

            arguments = [
                # pjoin(proxydir, os.path.basename(proxy)),
                "$(Proxy_path)",
                str(Path(__file__).absolute()),
                args.processor,
                f'--outpath {pjoin(subdir, "output")}',
                f'--jobs {args.jobs}',
                'worker',
                f'--dataset {dataset}',
                f'--filelist {os.path.basename(tmpfile)}',
                f'--chunk {ichunk}'
            ]

            input_files = [
                os.path.abspath(tmpfile),
            ]

            environment = {"NOPREFETCH": str(args.no_prefetch).lower()}

            sub = htcondor.Submit({
                "Proxy_path": pjoin(proxydir, os.path.basename(proxy)),
                "Initialdir": subdir,
                "executable": bucoffea_path("execute/htcondor_wrap.sh"),
                "should_transfer_files": "YES",
                "when_to_transfer_output": "ON_EXIT",
                "transfer_input_files": ", ".join(input_files),
                "getenv": "true",
                "environment": '"' + ' '.join([f"{k}={v}" for k, v in environment.items()]) + '"',
                "arguments": " ".join(arguments),
                "Output": f"{filedir}/out_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt",
                "Error": f"{filedir}/err_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt",
                "log": f"{filedir}/log_{dataset}_{ichunk:03d}of{len(chunks):03d}.txt",
                # "log": f"/dev/null",
                "request_cpus": str(args.jobs),
                "+MaxRuntime": f"{60*60*8}"
            })

            jdl = pjoin(subdir, filedir, f'job_{dataset}_{ichunk}.jdl')
            with open(jdl, "w") as f:
                f.write(str(sub))
                f.write("\nqueue 1\n")

            if args.dry:
                jobid = -1
            else:
                jobid = condor_submit(jdl)
            print(f"Submitted job {jobid}")
            with open("submission_history.txt", "a") as f:
                f.write(f"{datetime.now()} {jobid}\n")
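# Invocation sketch (illustration only; every value below is a placeholder):
# do_submit() only reads the attributes shown here from the parsed arguments, so an
# equivalent namespace can be passed directly when exercising the submission logic.
if __name__ == "__main__":
    from argparse import Namespace
    example_args = Namespace(
        datasrc="ac",               # 'das', 'ac', or anything else to fall back to EOS
        dataset=".*DYJetsToLL.*",   # dataset regex (hypothetical)
        test=True,                  # keep only one file per dataset
        name="test_submission",     # suffix for the time-tagged submission folder
        filesperjob=10,
        jobs=1,
        processor="monojet",        # forwarded to the worker command line (hypothetical value)
        no_prefetch=False,
        dry=True,                   # write JDL files without calling condor_submit
    )
    do_submit(example_args)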