def test_train(test, train, y_test, y_train, labels=(), bins=25, node=0,
               plot_dir=None, weight_test=None, weight_train=None):
    """Overlay train and test score distributions of one output node per class.

    For every entry of ``labels`` (class index = position in ``labels``) the
    scores ``[:, node]`` of the events with that true class are compared
    between the train and test samples with ``scipy.stats.kstest`` and drawn
    as normalized histograms (train: solid, test: dashed).

    Parameters
    ----------
    test, train : arrays indexed as ``[:, node]`` holding the scores.
    y_test, y_train : arrays of true class indices, compared with ``== i``.
    labels : sequence of str, one label per class index.
    bins : binning passed on to ``Hist1D``.
    node : column of the score arrays to look at.
    plot_dir : if truthy, the figure is saved there as PNG and PDF.
    weight_test, weight_train : optional per-event weights; ``None`` means
        unweighted histograms.

    Returns
    -------
    dict mapping each label to ``(p_value, ks_statistic)``.
    """
    ks = {}

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))

    for i, label in enumerate(labels):
        sel_test = (y_test == i)
        sel_train = (y_train == i)
        score_test = test[:, node][sel_test]
        score_train = train[:, node][sel_train]

        # NOTE: the KS test is run on the unweighted scores; the weights only
        # enter the histograms below.
        _ks, _p = scipy.stats.kstest(score_train, score_test)
        ks[label] = (_p, _ks)

        # Only forward weights when they were supplied, so that the default
        # ``None`` no longer fails on indexing.
        kwargs_test = {}
        if weight_test is not None:
            kwargs_test['weights'] = weight_test[sel_test]
        kwargs_train = {}
        if weight_train is not None:
            kwargs_train['weights'] = weight_train[sel_train]

        h_test = Hist1D(score_test, bins=bins, **kwargs_test).normalize()
        h_train = Hist1D(
            score_train,
            bins=bins,
            label=label+' (p=%.2f, KS=%.2f)'%(_p, _ks),
            **kwargs_train
        ).normalize()

        h_test.plot(color=colors[i], histtype="step", ls='--', linewidth=2)
        h_train.plot(color=colors[i], histtype="step", linewidth=2)

    if plot_dir:
        finalizePlotDir(plot_dir)
        fig.savefig("{}/score_node_{}.png".format(plot_dir, node))
        fig.savefig("{}/score_node_{}.pdf".format(plot_dir, node))

    return ks
def test_sample():
    """Values sampled from a histogram should reproduce the histogram's shape."""
    reference = Hist1D.from_random("norm", bins="10,-5,5", size=1e4)
    resampled = Hist1D(reference.sample(size=1e5), bins=reference.edges)

    # A straight-line fit to the ratio of the two normalized shapes is
    # expected to be flat (slope ~ 0) and sit at y = 1.
    ratio = reference.normalize() / resampled.normalize()
    fitted = ratio.fit("slope*x+offset")
    assert abs(fitted["params"]["slope"]["value"]) < 0.05
    assert abs(fitted["params"]["offset"]["value"] - 1) < 0.01
def test_train_cat(test, train, y_test, y_train, labels=(), n_cat=5,
                   plot_dir=None, weight_test=None, weight_train=None):
    """Overlay train and test predicted-category distributions per true class.

    The predicted category of an event is ``argmax(axis=1)`` of its score
    row. For every entry of ``labels`` (class index = position in ``labels``)
    the predicted categories of train and test events with that true class
    are compared with ``scipy.stats.kstest`` and drawn as normalized
    histograms (train: solid, test: dashed).

    Parameters
    ----------
    test, train : 2D score arrays supporting ``argmax(axis=1)``.
    y_test, y_train : arrays of true class indices, compared with ``== i``.
    labels : sequence of str, one label per class index.
    n_cat : number of categories; defines unit-width bins centred on
        ``0 .. n_cat-1`` and the y-axis range.
    plot_dir : if truthy, the figure is saved there as PNG and PDF.
    weight_test, weight_train : optional per-event weights; ``None`` means
        unweighted histograms.

    Returns
    -------
    dict mapping each label to ``(p_value, ks_statistic)``.
    """
    ks = {}
    bins = [x-0.5 for x in range(n_cat+1)]

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))

    # The predicted category does not depend on the class being drawn, so it
    # is computed once instead of on every loop iteration.
    cat_test = test.argmax(axis=1)
    cat_train = train.argmax(axis=1)

    for i, label in enumerate(labels):
        sel_test = (y_test == i)
        sel_train = (y_train == i)

        # NOTE: the KS test is run on the unweighted categories; the weights
        # only enter the histograms below.
        _ks, _p = scipy.stats.kstest(cat_train[sel_train], cat_test[sel_test])
        ks[label] = (_p, _ks)

        # Only forward weights when they were supplied, so that the default
        # ``None`` no longer fails on indexing.
        kwargs_test = {}
        if weight_test is not None:
            kwargs_test['weights'] = weight_test[sel_test]
        kwargs_train = {}
        if weight_train is not None:
            kwargs_train['weights'] = weight_train[sel_train]

        h_test = Hist1D(cat_test[sel_test], bins=bins, **kwargs_test).normalize()
        h_train = Hist1D(
            cat_train[sel_train],
            bins=bins,
            label=label+' (p=%.2f, KS=%.2f)'%(_p, _ks),
            **kwargs_train
        ).normalize()

        h_test.plot(color=colors[i], histtype="step", ls='--', linewidth=2)
        h_train.plot(color=colors[i], histtype="step", linewidth=2)

    ax.set_ylabel('a.u.')
    ax.set_xlabel('category')
    ax.set_ylim(0, 1/n_cat*5)

    if plot_dir:
        finalizePlotDir(plot_dir)
        fig.savefig("{}/categories.png".format(plot_dir))
        fig.savefig("{}/categories.pdf".format(plot_dir))

    return ks
def test_weight_broadcasting():
    """A scalar weight and an equivalent per-entry weight array both double the histogram."""
    values = [0.5, 0.5, 1.5, 1.5]
    unweighted = Hist1D(values, bins="2,0,2")

    # Same weight of 2, once as a scalar and once spelled out per entry.
    for weights in (2, 2 * np.ones_like(values)):
        weighted = Hist1D(values, bins="2,0,2", weights=weights)
        assert unweighted * 2 == weighted
def __init__(self, year):
    """Remember the year and, for 2018, load the b-tag scale factors and efficiencies.

    For 2016 and 2017 (and any other year) nothing beyond ``self.year`` is
    set, i.e. ``self.btag_sf`` and ``self.effs`` only exist for 2018.
    """
    self.year = year

    # Only the 2018 inputs are wired up; other years are placeholders.
    if self.year != 2018:
        return

    sf_path = os.path.expandvars('$TWHOME/data/btag/DeepJet_102XSF_V2.csv')
    self.btag_sf = BTagScaleFactor(sf_path, "medium", keep_df=False)

    # Tagging efficiencies, one JSON-serialized histogram per jet flavour.
    eff_files = {
        'b': "$TWHOME/data/btag/Autumn18_b_eff_deepJet.json",
        'c': "$TWHOME/data/btag/Autumn18_c_eff_deepJet.json",
        'light': "$TWHOME/data/btag/Autumn18_light_eff_deepJet.json",
    }
    self.effs = {
        flavour: Hist1D.from_json(os.path.expandvars(path))
        for flavour, path in eff_files.items()
    }
def test_likelihood():
    """Compare chi2 and likelihood fits on a sparse and on a well-populated histogram."""
    # NOTE(review): the results of ``allclose`` are not asserted here; this
    # assumes ``allclose`` raises on mismatch - confirm against its import.
    sparse = Hist1D(np.arange(5) * 2, bins="10,0,10")
    constant = "p0+0*x"
    chi2_fit = sparse.fit(constant, draw=False, likelihood=False)
    like_fit = sparse.fit(constant, draw=False, likelihood=True)
    allclose(chi2_fit["params"]["p0"]["value"], 1.0)
    allclose(like_fit["params"]["p0"]["value"], 0.5)

    # With large counts the two methods should agree: the difference of the
    # parameter errors, relative to the chi2 parameter values, stays below 1%.
    gaussian = "a*np.exp(-(x-mu)**2./(2*sigma**2.))"
    populated = Hist1D.from_random(
        "norm", params=[0, 1], size=2000, random_state=42, bins="20,-3,3"
    )
    chi2_params = populated.fit(gaussian, draw=False, likelihood=False)["params"]
    like_params = populated.fit(gaussian, draw=False, likelihood=True)["params"]

    names = chi2_params.keys()
    chi2_errors = np.array([chi2_params[name]["error"] for name in names])
    like_errors = np.array([like_params[name]["error"] for name in names])
    chi2_values = np.array([chi2_params[name]["value"] for name in names])

    relative_diff = (chi2_errors - like_errors) / chi2_values
    assert (np.abs(relative_diff) < 0.01).mean() == 1.0
def test_binning():
    """Explicit edges, a "n,low,high" string and bins+range must give equal histograms."""
    values = np.arange(10)

    from_edges = Hist1D(values, bins=np.linspace(0, 10, 11))
    from_string = Hist1D(values, bins="10,0,10")
    from_range = Hist1D(values, bins=10, range=[0, 10])

    assert from_edges == from_string
    assert from_string == from_range
def process_1tau_1lep(taus_all, muons_all, electrons_all, genWeight_all):
    """Book pair-mass and deltaR histograms for 1-tau + 1-lepton events.

    Two exclusive selections are made from the per-event object counts:
    exactly one tau, one electron and no muon ("taue"), and exactly one tau,
    one muon and no electron ("taumu"). For each non-empty selection the
    quantity ``sqrt(2 * pt1 * pt2 * (cosh(deta) - cos(dphi)))`` of the
    leading tau and lepton, and ``deltaR_devfunc`` of the two collections,
    are histogrammed with the generator weights.

    Returns a dict of ``Hist1D``; keys of an empty selection are absent.
    """
    out_hists = {}

    ntau = ak.num(taus_all)
    nmuon = ak.num(muons_all)
    nelectron = ak.num(electrons_all)

    mask_tau_e = (ntau == 1) & (nelectron == 1) & (nmuon == 0)
    mask_tau_mu = (ntau == 1) & (nmuon == 1) & (nelectron == 0)

    # (lepton collection, event mask, mass key/label, deltaR key, deltaR label)
    channels = (
        (electrons_all, mask_tau_e, "mtaue", "dR_tau_e", "dR_taue"),
        (muons_all, mask_tau_mu, "mtaumu", "dR_tau_mu", "dR_taumu"),
    )

    for leptons_all, mask, mass_name, dr_key, dr_label in channels:
        taus = taus_all[mask]
        leptons = leptons_all[mask]
        gen_weight = genWeight_all[mask]

        if len(taus) == 0:
            continue

        tau0 = taus[:, 0]
        lep0 = leptons[:, 0]
        # Computed from pt/eta/phi only, instead of summing the two objects
        # and taking ``.mass``.
        pair_mass = np.sqrt(
            2 * tau0.pt * lep0.pt
            * (np.cosh(tau0.eta - lep0.eta) - np.cos(tau0.phi - lep0.phi))
        )
        out_hists[mass_name] = Hist1D(
            ak.to_numpy(pair_mass),
            bins=bin_mZ,
            weights=ak.to_numpy(gen_weight),
            label=mass_name,
        )

        dR = deltaR_devfunc(taus, leptons)
        out_hists[dr_key] = Hist1D(
            ak.to_numpy(dR),
            bins=bin_dR,
            weights=ak.to_numpy(gen_weight),
            label=dr_label,
        )

    return out_hists
def test_gaus_extra():
    """An "offset+gaus" fit recovers mean and sigma of a peak on a flat background."""
    np.random.seed(42)
    bins = "50,-5,5"
    mean = 1.0
    sigma = 0.5

    peak = Hist1D(np.random.normal(mean, sigma, 350), bins=bins)
    flat = Hist1D(10 * np.random.random(600) - 5, bins=bins)
    fit = (peak + flat).fit("offset+gaus", draw=False)
    params = fit["params"]

    # Relative tolerance: 10% on the mean, 20% on the width.
    for name, truth, tolerance in (("mean", mean, 0.1), ("sigma", sigma, 0.2)):
        assert abs(params[name]["value"] - truth) / truth < tolerance
def test_sum():
    """The builtin ``sum`` over histograms must agree with chained ``+``."""
    first = Hist1D([0.5], bins=[0.0, 1])
    second = Hist1D([0.5], bins=[0.0, 1])
    assert sum([first, second]) == (first + second)

    # Single-bin histograms holding 2, 3 and 4 counts.
    two, three, four = (Hist1D.from_bincounts([n]) for n in (2, 3, 4))
    assert (two + three + four).integral == 9
    assert sum([two, three, four]).integral == 9
def make_plots(qArray, qArraySimTrackMatched, quantity, layerType):
    """Plot a quantity for all objects and for the sim-track-matched subset.

    Entries that are not ``> -999`` are dropped (presumably -999 marks
    "not filled" - confirm against the producer). Both distributions are
    drawn as filled histograms on a log-y axis with 1000 linear bin edges
    spanning the valid range of ``qArray`` padded by 10%, and the figure is
    saved as a PDF.

    Parameters
    ----------
    qArray, qArraySimTrackMatched : arrays supporting boolean-mask indexing
        and ``ak.to_numpy``.
    quantity : str, used for labels, title and the output file name.
    layerType : str, appended to title and file name when non-empty.

    Command-line arguments read from ``sys.argv``: ``[2]`` output directory
    (optional), ``[3]`` and ``[4]`` sample and tag for the super-title
    (optional).

    Returns None. Nothing is plotted when there is nothing valid to plot.
    """
    if len(qArray) == 0 or len(qArraySimTrackMatched) == 0:
        print("{} has no entries. Skipping".format(layerType))
        return

    validAll = qArray[qArray > -999]
    if len(validAll) == 0:
        # Every entry is a placeholder: min()/max() below would raise.
        print("{} has no entries. Skipping".format(layerType))
        return
    validMatched = qArraySimTrackMatched[qArraySimTrackMatched > -999]

    minValue = min(validAll)
    maxValue = max(validAll)
    # Pad the range by 10% away from the data on both sides.
    histMinLimit = 1.1 * minValue if minValue < 0 else 0.9 * minValue
    histMaxLimit = 1.1 * maxValue if maxValue > 0 else 0.9 * maxValue
    binning = np.linspace(histMinLimit, histMaxLimit, 1000)

    allHist = Hist1D(ak.to_numpy(validAll),
                     bins=binning,
                     label="{}".format(quantity))
    simtrackMatchedHist = Hist1D(ak.to_numpy(validMatched),
                                 bins=binning,
                                 label="Sim track matched {}".format(quantity))

    fig, ax = plt.subplots()
    ax.set_yscale("log")
    allHist.plot(alpha=0.8, color="C0", label="all", histtype="stepfilled")
    simtrackMatchedHist.plot(alpha=0.8,
                             color="C3",
                             label="sim track matched",
                             histtype="stepfilled")

    if layerType == "":
        if "TripletPt" in quantity:
            title = quantity.replace("TripletPt", "Triplet radius")
        else:
            title = quantity
        plt.title("{}".format(title))
    else:
        plt.title("{} type {}".format(quantity, layerType))

    # Sample and tag are optional, like the output directory below.
    if len(sys.argv) > 4:
        plt.suptitle("Sample = {} Tag = {}".format(sys.argv[3], sys.argv[4]))

    # Turn composite quantity names into something usable as a file name.
    quantity = quantity.replace("(", " ")
    quantity = quantity.replace(")", "")
    quantity = quantity.replace("/", "by")
    quantity = quantity.replace("-", "minus")
    quantity = quantity.replace(" ", "_")
    if quantity.startswith("_"):
        quantity = quantity[1:]

    if layerType != "":
        fileName = "{}_{}.pdf".format(quantity, layerType)
    else:
        fileName = "{}.pdf".format(quantity)
    if len(sys.argv) > 2:
        fileName = "{}/{}".format(sys.argv[2], fileName)

    plt.savefig(fileName)
    plt.close()
def process_diphoton(photons, gHidx, mgg, genWeight):
    """Book kinematic and ID histograms for the two selected photons.

    ``gHidx[:, 0]`` and ``gHidx[:, 1]`` are passed to ``getFromIdx`` to pick
    the first and second photon of each event. For both photons pt, pt/mgg,
    eta, phi and mvaID are histogrammed, weighted by ``genWeight``.

    Returns a dict of ``Hist1D`` keyed ``pho_<quantity><1|2>``.
    """
    # (output key, photon field, column of gHidx, binning, label, divide by mgg)
    # NOTE(review): both pT histograms use bin_pt1 - confirm this is intended.
    bookings = (
        ("pho_pT1", "pt", 0, bin_pt1, "pho1 pt", False),
        ("pho_pT2", "pt", 1, bin_pt1, "pho2 pt", False),
        ("pho_pTom1", "pt", 0, bin_ptom, "pho1 ptom", True),
        ("pho_pTom2", "pt", 1, bin_ptom, "pho2 ptom", True),
        ("pho_eta1", "eta", 0, bin_eta, "pho1 eta", False),
        ("pho_eta2", "eta", 1, bin_eta, "pho2 eta", False),
        ("pho_phi1", "phi", 0, bin_phi, "pho1 phi", False),
        ("pho_phi2", "phi", 1, bin_phi, "pho2 phi", False),
        ("pho_id1", "mvaID", 0, bin_bdt, "pho1 id", False),
        ("pho_id2", "mvaID", 1, bin_bdt, "pho2 id", False),
    )

    out_hists = {}
    for key, field, column, binning, label, over_mass in bookings:
        values = getFromIdx(getattr(photons, field), gHidx[:, column])
        if over_mass:
            values = values / mgg
        out_hists[key] = Hist1D(
            ak.to_numpy(values),
            bins=binning,
            weights=ak.to_numpy(genWeight),
            label=label,
        )

    # TODO: need a function to make the p4 of the diphoton system.
    return out_hists
def test_statistics():
    """Check mean, std and mode on two-bin histograms."""
    balanced = Hist1D([0.5, 0.5, 1.5, 1.5], bins=[0.0, 1.0, 2.0])
    assert balanced.mean() == 1.0
    assert balanced.std() == 0.5
    # Both bins are equally populated; 0.5 is the expected mode.
    assert balanced.mode() == 0.5

    # With the second bin more populated the mode moves there.
    skewed = Hist1D([0.5, 1.5, 1.5], bins=[0.0, 1.0, 2.0])
    assert skewed.mode() == 1.5
def test_overflow():
    """Out-of-range entries land in the edge bins only when ``overflow=True``."""
    values = np.arange(10)
    binning = "8,0.5,8.5"

    # (overflow flag, expected content of first/last bin, expected integral)
    scenarios = ((True, 2, 10), (False, 1, 8))
    for overflow, edge_count, total in scenarios:
        h = Hist1D(values, bins=binning, overflow=overflow)
        assert h.counts[0] == edge_count
        assert h.counts[-1] == edge_count
        assert h.integral == total
def make_plots(qArray, qArraySimTrackMatched, quantity, layerType):
    """Plot a quantity for all objects and for the sim-track-matched subset.

    Entries that are not ``> -999`` are dropped (presumably -999 marks
    "not filled" - confirm against the producer). The binning has 1000 edges
    spanning the valid range of ``qArray`` padded by 10%. A logarithmic x
    axis is used when the lower limit is positive and either the range
    exceeds 10 or the quantity is a ratio (its name contains "/"); otherwise
    the x axis is linear. The y axis is always logarithmic. The figure is
    saved as a PDF.

    Parameters
    ----------
    qArray, qArraySimTrackMatched : arrays supporting boolean-mask indexing
        and ``ak.to_numpy``.
    quantity : str, used for labels, title and the output file name.
    layerType : str, appended to title and file name when non-empty.

    Command-line arguments read from ``sys.argv``: ``[2]`` output directory
    (optional), ``[3]`` and ``[4]`` sample and tag for the super-title
    (optional).

    Returns None. Nothing is plotted when there is no valid entry.
    """
    validAll = qArray[qArray > -999]
    if len(validAll) == 0:
        # min()/max() below would raise on an empty selection.
        print("{} has no entries. Skipping".format(layerType))
        return
    validMatched = qArraySimTrackMatched[qArraySimTrackMatched > -999]

    minValue = min(validAll)
    maxValue = max(validAll)
    # Pad the range by 10% away from the data on both sides.
    histMinLimit = minValue * 1.1 if minValue < 0 else minValue * 0.9
    histMaxLimit = maxValue * 0.9 if maxValue < 0 else maxValue * 1.1

    # A log axis needs a strictly positive lower limit. The original
    # condition ``A and B or C`` let a "/" in the name force log binning even
    # for non-positive limits, which yields NaN bin edges.
    useLogX = histMinLimit > 0 and (
        abs(histMaxLimit - histMinLimit) > 10 or "/" in quantity)

    if useLogX:
        binning = np.logspace(np.log10(histMinLimit), np.log10(histMaxLimit),
                              1000)
    else:
        binning = np.linspace(histMinLimit, histMaxLimit, 1000)

    allHist = Hist1D(ak.to_numpy(validAll),
                     bins=binning,
                     label="{}".format(quantity))
    simtrackMatchedHist = Hist1D(ak.to_numpy(validMatched),
                                 bins=binning,
                                 label="Sim track matched {}".format(quantity))

    fig = plt.figure()
    plt.yscale("log")
    if useLogX:
        plt.xscale("log")

    allHist.plot(alpha=0.8, color="C0", label="all")
    simtrackMatchedHist.plot(alpha=0.8, color="C3", label="sim track matched")

    if layerType == "":
        plt.title("{}".format(quantity))
    else:
        plt.title("{} type {}".format(quantity, layerType))

    # Sample and tag are optional, like the output directory below.
    if len(sys.argv) > 4:
        plt.suptitle("Sample = {} Tag = {}".format(sys.argv[3], sys.argv[4]))

    # Turn composite quantity names into something usable as a file name.
    quantity = quantity.replace("(", " ")
    quantity = quantity.replace(")", "")
    quantity = quantity.replace("/", "by")
    quantity = quantity.replace("-", "minus")
    quantity = quantity.replace(" ", "_")

    if layerType != "":
        fileName = "{}_{}.pdf".format(quantity, layerType)
    else:
        fileName = "{}.pdf".format(quantity)
    if len(sys.argv) > 2:
        fileName = "{}/{}".format(sys.argv[2], fileName)

    plt.savefig(fileName)
    plt.close()
def test_datetime():
    """Datetime columns can be histogrammed with default, integer and datetime bins."""
    pd = pytest.importorskip("pandas")

    df = pd.DataFrame()
    df["date"] = pd.date_range("2019-01-01", "2020-01-10", freq="1h")
    df["num"] = np.random.normal(0, 1, len(df))

    edges = pd.date_range(
        pd.Timestamp("2019-01-01"), pd.Timestamp("2020-01-10"), periods=20
    )

    histograms = [
        Hist1D(df["date"]),
        Hist1D(df["date"], bins=10),
        Hist1D(df["date"], bins=edges),
    ]
    # No entry may be lost, whichever way the binning was specified.
    for hist in histograms:
        assert len(df) == hist.integral
def test_lookup():
    """``lookup`` returns bin contents and clamps out-of-range values to the edge bins."""
    h = Hist1D.from_random(size=50, bins="7,-3,3", random_state=42)

    # NOTE(review): the result of ``allclose`` is not asserted; this assumes
    # ``allclose`` raises on mismatch - confirm against its import.
    allclose(h.lookup(h.bin_centers), h.counts)

    first_bin = h.counts[0]
    last_bin = h.counts[-1]
    # Far out-of-range values, given once as a list and once as a scalar.
    for below, above in (([-10.0], [10.0]), (-10.0, 10.0)):
        assert h.lookup(below) == first_bin
        assert h.lookup(above) == last_bin
def test_against_root():
    """
    Compare gaussian fit results with reference numbers obtained from ROOT:

    import ROOT as r
    h1 = r.TH1F("h1","",10,-1.01,1.01)
    for x in [-0.1, 0.1, -0.2, 0.1, 0.1, 0.4, 0.1]:
        h1.Fill(x)
    for likelihood in ["", "L"]:
        res = h1.Fit("gaus", f"QS{likelihood}").Get()
        print(list(res.Parameters()), list(res.Errors()))
    """
    h = Hist1D([-0.1, 0.1, -0.2, 0.1, 0.1, 0.4, 0.1], bins="10,-1.01,1.01")

    # Per fit mode: (parameter, reference, tolerance) for values and errors.
    references = (
        (
            False,
            (("constant", 4.1175, 1e-3), ("mean", 0.0673, 1e-3), ("sigma", 0.1401, 1e-3)),
            (("constant", 2.0420, 1e-3), ("mean", 0.0584, 1e-3), ("sigma", 0.0531, 1e-3)),
        ),
        (
            True,
            (("constant", 4.3562, 1e-3), ("mean", 0.07190, 1e-3), ("sigma", 0.1294, 1e-3)),
            (("constant", 2.0008, 2e-2), ("mean", 0.04908, 1e-3), ("sigma", 0.0339, 1e-3)),
        ),
    )

    for likelihood, ref_values, ref_errors in references:
        params = h.fit("gaus", likelihood=likelihood)["params"]
        for name, reference, tolerance in ref_values:
            assert abs(params[name]["value"] - reference) < tolerance
        for name, reference, tolerance in ref_errors:
            assert abs(params[name]["error"] - reference) < tolerance
def test_extent():
    """Fitting with ``extent`` must equal fitting the histogram restricted to that range."""
    wide = Hist1D.from_random(
        "uniform", params=[-1, 1], size=1e3, bins="100,-1,1")
    narrow = Hist1D.from_random(
        "uniform", params=[0, +1], size=2e3, bins="100,-1,1")
    h_full = wide + narrow
    h_restricted = h_full.restrict(0, 1)

    fit_full = h_full.fit("a+b*x", extent=[0, 1], color="C0", draw=False)
    fit_restricted = h_restricted.fit("a+b*x", color="C3", draw=False)

    # Same fit domain, hence the same fitted parameters.
    assert fit_full["params"] == fit_restricted["params"]

    # The histogram returned by each fit must match the binning of its input.
    assert h_restricted._check_consistency(fit_restricted["hfit"])
    assert h_full._check_consistency(fit_full["hfit"])
def test_weighted():
    """Weighted fill: counts are sums of weights, errors are sqrt of summed squared weights."""
    values = np.array([0.5, 0.5, 1.5, 1.5])
    weights = np.array([1.0, 1.0, 2.0, 2.0])
    edges = np.array([0.0, 1.0, 2.0])

    h = Hist1D(values, bins=edges, weights=weights)

    # NOTE(review): the results of ``allclose`` are not asserted; this
    # assumes ``allclose`` raises on mismatch - confirm against its import.
    expected_counts = np.array([2.0, 4.0])
    expected_errors = np.array([2.0**0.5, 8**0.5])
    allclose(h.counts, expected_counts)
    allclose(h.errors, expected_errors)
def test_metadata():
    """Extra keyword arguments and the ``metadata`` dict both end up in ``.metadata``."""
    # (constructor keyword arguments, expected metadata)
    cases = (
        ({}, {}),
        ({"label": "test"}, {"label": "test"}),
        ({"color": "C0"}, {"color": "C0"}),
        (
            {"color": "C0", "metadata": {"foo": "bar"}},
            {"color": "C0", "foo": "bar"},
        ),
        (
            {"metadata": {"color": "C0", "foo": "bar"}},
            {"color": "C0", "foo": "bar"},
        ),
    )
    for kwargs, expected in cases:
        assert Hist1D(**kwargs).metadata == expected
def test_frombincounts():
    """``from_bincounts`` builds histograms from pre-binned contents."""
    np.random.seed(42)
    samples = np.random.random(100)
    edges = np.linspace(0, 1, 11)

    # Filling directly and wrapping numpy's counts must give the same result.
    filled = Hist1D(samples, bins=edges)
    counts, _ = np.histogram(samples, bins=edges)
    wrapped = Hist1D.from_bincounts(counts=counts, bins=edges)
    assert filled == wrapped

    # Without explicit bins the number of bins follows the counts.
    unbinned = Hist1D.from_bincounts([1, 2])
    assert unbinned.nbins == 2
    assert unbinned.integral == 3.0

    # Positional bins plus metadata keywords.
    labelled = Hist1D.from_bincounts(
        [1, 1, 2], [-1.5, -0.5, 0.5, 1.5], label="test1", color="red"
    )
    # NOTE(review): the result of ``allclose`` is not asserted; this assumes
    # ``allclose`` raises on mismatch - confirm against its import.
    allclose(labelled.counts, [1.0, 1.0, 2.0])
    assert labelled.metadata["label"] == "test1"
def process_event(weights, evt_vars):  #, bjets):
    """Book event-level histograms and the sum of event weights.

    Parameters
    ----------
    weights : per-event weights; used to weight the histogram and summed
        with ``ak.sum``.
    evt_vars : event record providing ``MET_pt``.

    Returns
    -------
    dict with ``"MET"`` (weighted ``Hist1D`` of ``evt_vars.MET_pt``) and
    ``"weight"`` (sum of ``weights``).
    """
    out_hists = {}

    # The histogram was weighted with ``genWeight``, a name that is not
    # defined in this function; the ``weights`` argument is used instead.
    event_weights = ak.to_numpy(weights)

    out_hists["MET"] = Hist1D(ak.to_numpy(evt_vars.MET_pt),
                              bins=bin_met,
                              label="MET",
                              weights=event_weights)
    # NOTE: histograms for the jet multiplicity (evt_vars.nJet, bin_njet) and
    # b-jet multiplicity (ak.num(bjets), bin_bjet) are currently disabled.

    out_hists["weight"] = ak.sum(weights)
    return out_hists
def test_fill():
    """Repeated ``fill`` calls accumulate counts and keep the metadata."""
    h = Hist1D(bins="10,0,10", label="test")
    for batch in ([1, 2, 3, 4], [0, 1, 2]):
        h.fill(batch)

    # Only checks that median() can be called on a filled histogram.
    h.median()

    # (value looked up, expected bin content)
    expectations = ((0, 1.0), (1, 2.0), (3, 1.0), (5, 0.0))
    for value, content in expectations:
        assert h.lookup(value) == content
    assert h.metadata["label"] == "test"
def test_return_func():
    """The callable returned by ``fit`` reproduces the fitted histogram at the bin centers."""
    h = Hist1D.from_random("uniform")
    result = h.fit("a+b*x")

    evaluated = result["func"](h.bin_centers)
    # NOTE(review): the result of ``allclose`` is not asserted; this assumes
    # ``allclose`` raises on mismatch - confirm against its import.
    allclose(evaluated, result["hfit"].counts)
def test_json():
    """Round-trip a histogram through JSON: in memory, plain file and gzipped file.

    Files are written to a temporary directory that is removed afterwards,
    so the test leaves nothing behind in the working directory.
    """
    import tempfile

    h1 = Hist1D([0.5], bins=np.arange(1000), label="foo")

    # In-memory round trip.
    h2 = Hist1D.from_json(h1.to_json())
    assert h1 == h2
    assert h1.metadata == h2.metadata

    with tempfile.TemporaryDirectory() as tmpdir:
        plain_path = os.path.join(tmpdir, "tmphist1d.json")
        gzip_path = os.path.join(tmpdir, "tmphist1d.json.gz")

        for path in (plain_path, gzip_path):
            h1.to_json(path)
            h2 = Hist1D.from_json(path)
            assert h1 == h2
            assert h1.metadata == h2.metadata

        # The ".gz" output is expected to be smaller than the plain one.
        assert os.path.getsize(gzip_path) < os.path.getsize(plain_path)
def test_linear_fit():
    """A line fitted to a flat histogram (2, 2, 2, 2, 2) has offset 2 and slope 0."""
    flat = Hist1D(np.arange(10) + 0.5, bins="5,0,10")
    fit = flat.fit("a+b*x", draw=False)
    params = fit["params"]

    assert np.abs(params["a"]["value"] - 2) < 1e-3
    assert np.abs(params["b"]["value"] - 0) < 1e-3
    assert params["a"]["error"] > 0.5
    assert fit["chi2"] < 1e-3
    # 5 bins minus 2 fitted parameters.
    assert fit["ndof"] == 3
def hist_from_branch(fname, branch, entry_stop=500,
                     redirector="root://xcache-redirector.t2.ucsd.edu:2042/"):
    """Histogram the first entries of one branch of the "Events" tree.

    Parameters
    ----------
    fname : str, file path appended to ``redirector``.
    branch : str, name of the branch in the ``Events`` tree.
    entry_stop : int, number of entries to read (default 500).
    redirector : str, URL prefix used to open the file.

    Returns
    -------
    ``Hist1D`` built from the (flattened, when possible) branch content with
    default binning.
    """
    from yahist import Hist1D

    # Close the remote file as soon as the array has been read.
    with uproot4.open(redirector + fname) as f:
        arr = f["Events"][branch].array(entry_stop=entry_stop)

    try:
        arr = np.array(ak.flatten(arr))
    except Exception:
        # Best effort: arrays that cannot be flattened are histogrammed as
        # read. Unlike a bare ``except``, KeyboardInterrupt and SystemExit
        # still propagate.
        pass

    return Hist1D(arr)
def test_arithmetic():
    """Check counts and errors after basic arithmetic on a single-entry histogram."""
    h = Hist1D([0.5], bins=[0.0, 1])
    sqrt2 = 2.0**0.5

    # (result of the operation, expected count, expected error) for bin 0
    cases = (
        (h + h, 2.0, sqrt2),
        (2.0 * h, 2.0, 2.0),
        (h / 2.0, 0.5, 0.5),
        (h - h, 0.0, sqrt2),
        (h / h, 1.0, sqrt2),
    )
    for result, count, error in cases:
        assert result.counts[0] == count
        assert result.errors[0] == error
def test_basic():
    """Check the basic attributes of a two-bin histogram with two entries per bin."""
    values = np.array([0.5, 0.5, 1.5, 1.5])
    edges = np.array([0.0, 1.0, 2.0])
    h = Hist1D(values, bins=edges)

    assert h.dim == 1

    # NOTE(review): the results of ``allclose`` are not asserted; this
    # assumes ``allclose`` raises on mismatch - confirm against its import.
    contents = np.array([2.0, 2.0])
    allclose(h.counts, contents)
    allclose(h.errors, contents**0.5)
    allclose(h.edges, edges)

    assert h.nbins == len(edges) - 1

    allclose(h.bin_widths, np.array([1.0, 1.0]))
    allclose(h.bin_centers, np.array([0.5, 1.5]))