def generate_initial_params(hgg_bg, hgg_signal, n_sigma):
    """Derive pdf parameters describing the bg and signal Mgg distributions.

    Takes the background and signal dataframes plus the desired signal
    strength (``n_sigma``, in units of sqrt(bg) under the signal window) and
    returns the fitted bg/signal results, the event counts, and the
    bayesian-blocks edges for each sample.
    """
    # Restrict bg to the 100-180 GeV Mgg window and cap at 10k events.
    bg_mgg = hgg_bg[(hgg_bg.Mgg > 100) & (hgg_bg.Mgg < 180)][0:10000].Mgg
    # Signal yield scales as n_sigma times the Poisson fluctuation of the
    # bg sitting under the signal region (118-133 GeV).
    under_signal = bg_mgg[(118 < bg_mgg) & (bg_mgg < 133)]
    n_sig = int(n_sigma * np.sqrt(under_signal.size))
    sig_mgg = hgg_signal[(hgg_signal.Mgg >= 118) & (hgg_signal.Mgg <= 133)][0:n_sig].Mgg

    data_bg = bg_mgg.values
    data_sig = sig_mgg.values

    # Unbinned maximum-likelihood fits to each sample.
    bg_model = ff.Model(bg_pdf, ['a1', 'a2', 'a3'])
    bg_model.set_bounds([(-1., 1.)] * 3)
    bg_result = ff.NLLFitter(bg_model, data_bg).fit([0.0, 0.0, 0.0])

    sig_model = ff.Model(sig_pdf, ['mu', 'sigma'])
    sig_model.set_bounds([(110, 130), (1, 5)])
    sig_result = ff.NLLFitter(sig_model, data_sig).fit([120.0, 2])

    # Bayesian-blocks edges for each distribution.
    be_bg = bayesian_blocks(data_bg, p0=0.02)
    be_sig = bayesian_blocks(data_sig, p0=0.02)
    return bg_result, sig_result, len(data_bg), n_sig, be_bg, be_sig
def generate_initial_params(hgg_bg, hgg_signal, n_sigma):
    """Input bg and signal dataframes, and a sigma value for signal injection.

    Output parameters for the pdfs that describe those distributions:
    (bg_result, sig_result, n_bg, n_sig, be_bg, be_sig), where be_* are
    bayesian-blocks edges for each sample.
    """
    # grab a handful of bg events, and an ~X sigma number of signal events
    hgg_bg_selection = hgg_bg[(hgg_bg.Mgg > 100) & (hgg_bg.Mgg < 180)][0:10000].Mgg
    n_bg_under_sig = hgg_bg_selection[(118 < hgg_bg_selection) & (hgg_bg_selection < 133)].size
    # Signal yield = n_sigma x sqrt(bg under the 118-133 GeV window).
    n_sig = int(n_sigma * np.sqrt(n_bg_under_sig))
    hgg_signal_selection = hgg_signal[(hgg_signal.Mgg >= 118) & (hgg_signal.Mgg <= 133)][0:n_sig].Mgg
    data_bg = hgg_bg_selection.values
    data_sig = hgg_signal_selection.values

    # fit to the data distributions
    # (Parameters tuples are (name, value, vary, min, max, expr, brute_step).)
    bg_params = Parameters()
    bg_params.add_many(('a1', 0., True, -1, 1, None, None),
                       ('a2', 0., True, -1, 1, None, None),
                       ('a3', 0., True, -1, 1, None, None))
    bg_model = Model(bg_pdf, bg_params)
    bg_fitter = NLLFitter(bg_model)
    bg_result = bg_fitter.fit(data_bg, calculate_corr=False)

    sig_params = Parameters()
    sig_params.add_many(
        ('mu', 125, True, 110, 130, None, None),
        ('sigma', 1, True, 1, 5, None, None),
    )
    sig_model = Model(sig_pdf, sig_params)
    sig_fitter = NLLFitter(sig_model)
    # NOTE(review): unlike the bg fit this call leaves calculate_corr at its
    # default -- confirm whether that asymmetry is intentional.
    sig_result = sig_fitter.fit(data_sig)

    n_bg = len(data_bg)
    be_bg = bayesian_blocks(data_bg, p0=0.02)
    be_sig = bayesian_blocks(data_sig, p0=0.02)
    return bg_result, sig_result, n_bg, n_sig, be_bg, be_sig
def generate_initial_params(data_bg_mul2, data_bg_mul8, seed=5):
    """Fit the bg pdf to the multiplicity-2 sample, then sample a same-size
    MC set from the fitted shape to derive bayesian-blocks edges for the
    multiplicity-8 spectrum.  Returns (bg_result, n_bg, be_bg)."""
    # Unbinned ML fit of the background shape to the mul-2 data.
    model = ff.Model(bg_pdf, ['alpha', 'beta', 'gamma'])
    model.set_bounds([(1e-20, 20), (-10, -1e-20), (1e-20, 10)])
    fitter = ff.NLLFitter(model, data_bg_mul2)
    # NOTE(review): the alpha seed (-1.80808e+01) lies outside its declared
    # (1e-20, 20) bound -- confirm whether the sign is intentional.
    bg_result = fitter.fit([-1.80808e+01, -8.21174e-02, 8.06289e-01])

    n_bg = len(data_bg_mul8)
    gRandom.SetSeed(seed)

    # Wrap the fitted pdf in a ROOT TF1 so GetRandom() can sample from it
    # over the [2800, 13000] fit range with 3 free parameters.
    bg_pdf_ROOT = functools.partial(bg_pdf, doROOT=True)
    tf1_bg_pdf = TF1("tf1_bg_pdf", bg_pdf_ROOT, 2800, 13000, 3)
    tf1_bg_pdf.SetParameters(*bg_result.x)
    mc_bg = [tf1_bg_pdf.GetRandom() for _ in range(n_bg)]

    # Bayesian-blocks edges, pinned to the full fit range; the last
    # data-driven edge is nudged up so no event sits exactly on it.
    be_bg = bayesian_blocks(mc_bg, p0=0.02)
    be_bg[-1] += 0.1
    be_bg = np.append(be_bg, [13000])
    be_bg[0] = 2800
    return bg_result, n_bg, be_bg
def generate_initial_params(data_bg_mul2, data_bg_mul8, seed=5):
    """Fit the bg pdf (lmfit-style Parameters API) to the multiplicity-2
    sample, then sample an MC set of len(data_bg_mul8) events from the fitted
    shape and return (bg_result, n_bg, be_bg) with bayesian-blocks edges."""
    # Parameters tuples are (name, value, vary, min, max, expr, brute_step).
    # NOTE(review): alpha's starting value (-1.80808e+01) is outside its
    # (1e-20, 20) bounds -- confirm whether the sign is intentional.
    fit_params = Parameters()
    fit_params.add_many(
        ('alpha', -1.80808e+01, True, 1e-20, 20, None, None),
        ('beta', -8.21174e-02, True, -10, -1e-20, None, None),
        ('gamma', 8.06289e-01, True, 1e-20, 10, None, None)
    )
    fitter = NLLFitter(Model(bg_pdf, fit_params))
    bg_result = fitter.fit(data_bg_mul2, calculate_corr=False)

    n_bg = len(data_bg_mul8)
    gRandom.SetSeed(seed)

    # Set up bg sampling: wrap the fitted pdf in a ROOT TF1 so GetRandom()
    # can draw from it over the [2800, 13000] range (3 parameters).
    bg_pdf_ROOT = functools.partial(bg_pdf, doROOT=True)
    tf1_bg_pdf = TF1("tf1_bg_pdf", bg_pdf_ROOT, 2800, 13000, 3)
    tf1_bg_pdf.SetParameters(*bg_result.x)
    mc_bg = [tf1_bg_pdf.GetRandom() for _ in range(n_bg)]

    # Bayesian-blocks edges, pinned to the full fit range; the last
    # data-driven edge is nudged up so no event sits exactly on it.
    be_bg = bayesian_blocks(mc_bg, p0=0.02)
    be_bg[-1] += 0.1
    be_bg = np.append(be_bg, [13000])
    be_bg[0] = 2800
    return bg_result, n_bg, be_bg
def generateToy():
    """Throw toy events from a linear pdf and overlay a fixed 10-bin
    histogram with a bayesian-blocks binning of the same sample."""
    plt.close('all')

    def poly1(x):
        # Normalized linear pdf on [0, 10): integral of 2x/100 over the range is 1.
        return 2 * x / 100

    nentries = 100
    p0 = 0.01
    x = np.arange(0.0, 10, 0.1)
    # Seed both RNGs for reproducibility (numpy for any downstream use,
    # ROOT for TF1.GetRandom).
    np.random.seed(12345)
    ROOT.gRandom.SetSeed(8675309)

    # Sample nentries values from the (unnormalized) linear shape via ROOT.
    poly1_gen = TF1("poly1", "2*x", 0, 10)
    my_rands = [poly1_gen.GetRandom() for _ in range(nentries)]

    plt.figure()
    hist(my_rands, bins=10, histtype='stepfilled', alpha=0.2, label='10 bins', normed=True)
    bb_edges = bayesian_blocks(my_rands, p0=p0)
    # Fix: this histogram uses the bayesian-blocks edges, so label it as such
    # (the original copy-pasted the '10 bins' label, making the legend wrong).
    hist(my_rands, bins=bb_edges, histtype='stepfilled', alpha=0.2, label='bayesian blocks', normed=True)
    plt.plot(x, poly1(x), 'k')
    plt.show()
def generateToy():
    """Throw toy events from a linear pdf and overlay a fixed 10-bin
    histogram with a bayesian-blocks binning of the same sample."""
    plt.close('all')

    def poly1(x):
        # Normalized linear pdf on [0, 10): integral of 2x/100 over the range is 1.
        return 2 * x / 100

    nentries = 100
    p0 = 0.01
    x = np.arange(0.0, 10, 0.1)
    # Seed both RNGs for reproducibility (numpy for any downstream use,
    # ROOT for TF1.GetRandom).
    np.random.seed(12345)
    ROOT.gRandom.SetSeed(8675309)

    # Sample nentries values from the (unnormalized) linear shape via ROOT.
    # Fix: range instead of xrange -- identical here and works on both
    # Python 2 and the Python 3 used elsewhere in this file.
    poly1_gen = TF1("poly1", "2*x", 0, 10)
    my_rands = [poly1_gen.GetRandom() for _ in range(nentries)]

    plt.figure()
    hist(my_rands, bins=10, histtype='stepfilled', alpha=0.2, label='10 bins', normed=True)
    bb_edges = bayesian_blocks(my_rands, p0=p0)
    # Fix: this histogram uses the bayesian-blocks edges, so label it as such
    # (the original copy-pasted the '10 bins' label, making the legend wrong).
    hist(my_rands, bins=bb_edges, histtype='stepfilled', alpha=0.2, label='bayesian blocks', normed=True)
    plt.plot(x, poly1(x), 'k')
    plt.show()
def do_bh_analysis():
    """Compare data vs background-estimate ST distributions for black-hole
    search multiplicities, using bayesian-blocks and fixed binnings, and
    produce ratio plots.  Returns a dict of {ST multiplicity: bb edges}
    (or, in 'signal_search' mode, returns early -- see NOTE below).
    """
    #set up variables
    plt.close('all')
    normed = True   # NOTE(review): unused in this function
    log = True      # NOTE(review): unused in this function
    STs = [2, 3, 4, 5, 6, 7, 8, 9, 10]
    # The per-multiplicity thresholds are immediately overridden by a flat
    # 2500 GeV cut on the next line (the first assignment is dead).
    ST_low = [2300, 2300, 2300, 2600, 2600, 2600, 2800, 2800, 2900]
    ST_low = [2500] * 9
    ST_low_dict = dict(list(zip(STs, ST_low)))
    samples = 5000
    seed = 2
    p0 = 0.005  # bayesian-blocks false-positive rate
    bg_est = 'data_driven'  #'data_driven','mc','low_ST'
    mode = 'signal_search'  #'no_signal','signal_search','signal_inj','signal_search_inj'
    if mode not in [
            'no_signal', 'signal_search', 'signal_inj', 'signal_search_inj'
    ]:
        raise KeyError('mode is not allowed!')
    if bg_est not in ['data_driven', 'mc', 'low_ST']:
        raise KeyError('bg_est is not allowed!')
    if mode in ['signal_search', 'signal_inj', 'signal_search_inj']:
        signal_num = 10
    else:
        signal_num = 0
    # Load pickled trees for MC, signal, and data (paths relative to repo).
    df_mc = pkl.load(open('../../files/BH/BHTree_mc.p', 'rb'))
    df_signal = pkl.load(open('../../files/BH/BHTree_signal.p', 'rb'))
    df_data = pkl.load(open('../../files/BH/BHTree_data.p', 'rb'))
    weights = df_mc.weightTree.unique()  #[0.27436519,0.0401976,0.01657276]
    # Split the MC into one dataframe per generator weight.
    df_mc_list = []
    for weight in weights:
        df_mc_list.append(df_mc[np.isclose(df_mc.weightTree, weight)])
    all_edges = {}
    #for ST in range(2,11):
    for ST in [8]:
        # Data events above the ST threshold for this multiplicity.
        my_ST_data = df_data[df_data['ST_mul' + str(ST)] > ST_low_dict[ST]][
            'ST_mul' + str(ST)].values
        nentries = len(my_ST_data)
        my_ST_mc = []
        if bg_est == 'low_ST':
            # Background estimated from low-multiplicity (n==2) data.
            my_ST_mc = df_data[df_data['ST_mul2'] > ST_low_dict[ST]][
                df_data['n_multiplicity'] == 2]['ST_mul2'].values
        else:
            df_mc_st_list = [
                df[df['ST_mul' + str(ST)] > ST_low_dict[ST]]['ST_mul' + str(ST)]
                for df in df_mc_list
            ]
            if mode in ['signal_search', 'signal_inj', 'signal_search_inj']:
                my_ST_signal = df_signal[
                    df_signal['ST_mul' + str(ST)] > ST_low_dict[ST]]['ST_mul' +
                                                                     str(ST)]
            # NOTE: rebinds the outer `samples` with the per-weight totals.
            samples, rel_weights = find_sample_number(df_mc_st_list, weights)
            for i, mc in enumerate(df_mc_st_list):
                if samples * rel_weights[i] == 0:
                    continue
                my_ST_mc = np.append(
                    my_ST_mc,
                    mc.sample(int(samples * rel_weights[i]),
                              random_state=seed).values)
        print('ST_mult', ST)
        print(' n_data', nentries)
        print(' n_mc', len(my_ST_mc))
        #get the edges from bb, and the normalized bin values (integral of all hists is 1)
        #if signal and inject:
        #    my_ST_data = np.append(my_ST_data,my_ST_signal.
        if mode in ['signal_inj', 'signal_search_inj']:
            my_ST_data = np.append(
                my_ST_data,
                my_ST_signal.sample(signal_num, random_state=seed).values)
            nentries += signal_num
        elif mode in ['signal_search']:
            my_ST_signal = my_ST_signal.values
            # NOTE(review): early return here makes all the plotting below
            # unreachable when mode == 'signal_search' (the current default)
            # -- looks like a debugging leftover; confirm before relying on
            # the plots in this mode.
            return my_ST_data, my_ST_mc, my_ST_signal
        print(len(my_ST_data))
        # Density histograms on bayesian-blocks and fixed 20-bin edges.
        normed_counts_data, bb_edges = np.histogram(my_ST_data,
                                                    bayesian_blocks(my_ST_data,
                                                                    p0=p0),
                                                    density=True)
        normed_counts_data_nobb, nobb_edges = np.histogram(my_ST_data, 20,
                                                           density=True)
        normed_counts_mc, _ = np.histogram(my_ST_mc, bb_edges, density=True)
        normed_counts_mc_nobb, _ = np.histogram(my_ST_mc, nobb_edges,
                                                density=True)
        if mode in ['signal_search', 'signal_search_inj']:
            normed_counts_signal, _ = np.histogram(my_ST_signal, bb_edges,
                                                   density=True)
            normed_counts_signal_nobb, _ = np.histogram(my_ST_signal,
                                                        nobb_edges,
                                                        density=True)
        #rescale the values so that the integral of the data hist is = num of entries
        rescaled_counts_data = normed_counts_data * nentries
        rescaled_counts_data_nobb = normed_counts_data_nobb * nentries
        if mode in ['signal_search', 'signal_search_inj']:
            rescaled_counts_mc = normed_counts_mc * (nentries - signal_num)
            rescaled_counts_mc_nobb = normed_counts_mc_nobb * (nentries -
                                                               signal_num)
            rescaled_counts_signal = normed_counts_signal * signal_num
            rescaled_counts_signal_nobb = normed_counts_signal_nobb * signal_num
        else:
            rescaled_counts_mc = normed_counts_mc * (nentries)
            rescaled_counts_mc_nobb = normed_counts_mc_nobb * (nentries)
        #properly calculate the error bars on the data
        counts_data, _ = np.histogram(my_ST_data, bb_edges)
        counts_data_nobb, _ = np.histogram(my_ST_data, nobb_edges)
        # Poisson errors scaled by bin width to match the density plots.
        rescaled_err = np.sqrt(counts_data) / (bb_edges[1:] - bb_edges[:-1])
        rescaled_err_nobb = np.sqrt(counts_data_nobb) / (nobb_edges[1:] -
                                                         nobb_edges[:-1])
        err = np.sqrt(counts_data)
        #properly account for the BG error for ratio plot
        counts_bg, _ = np.histogram(my_ST_mc, bb_edges)
        counts_bg_nobb, _ = np.histogram(my_ST_mc, nobb_edges)
        rescaled_err_bg = np.sqrt(counts_bg) / (bb_edges[1:] - bb_edges[:-1])
        rescaled_err_bg_nobb = np.sqrt(counts_bg_nobb) / (nobb_edges[1:] -
                                                          nobb_edges[:-1])
        if mode in ['signal_search', 'signal_search_inj']:
            make_hist_ratio_blackhole(bb_edges, rescaled_counts_data,
                                      rescaled_counts_mc, rescaled_err,
                                      str(ST), suffix=None, bg_est=bg_est,
                                      signal=rescaled_counts_signal, mode=mode)
            make_hist_ratio_blackhole2(nobb_edges, rescaled_counts_data_nobb,
                                       rescaled_counts_mc_nobb,
                                       rescaled_err_nobb, str(ST),
                                       suffix='nobb', bg_est=bg_est,
                                       signal=rescaled_counts_signal_nobb,
                                       mode=mode)
        else:
            make_hist_ratio_blackhole(bb_edges, rescaled_counts_data,
                                      rescaled_counts_mc, rescaled_err,
                                      str(ST), suffix=None, bg_est=bg_est,
                                      mode=mode)
            make_hist_ratio_blackhole(nobb_edges, rescaled_counts_data_nobb,
                                      rescaled_counts_mc_nobb,
                                      rescaled_err_nobb, str(ST),
                                      suffix='nobb', bg_est=bg_est, mode=mode)
        plt.show()
        all_edges[ST] = bb_edges
    for key in all_edges:
        print('ST' + str(key), all_edges[key])
    return all_edges
# Per-signal-model accumulators for MLE results, one empty list per signal
# parameter set (sig_params is defined elsewhere in this script/notebook).
binned_A_100_mle = [[] for i in range(len(sig_params))]
binned_A_200_mle = [[] for i in range(len(sig_params))]
binned_A_400_mle = [[] for i in range(len(sig_params))]
binned_A_1000_mle = [[] for i in range(len(sig_params))]
binned_A_2000_mle = [[] for i in range(len(sig_params))]
cnc_A_mle = [[] for i in range(len(sig_params))]

# Wrap the signal pdf for ROOT so TF1.GetRandom() can sample it over
# [2800, 13000] with 2 parameters.
sig_pdf_ROOT = functools.partial(sig_pdf, doROOT=True)
tf1_sig_pdf = TF1("tf1_sig_pdf", sig_pdf_ROOT, 2800, 13000, 2)

for i, sig_p in enumerate(tqdm_notebook(sig_params, desc='Signal Model')):
    # Sample as many signal events as there are background events
    # (n_bg comes from earlier in the script -- TODO confirm).
    n_sig = n_bg
    tf1_sig_pdf.SetParameters(*sig_p)
    mc_sig = [tf1_sig_pdf.GetRandom() for ns in range(n_sig)]
    be_sig = bayesian_blocks(mc_sig, p0=0.02)
    # True (expected) signal bin contents on each pre-computed binning
    # (be_bg and the be_*GeV edges are defined elsewhere in this script).
    true_sig_bc_bb = get_true_bin_content(be_bg, sig_pdf, sig_p)
    true_sig_bc_50GeV = get_true_bin_content(be_50GeV, sig_pdf, sig_p)
    true_sig_bc_100GeV = get_true_bin_content(be_100GeV, sig_pdf, sig_p)
    true_sig_bc_200GeV = get_true_bin_content(be_200GeV, sig_pdf, sig_p)
    true_sig_bc_400GeV = get_true_bin_content(be_400GeV, sig_pdf, sig_p)
    true_sig_bc_1000GeV = get_true_bin_content(be_1000GeV, sig_pdf, sig_p)
    true_sig_bc_2000GeV = get_true_bin_content(be_2000GeV, sig_pdf, sig_p)
    # Hybrid binning: union of the bg and signal bayesian-blocks edges.
    be_hybrid = np.sort(np.unique(np.concatenate([be_bg, be_sig])))
    true_bg_bc_bb_hybrid = get_true_bin_content(be_hybrid, bg_pdf, bg_result.x)
    true_sig_bc_bb_hybrid = get_true_bin_content(be_hybrid, sig_pdf, sig_p)
def do_bh_analysis():
    """Python-2 variant (print statements) of the black-hole ST analysis:
    compare data vs background-estimate ST distributions using
    bayesian-blocks and fixed binnings, and produce ratio plots.  Returns a
    dict of {ST multiplicity: bb edges} (or returns early in
    'signal_search' mode -- see NOTE below)."""
    #set up variables
    plt.close('all')
    normed = True  # NOTE(review): unused in this function
    log = True     # NOTE(review): unused in this function
    STs = [2,3,4,5,6,7,8,9,10]
    # The per-multiplicity thresholds are immediately overridden by a flat
    # 2500 GeV cut on the next line (the first assignment is dead).
    ST_low = [2300,2300,2300,2600,2600,2600,2800,2800,2900]
    ST_low = [2500]*9
    ST_low_dict = dict(zip(STs,ST_low))
    samples = 5000
    seed = 2
    p0=0.005  # bayesian-blocks false-positive rate
    bg_est = 'data_driven' #'data_driven','mc','low_ST'
    mode = 'signal_search' #'no_signal','signal_search','signal_inj','signal_search_inj'
    if mode not in ['no_signal','signal_search','signal_inj','signal_search_inj']:
        raise KeyError('mode is not allowed!')
    if bg_est not in ['data_driven','mc','low_ST']:
        raise KeyError('bg_est is not allowed!')
    if mode in ['signal_search','signal_inj','signal_search_inj']:
        signal_num = 10
    else:
        signal_num = 0
    # Load pickled trees for MC, signal, and data (paths relative to repo).
    df_mc = pkl.load(open('../../files/BH/BHTree_mc.p','rb'))
    df_signal = pkl.load(open('../../files/BH/BHTree_signal.p','rb'))
    df_data = pkl.load(open('../../files/BH/BHTree_data.p','rb'))
    weights = df_mc.weightTree.unique()#[0.27436519,0.0401976,0.01657276]
    # Split the MC into one dataframe per generator weight.
    df_mc_list = []
    for weight in weights:
        df_mc_list.append(df_mc[np.isclose(df_mc.weightTree,weight)])
    all_edges = {}
    #for ST in range(2,11):
    for ST in [8]:
        # Data events above the ST threshold for this multiplicity.
        my_ST_data = df_data[df_data['ST_mul'+str(ST)]>ST_low_dict[ST]]['ST_mul'+str(ST)].values
        nentries = len(my_ST_data)
        my_ST_mc = []
        if bg_est == 'low_ST':
            # Background estimated from low-multiplicity (n==2) data.
            my_ST_mc = df_data[df_data['ST_mul2']>ST_low_dict[ST]][df_data['n_multiplicity']==2]['ST_mul2'].values
        else:
            df_mc_st_list = [df[df['ST_mul'+str(ST)]>ST_low_dict[ST]]['ST_mul'+str(ST)] for df in df_mc_list]
            if mode in ['signal_search','signal_inj','signal_search_inj']:
                my_ST_signal = df_signal[df_signal['ST_mul'+str(ST)]>ST_low_dict[ST]]['ST_mul'+str(ST)]
            # NOTE: rebinds the outer `samples` with the per-weight totals.
            samples,rel_weights = find_sample_number(df_mc_st_list,weights)
            for i,mc in enumerate(df_mc_st_list):
                if samples*rel_weights[i]==0: continue
                my_ST_mc = np.append(my_ST_mc, mc.sample(int(samples*rel_weights[i]),random_state=seed).values)
        print 'ST_mult',ST
        print ' n_data',nentries
        print ' n_mc',len(my_ST_mc)
        #get the edges from bb, and the normalized bin values (integral of all hists is 1)
        #if signal and inject:
        #    my_ST_data = np.append(my_ST_data,my_ST_signal.
        if mode in ['signal_inj','signal_search_inj']:
            my_ST_data = np.append(my_ST_data, my_ST_signal.sample(signal_num,random_state=seed).values)
            nentries+=signal_num
        elif mode in ['signal_search']:
            my_ST_signal = my_ST_signal.values
            # NOTE(review): early return here makes all the plotting below
            # unreachable when mode == 'signal_search' (the current default)
            # -- looks like a debugging leftover; confirm before relying on
            # the plots in this mode.
            return my_ST_data, my_ST_mc, my_ST_signal
        print len(my_ST_data)
        # Density histograms on bayesian-blocks and fixed 20-bin edges.
        normed_counts_data, bb_edges = np.histogram(my_ST_data,bayesian_blocks(my_ST_data,p0=p0), density=True)
        normed_counts_data_nobb, nobb_edges = np.histogram(my_ST_data,20, density=True)
        normed_counts_mc, _= np.histogram(my_ST_mc,bb_edges, density=True)
        normed_counts_mc_nobb, _= np.histogram(my_ST_mc,nobb_edges, density=True)
        if mode in ['signal_search','signal_search_inj']:
            normed_counts_signal, _= np.histogram(my_ST_signal,bb_edges, density=True)
            normed_counts_signal_nobb, _= np.histogram(my_ST_signal,nobb_edges, density=True)
        #rescale the values so that the integral of the data hist is = num of entries
        rescaled_counts_data = normed_counts_data*nentries
        rescaled_counts_data_nobb = normed_counts_data_nobb*nentries
        if mode in ['signal_search','signal_search_inj']:
            rescaled_counts_mc = normed_counts_mc*(nentries-signal_num)
            rescaled_counts_mc_nobb = normed_counts_mc_nobb*(nentries-signal_num)
            rescaled_counts_signal = normed_counts_signal*signal_num
            rescaled_counts_signal_nobb = normed_counts_signal_nobb*signal_num
        else:
            rescaled_counts_mc = normed_counts_mc*(nentries)
            rescaled_counts_mc_nobb = normed_counts_mc_nobb*(nentries)
        #properly calculate the error bars on the data
        counts_data, _= np.histogram(my_ST_data,bb_edges)
        counts_data_nobb, _= np.histogram(my_ST_data,nobb_edges)
        # Poisson errors scaled by bin width to match the density plots.
        rescaled_err = np.sqrt(counts_data)/(bb_edges[1:]-bb_edges[:-1])
        rescaled_err_nobb = np.sqrt(counts_data_nobb)/(nobb_edges[1:]-nobb_edges[:-1])
        err = np.sqrt(counts_data)
        #properly account for the BG error for ratio plot
        counts_bg, _= np.histogram(my_ST_mc,bb_edges)
        counts_bg_nobb, _= np.histogram(my_ST_mc,nobb_edges)
        rescaled_err_bg = np.sqrt(counts_bg)/(bb_edges[1:]-bb_edges[:-1])
        rescaled_err_bg_nobb = np.sqrt(counts_bg_nobb)/(nobb_edges[1:]-nobb_edges[:-1])
        if mode in ['signal_search','signal_search_inj']:
            make_hist_ratio_blackhole(bb_edges, rescaled_counts_data, rescaled_counts_mc, rescaled_err, str(ST), suffix = None, bg_est=bg_est, signal = rescaled_counts_signal, mode = mode)
            make_hist_ratio_blackhole2(nobb_edges, rescaled_counts_data_nobb, rescaled_counts_mc_nobb, rescaled_err_nobb, str(ST), suffix = 'nobb', bg_est=bg_est, signal = rescaled_counts_signal_nobb, mode=mode)
        else:
            make_hist_ratio_blackhole(bb_edges, rescaled_counts_data, rescaled_counts_mc, rescaled_err, str(ST), suffix = None, bg_est=bg_est, mode=mode)
            make_hist_ratio_blackhole(nobb_edges, rescaled_counts_data_nobb, rescaled_counts_mc_nobb, rescaled_err_nobb, str(ST), suffix = 'nobb', bg_est=bg_est, mode=mode)
        plt.show()
        all_edges[ST]=bb_edges
    for key in all_edges:
        print 'ST'+str(key), all_edges[key]
    return all_edges