def plot_reweighted(reweighted_vars, DATAS_df, MC_df):
    """Plot sWeighted data against raw and reweighted MC, one figure per variable.

    For every column name in ``reweighted_vars`` a new figure is opened and
    three normalised, 100-bin point histograms are drawn with ``histpoints``:

    * ``DATAS_df[col]`` weighted by ``DATAS_df['nSig_sw']`` (black),
    * ``MC_df[col]`` without weights (red),
    * ``MC_df[col]`` weighted by ``MC_df['wghts_tot']`` (green).

    Parameters
    ----------
    reweighted_vars : iterable
        Column names; each must be a column of both frames and a key of the
        module-level ``vars_to_reweight_dict`` (element 0 of the entry is
        used as the x-axis label).
    DATAS_df : DataFrame-like
        Data sample; must provide the ``'nSig_sw'`` column.
    MC_df : DataFrame-like
        Simulation sample; must provide the ``'wghts_tot'`` column.
    """
    import matplotlib.pyplot as plt
    from matplotlib_hep import histpoints

    # Fixed histogram ranges for selected variables; every other column gets
    # ``None``.  A lookup table replaces the former ``if``/``if``/``else``
    # chain, in which the 'B_PT' range was always overwritten by the ``else``
    # branch of the 'nTracks' test.
    fixed_ranges = {
        'B_PT': (0, 50000),
        'nTracks': (0, 600),
    }

    for col in reweighted_vars:
        plt.figure()

        hist_range = fixed_ranges.get(col)
        data_col = DATAS_df[col].values
        mc_col = MC_df[col].values

        # Options shared by all three histograms.
        common = dict(range=hist_range, normed=True, bins=100,
                      markersize=3., marker='o')

        histpoints(data_col, color='black', label='sWeighted DATA',
                   weights=DATAS_df['nSig_sw'], **common)
        histpoints(mc_col, color='red', label='MC', **common)
        histpoints(mc_col, color='green', label='Reweighted MC',
                   weights=MC_df['wghts_tot'], **common)

        plt.xlabel(vars_to_reweight_dict[col][0])
        plt.ylabel('Events')
        plt.legend(prop={'size': 10})
        plt.show()
def _theta_on_off(df, threshold, n_off_positions=5):
    """Return ``(theta_on, theta_off)`` for events with prediction above ``threshold``.

    ``theta_on`` is ``df['Theta']`` restricted to ``df['prediction_on'] > threshold``.
    ``theta_off`` concatenates, for off positions ``1..n_off_positions``,
    ``df['Theta_Off_<i>']`` restricted to ``df['prediction_off_<i>'] > threshold``.
    """
    theta_on = df['Theta'][df['prediction_on'] > threshold]
    # pd.concat replaces the repeated Series.append calls, which newer pandas
    # releases no longer provide.
    theta_off = pd.concat([
        df['Theta_Off_{}'.format(pos)][df['prediction_off_{}'.format(pos)] > threshold]
        for pos in range(1, n_off_positions + 1)
    ])
    return theta_on, theta_off


def main():
    """Draw the on/off theta distribution with an info box and save it.

    Command-line arguments are parsed with ``docopt`` from the module
    docstring.  The table ``--tablename`` is read from ``<datafile>``,
    optionally restricted to nights between ``--first`` and ``--last``, and
    the theta columns are passed through ``theta_mm_to_theta_squared_deg``.

    A grid scan over prediction thresholds and theta cuts is run and the
    combination with the highest ``li_ma_significance`` is logged.  The plot
    itself uses the cuts given on the command line (``--threshold`` and
    ``--theta2-cut``), not the scan result.  The figure is written to
    ``<outputfile>``.
    """
    logger = logging.getLogger(__name__)
    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG,
    )
    logger.debug(matplotlib.matplotlib_fname())

    args = docopt(__doc__)
    path = args["<datafile>"]
    out = args["<outputfile>"]
    tablename = args["--tablename"]
    n_bins = int(args["--bins"])
    title = args["--title"]
    first_night = args["--first"]
    last_night = args["--last"]
    theta_cut = float(args['--theta2-cut'])
    prediction_threshold = float(args['--threshold'])

    alpha = 0.2

    df = pd.read_hdf(path, key=tablename)
    df.columns = [c.replace(':', '_') for c in df.columns]

    if first_night:
        df = df.query('(NIGHT >= {})'.format(first_night)).copy()
        logger.info('Using only Data begining with night {}'.format(first_night))

    if last_night:
        df = df.query('(NIGHT <= {})'.format(last_night)).copy()
        logger.info('Using only Data until night {}'.format(last_night))

    night_stats = df.NIGHT.describe()
    actual_first_night = night2datetime(int(night_stats['min']))
    actual_last_night = night2datetime(int(night_stats['max']))

    period = 'Period: {:%Y-%m-%d} to {:%Y-%m-%d}'.format(actual_first_night, actual_last_night)
    logger.debug('Using Nights from {}'.format(period))

    theta_keys = ["Theta"] + ['Theta_Off_{}'.format(off_position) for off_position in range(1, 6)]
    df[theta_keys] = df[theta_keys].apply(theta_mm_to_theta_squared_deg, axis=0)

    # Grid scan for the most significant cut combination.  The result is only
    # reported in the log; the plot below uses the command-line cuts.
    best_significance = 0
    best_prediction_threshold = 0
    best_theta_cut = 0
    for threshold in np.linspace(0.5, 1, 20):
        t_on, t_off = _theta_on_off(df, threshold)
        for t_cut in np.linspace(0.5, 0.001, 50):
            n_on = len(t_on[t_on < t_cut])
            n_off = len(t_off[t_off < t_cut])
            significance = li_ma_significance(n_on, n_off, alpha=alpha)
            if significance > best_significance:
                best_theta_cut = t_cut
                best_significance = significance
                best_prediction_threshold = threshold

    logger.info('Maximum Significance {}, for Theta Cut at {} and confidence cut at {}'.format(best_significance, best_theta_cut, best_prediction_threshold))

    # Event selection with the cuts chosen on the command line.
    theta_on, theta_off = _theta_on_off(df, prediction_threshold)

    n_on = len(theta_on[theta_on < theta_cut])
    n_off = len(theta_off[theta_off < theta_cut])
    logger.info('N_on = {}, N_off = {}'.format(n_on, n_off))

    excess_events = n_on - alpha * n_off
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    logger.info(
        'Chosen cuts for prediction threshold {} has signifcance: {} with a theta sqare cut of {}.'.format(
            prediction_threshold, significance, theta_cut
        ))

    theta_max = 0.3
    # ``--bins`` is handed to np.linspace as the number of edge points.
    bin_edges = np.linspace(0, theta_max, n_bins)

    # Geometry of the info box, in axes coordinates; it sits on top of the axes.
    info_left = 0.0
    info_height = 0.3
    info_width = 1.0
    info_bottom = 1.
    info_top = info_bottom + info_height
    info_right = info_left + info_width
    info_center_x = 0.5 * (info_left + info_right)
    # Shrink the axes so the box above them stays inside the figure.
    plot_height = 1. - info_height

    fig = plt.figure()
    fig.subplots_adjust(top=plot_height)
    ax = fig.gca()

    # Plot the theta distributions; the off sample is scaled by alpha.
    histpoints(theta_on, bins=bin_edges, xerr='binwidth', label='On', fmt='none', ecolor='b', capsize=0)
    back_x, back_y, back_norm = histpoints(theta_off, bins=bin_edges, xerr='binwidth', label='Off', fmt='none', ecolor='r', capsize=0, scale=alpha, yerr='sqrt')

    # Fill the area underneath the background points.
    ax.fill_between(back_x, back_y[1], 0, facecolor='grey', alpha=0.2, linewidth=0.0)

    # Mark the theta cut with a dashed vertical line.
    ax.axvline(x=theta_cut, linewidth=1, color='k', linestyle='dashed')

    # Draw the info box.
    box = patches.Rectangle(
        (info_left, info_bottom), info_width, info_height,
        fill=True, transform=ax.transAxes, clip_on=False,
        facecolor='0.9', edgecolor='black')
    ax.add_patch(box)

    info_text = period + ',\n'
    info_text += 'Significance: {:.2f}, Alpha: {:.2f}\n'.format(significance, alpha)
    info_text += 'Confidence Cut: {:.2f}, Theta Sqare Cut: {:.2f} \n'.format(prediction_threshold, theta_cut)
    info_text += '{:.2f} excess events, {:.2f} background events \n'.format(excess_events, n_off)

    ax.text(info_center_x, 0.5 * (info_top + info_bottom) - 0.05, info_text,
            horizontalalignment='center', verticalalignment='center',
            fontsize=10, transform=ax.transAxes)

    ax.text(info_center_x, info_top, title,
            bbox={'facecolor': 'white', 'pad': 10},
            horizontalalignment='center', verticalalignment='center',
            fontsize=14, color='black', transform=ax.transAxes)

    # NOTE(review): the label says mm^2 although the theta columns went through
    # theta_mm_to_theta_squared_deg above -- confirm the intended unit.
    plt.xlabel("Theta^2 / mm^2")
    plt.ylabel("Counts")
    plt.legend(fontsize=12)
    plt.savefig(out)
def main(outputfile, datatupels, ignorekeys, cuts, default_cuts):
    '''Plot Data MonteCarlo comparison plots from HDF5 files'''
    # NOTE(review): the docstring is left untouched in case it is consumed as
    # command-line help text by a decorator outside this view -- confirm.
    #
    # outputfile   -- name of the multi-page PDF; its basename is written into
    #                 the directory returned by mkDirAtDestination(outputfile),
    #                 together with one "<key>.png" per plotted column.
    # datatupels   -- forwarded to loadData() together with the merged cuts.
    # ignorekeys   -- optional collection of column names to leave out (None
    #                 disables the filter).
    # cuts, default_cuts -- combined by aggregatePlottingCuts().
    #
    # Per-column options come from the module-level ``default_plots`` and
    # ``default_plot_option``; neither of them is modified here.
    cuts = aggregatePlottingCuts(cuts, default_cuts)
    df_list, datafiles, scales, labels, common_keys = loadData(datatupels, cuts)

    if ignorekeys is not None:
        common_keys = set(common_keys).difference(ignorekeys)
        for key in ignorekeys:
            logger.info("skipping column{}: on ignore list".format(key))

    picturePath = mkDirAtDestination(outputfile)

    with PdfPages(os.path.join(picturePath, os.path.basename(outputfile))) as pdf:
        logger.info("\nList of Keys:")
        for key in common_keys:
            logger.info(key)

            # Skip columns whose cells are sequences; they cannot be histogrammed.
            if isinstance(df_list[0][key].iloc[0], (list, tuple)):
                logger.info("skipping column {}: cannot interprete content".format(key))
                continue

            fig = plt.figure()
            plt.title(key)

            plot_option = None
            if key in default_plots:
                plot_option = default_plots[key]
                # An explicit False entry disables the plot for this column.
                if plot_option is False:
                    plt.close()
                    continue

            data_range = calcDataRange(df_list, key)
            gc.collect()

            logger.info(default_plot_option)
            xlabel = key
            func = None
            xUnit = ""

            if plot_option is None:
                # Work on a copy: "bins" is overwritten below, and writing into
                # the shared defaults would leak this column's bin edges into
                # every following column.
                plot_option = dict(default_plot_option)
            else:
                func = plot_option["func"]
                xUnit = plot_option["xUnit"]
                xlabel += " / " + xUnit
                if func and func.__name__ and "lambda" not in func.__name__:
                    func_name = str(func.__name__)
                    logger.info("Function: {}({})".format(func_name, key))
                    xlabel = func_name + "({})".format(xlabel)

                # Drop the non-plotting keys from a copy instead of deleting
                # them from the shared default_plots entry.
                overrides = {
                    name: value for name, value in plot_option.items()
                    if name not in ("func", "xUnit")
                }
                plot_option = dict(merge_dicts(default_plot_option, overrides))

            # Turn the bin count into explicit edges spanning either the
            # configured range or, when that is None, the range of the data.
            if "bins" in plot_option and "range" in plot_option:
                bin_range = plot_option["range"]
                if bin_range is None:
                    bin_range = data_range
                try:
                    plot_option["bins"] = np.linspace(*bin_range, plot_option["bins"])
                except Exception:
                    # Best effort: keep the configured "bins" and carry on.
                    logger.exception("Could not compute bin edges for {}".format(key))

            for df, scale, label, c in zip(df_list, scales, labels, color_cycle()):
                data = df[key]
                if func:
                    data = func(data)
                try:
                    ax = fig.gca()
                    ax.grid(True)
                    x, y, norm = histpoints(data.values, xerr='binwidth', yerr="sqrt", label=label,
                                            fmt='none', capsize=0, normed=scale,
                                            ecolor=c["color"], **plot_option)
                    ax.fill_between(x, y[1], 0, alpha=0.2, linewidth=0.01, step='mid', facecolor=c["color"])
                    if plot_option.get("log"):
                        ax.set_yscale("log", nonposy='clip')
                    if "range" in plot_option:
                        ax.set_xlim(plot_option["range"])
                except Exception:
                    # One failing sample must not abort the remaining plots.
                    logger.exception("Plotting failed for {} in file {}".format(key, df["filename"]))

            plt.xlabel(xlabel)
            plt.ylabel("Frequency")
            plt.legend(loc='best')
            plt.savefig(os.path.join(picturePath, key + ".png"))
            pdf.savefig()
            plt.close()

        # The file's metadata is set through the PdfPages object.
        d = pdf.infodict()
        d['Title'] = 'Data MC Comparison plots'
        d['Author'] = u'Jens Buss'
        d['Subject'] = 'Comparison'
        d['Keywords'] = 'Data:{}\nCuts:{}'.format(str(", ".join(datafiles)), str(cuts))
        d['CreationDate'] = datetime.datetime.today()
        d['ModDate'] = datetime.datetime.today()
range_x = (np.min(mc_norm[col]), np.max(DATASNorm[col])) range_y = (np.min(mc_norm[col]), np.max(mc_norm[col])) range_ = () if np.max(mc_norm[col]) > np.max(mc_norm[col]): range_ = range_y print(range_x, range_) elif np.max(mc_norm[col]) > np.max(DATASNorm[col]): range_ = range_x print(range_y, range_) y_data, y_data, norm_data = histpoints(data_col, range=range_, color='black', normed=True, bins=100, label='sWeighted DATA', markersize=3., marker='o', weights=DATASNorm['nSig_sw']) x_uncorr, y_uncorr, norm_uncorr = histpoints(mc_col, range=range_, color='red', normed=True, bins=100, label='MC', markersize=3., marker='o') x_corr, y_corr, norm_corr = histpoints(