def main(data_path, key):
    '''
    Scan prediction thresholds and theta² cuts and print the best combination.

    For every pair of cuts the on/off counts are fed to `li_ma_significance`
    and the pair with the highest significance is printed.

    Parameters:
        data_path: str
            Path handed to `read_h5py`.
        key: str
            Name of the hdf5 group holding the events.
    '''
    events = read_h5py(data_path, key=key, columns=columns)

    # Both scans go from the loosest to the tightest cut, so every step can
    # filter the already reduced selection of the previous step.
    theta2_cuts = np.arange(0.1, 0.0, -0.001)
    prediction_thresholds = np.arange(0.75, 1, 0.001)

    max_significance = 0
    # Stay defined even if no significance ever passes the comparison below
    # (e.g. every value is NaN), so the final prints cannot raise NameError.
    best_threshold = None
    best_theta2_cut = None

    selected = events
    for threshold in tqdm(prediction_thresholds):
        selected = selected.query('gamma_prediction >= {}'.format(threshold))

        theta2_on = selected.theta_deg**2
        theta2_off = pd.concat(
            [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)]
        )**2

        for theta2_cut in theta2_cuts:
            theta2_on = theta2_on[theta2_on <= theta2_cut]
            theta2_off = theta2_off[theta2_off <= theta2_cut]

            n_on = len(theta2_on)
            n_off = len(theta2_off)

            # NOTE(review): 0.2 is presumably alpha = 1/5 for the five off
            # columns concatenated above - confirm.
            sig = li_ma_significance(n_on, n_off, 0.2)
            if sig >= max_significance:
                max_significance = sig
                best_threshold = threshold
                best_theta2_cut = theta2_cut

    print('Threshold:', best_threshold)
    print('θ² cut: ', best_theta2_cut)
    print('Li&Ma :', max_significance)
def calculate_significance(signal_events, background_events, theta_cut, alpha=0.2):
    '''
    Return the Li&Ma significance for the given signal and background events.

    The events, `theta_cut` and `alpha` are forwarded to
    `calculate_n_on_n_off`; the first and third of its four return values are
    passed to `li_ma_significance` together with `alpha`.
    '''
    n_on, _unused_a, n_off, _unused_b = calculate_n_on_n_off(
        signal_events,
        background_events,
        theta_cut,
        alpha=alpha,
    )
    significance = li_ma_significance(n_on, n_off, alpha=alpha)
    return significance
def find_best_prediction_cut(prediction_cuts, signal_events, background_events, angular_resolution, alpha=1, silent=False):
    '''
    Pick the prediction cut with the smallest relative sensitivity.

    Parameters:
        prediction_cuts: iterable
            Candidate thresholds compared against `gamma_prediction_mean`.
        signal_events, background_events: pd.DataFrame
            Event tables, need `gamma_prediction_mean` and
            `gamma_energy_prediction_mean`.
        angular_resolution: callable
            Called with the predicted energies of the selected events, its
            result is used as theta cut.
        alpha: float
            Forwarded to the counting, sensitivity and significance helpers.
        silent: bool
            Disables the progress bar.

    Returns:
        (best_prediction_cut, best_significance, best_relative_sensitivity)
        or three NaNs if the significance is zero for every cut.
    '''
    results = []
    for cut in tqdm(prediction_cuts, disable=silent):
        signal_mask = signal_events.gamma_prediction_mean >= cut
        selected_signal = signal_events[signal_mask]
        background_mask = background_events.gamma_prediction_mean >= cut
        selected_background = background_events[background_mask]

        # NOTE(review): the theta cuts come from the *selected* events, but
        # the counting helpers get the full tables - confirm this is intended.
        signal_theta_cut = angular_resolution(
            selected_signal.gamma_energy_prediction_mean)
        n_signal, n_signal_count = calculate_n_signal(
            signal_events, signal_theta_cut)

        background_theta_cut = angular_resolution(
            selected_background.gamma_energy_prediction_mean)
        n_off, n_off_count, total_bkg_counts = calculate_n_off(
            background_events, background_theta_cut, alpha=alpha)

        n_on = n_signal + alpha * n_off
        n_on_count = n_signal_count + alpha * n_off_count

        relative_sensitivity = find_relative_sensitivity(
            n_signal, n_off, alpha=alpha)
        significance = li_ma_significance(n_on, n_off, alpha=alpha)

        is_valid = check_validity(
            n_signal, n_off, total_bkg_counts=total_bkg_counts, alpha=alpha)
        if not is_valid:
            # invalid cuts can never win the minimisation below
            significance = 0
            relative_sensitivity = np.inf

        results.append([
            relative_sensitivity,
            significance,
            cut,
            n_on_count,
            n_off_count,
        ])

    relative_sensitivities = np.array([row[0] for row in results])
    significances = np.array([row[1] for row in results])

    if np.all(significances == 0):
        return np.nan, np.nan, np.nan

    best_row = results[np.nanargmin(relative_sensitivities)]
    best_relative_sensitivity = best_row[0]
    best_significance = best_row[1]
    best_prediction_cut = best_row[2]
    return best_prediction_cut, best_significance, best_relative_sensitivity
def main(gammas, protons, output):
    '''
    Print weighted on/off sums and the Li&Ma significance per energy bin.

    Parameters:
        gammas: str
            Path to the gamma file, read with keys 'array_events' and 'runs'.
        protons: str
            Path to the proton file, read with keys 'array_events' and 'runs'.
        output: str or None
            If truthy, the current figure is saved there, otherwise it is shown.
    '''
    t_obs = 50 * u.h

    # Keep the paths under their own names: the event tables are bound to
    # `gammas` / `protons` below, and the run tables have to be read from the
    # files, not from the already loaded DataFrames.
    gamma_path = gammas
    proton_path = protons

    gammas = fact.io.read_data(gamma_path, key='array_events')
    gammas = gammas.dropna()
    gamma_runs = fact.io.read_data(gamma_path, key='runs')
    mc_production_gamma = MCSpectrum.from_cta_runs(gamma_runs)

    protons = fact.io.read_data(proton_path, key='array_events')
    protons = protons.dropna()
    proton_runs = fact.io.read_data(proton_path, key='runs')
    mc_production_proton = MCSpectrum.from_cta_runs(proton_runs)

    crab = CrabSpectrum()
    cosmic = CosmicRaySpectrum()

    gammas['weight'] = mc_production_gamma.reweigh_to_other_spectrum(
        crab, gammas.mc_energy.values * u.TeV, t_assumed_obs=t_obs)
    protons['weight'] = mc_production_proton.reweigh_to_other_spectrum(
        cosmic, protons.mc_energy.values * u.TeV, t_assumed_obs=t_obs)

    bin_edges, _, _ = make_energy_bins(gammas.mc_energy.values * u.TeV, bins=20)

    on, off, alpha = coordinates.split_on_off(
        gammas, protons, on_region_radius=0.4 * u.deg)
    print(f'alpha:{alpha}')

    on['energy_bin'] = pd.cut(on.mc_energy, bin_edges)
    off['energy_bin'] = pd.cut(off.mc_energy, bin_edges)

    grouped = zip(on.groupby('energy_bin'), off.groupby('energy_bin'))
    for (_, g_on), (_, g_off) in grouped:
        n_on = g_on.weight.sum()
        n_off = g_off.weight.sum()

        print('----' * 20)
        print(n_on, n_off)
        print(g_on.size, g_off.size)
        # NOTE(review): significance uses alpha=1, not the alpha returned by
        # split_on_off above - confirm this is intended.
        print(li_ma_significance(n_on, n_off, alpha=1))

    if output:
        plt.savefig(output)
    else:
        plt.show()
def model_significance(estimator, data):
    '''
    Return the highest Li&Ma significance a trained model reaches on a dataset.

    The positive-class probability is stored in `data['gamma_prediction']`
    (the passed frame is modified), then 99 thresholds between 0.01 and 0.99
    are scanned with a fixed theta2 cut of 0.03.

    Parameters:
        estimator: sklearn.model
            Trained model providing `predict_proba`.
        data: pd.DataFrame
            Dataset the significance is evaluated on.

    Returns:
        max(significance): float
            Largest significance over all scanned thresholds.
    '''
    feature = load_feature()
    data['gamma_prediction'] = estimator.predict_proba(data[feature])[:, 1]

    def significance_at(threshold):
        gamma_like = data.query('gamma_prediction >' + threshold.astype(str))
        on_data, off_data = split_on_off_source_independent(
            gamma_like, theta2_cut=0.03)
        return li_ma_significance(len(on_data), len(off_data), 0.2)

    thresholds = np.linspace(0.01, 0.99, 99)
    return max(significance_at(threshold) for threshold in thresholds)
def plot_significance(estimator, data, save=True, path='significance.pdf'):
    '''
    Plot the Li&Ma significance as a function of the prediction threshold.

    The positive-class probability is stored in `data['gamma_prediction']`
    (the passed frame is modified), then 99 thresholds between 0.01 and 0.99
    are scanned with a fixed theta2 cut of 0.03.

    Parameters:
        estimator: sklearn.model
            Trained model providing `predict_proba`.
        data: pd.DataFrame
            Dataset the significance is evaluated on.
        save: bool
            If equal to True, title and axis labels are set and the figure
            is written to `path`.
        path: str
            Output file of the figure.
    '''
    feature = load_feature()
    data['gamma_prediction'] = estimator.predict_proba(data[feature])[:, 1]

    thresholds = np.linspace(0.01, 0.99, 99)
    significance = []
    for threshold in thresholds:
        gamma_like = data.query('gamma_prediction >' + threshold.astype(str))
        on_data, off_data = split_on_off_source_independent(
            gamma_like, theta2_cut=0.03)
        n_on, n_off = len(on_data), len(off_data)
        significance.append(li_ma_significance(n_on, n_off, 0.2))

    plt.plot(thresholds, significance)

    if save == True:  # noqa: E712 - kept as an equality check on purpose
        best = round(max(significance), 2)
        plt.title('max(' + str(best) + ')')
        plt.xlabel('threshold')
        plt.ylabel('confidence')
        plt.savefig(path)
def main(predictions, threshold, theta_cut, net):
    '''
    Show a theta² plot for events above a prediction threshold.

    Parameters:
        predictions: str
            Path handed to `fio.read_data`, read with key 'events'.
        threshold: float
            Events with a prediction above this value are kept.
        theta_cut: float
            Cut on theta used for the on/off counts, drawn at `theta_cut**2`.
        net: bool
            If truthy use the 'predictions_convnet' column, otherwise
            'gamma_prediction'.
    '''
    alpha = 0.2
    n_bins = 40
    theta2_range = [0, 0.3]

    df = fio.read_data(predictions, key='events')
    print(df.columns)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    if net:
        print('using cnn predictions')
        prediction_column = 'predictions_convnet'
        title = 'Neural Net predictions'
    else:
        print('using standard predictions')
        prediction_column = 'gamma_prediction'
        title = 'RF predictions'

    selected = df.query('{} > {}'.format(prediction_column, threshold))
    ax.set_title(title)

    theta_on = selected.theta_deg
    off_columns = ['theta_deg_off_{}'.format(i) for i in range(1, 6)]
    theta_off = pd.concat([selected[column] for column in off_columns])

    theta2_on = (theta_on ** 2).values
    theta2_off = (theta_off ** 2).values

    h_on, bin_edges = np.histogram(theta2_on, bins=n_bins, range=theta2_range)
    # the off histogram reuses the edges of the on histogram
    h_off, bin_edges, _ = ax.hist(
        theta2_off,
        bins=bin_edges,
        range=theta2_range,
        weights=np.full(len(theta_off), alpha),
        histtype='stepfilled',
        color='lightgray',
    )

    bin_width = np.diff(bin_edges)
    bin_center = bin_edges[1:] - bin_width * 0.5
    half_width = bin_width / 2

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on) / 2,
        xerr=half_width,
        linestyle='',
        label='On',
    )
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off) / 2,
        xerr=half_width,
        linestyle='',
        label='Off',
        color='darkgray',
    )

    ax.axvline(theta_cut**2, color='black', alpha=0.3, linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    print('N_on', n_on)
    print('N_off', n_off)
    print('Li&Ma: {}'.format(significance))

    stats_text = stats_box_template.format(
        n_on=n_on,
        n_off=n_off,
        alpha=alpha,
        n_excess=n_on - alpha * n_off,
        n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
        significance=significance,
    )
    ax.text(0.5, 0.95, stats_text, transform=ax.transAxes, va='top', ha='center')

    ax.set_xlim(*theta2_range)
    ax.legend(loc='lower right')
    fig.tight_layout(pad=0)

    plt.show()
def main(data_path, threshold, theta2_cut, key, bins, alpha, start, end, preliminary, ymax, config, output):
    '''
    Given the DATA_PATH to a data hdf5 file (e.g. the output of ERNAs gather
    scripts) this script will create the infamous theta square plot.

    This plot shows the (selected gamma-like) events which have been
    reconstructed as coming from the source region and the ones coming from a
    (more or less arbitrary) off region.

    In a traditional IACT analysis this plot is used to calculate the
    significance of detection.

    The HDF files are expected to have a group called 'runs' and a group
    called 'events'. The events group has to have the columns:

        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',

    If a prediction threshold is to be used, also 'gamma_prediction'
    must be in the group. The 'gamma_prediction' column can be added to the
    data using 'klaas_apply_separation_model' for example.
    '''
    if config:
        with open(config) as f:
            plot_config.update(yaml.safe_load(f))

    theta_cut = np.sqrt(theta2_cut)

    # Work on a copy so the module-level column list is not grown on every call.
    event_columns = list(columns)
    if threshold > 0.0:
        event_columns.append('gamma_prediction')

    # NOTE(review): `key` is accepted but the events are always read from the
    # 'events' group - confirm whether `key` should be used here.
    events = read_h5py(data_path, key='events', columns=event_columns)

    if start or end:
        events['timestamp'] = read_timestamp(data_path)

    try:
        runs = read_h5py(data_path, key='runs')
        runs['run_start'] = pd.to_datetime(runs['run_start'])
        runs['run_stop'] = pd.to_datetime(runs['run_stop'])
    except IOError:
        # fall back to an empty run table so the stats box can still be built
        runs = pd.DataFrame(
            columns=['run_start', 'run_stop', 'ontime', 'source'])

    if start is not None:
        events = events.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')

    if end is not None:
        events = events.query('timestamp <= @end')
        runs = runs.query('run_stop <= @end')

    if threshold > 0:
        selected = events.query('gamma_prediction >= {}'.format(threshold))
    else:
        selected = events

    theta_on = selected.theta_deg
    theta_off = pd.concat(
        [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])

    del events

    # Adjust the bin width so that theta2_cut falls exactly on a bin edge.
    max_theta2 = 0.3
    width = max_theta2 / bins
    # At least one bin below the cut, otherwise the width would be a
    # division by zero for cuts much smaller than the nominal bin width.
    n_bins_below_cut = max(np.round(theta2_cut / width), 1)
    rounded_width = theta2_cut / n_bins_below_cut
    bins = np.arange(0, max_theta2 + 0.1 * rounded_width, rounded_width)

    print('Using {} bins to get theta_cut on a bin edge'.format(len(bins) - 1))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    h_on, bin_edges = np.histogram(
        theta_on.apply(lambda x: x**2).values,
        bins=bins,
    )
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        # scale the off events with the alpha that is also used for the
        # significance and the excess, instead of a fixed 0.2
        weights=np.full(len(theta_off), alpha),
        histtype='stepfilled',
        color='lightgray',
        zorder=0,
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on),
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )
    # NOTE(review): h_off is already weighted, so this error bar may be too
    # small by a factor sqrt(alpha) - confirm.
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off),
        xerr=bin_width / 2,
        linestyle='',
        label='Off',
        zorder=1,
    )

    ax.axvline(theta_cut**2, color='black', alpha=0.3, linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    print('N_on', n_on)
    print('N_off', n_off)
    print('Li&Ma: {}'.format(significance))

    ax.text(
        0.5, 0.95,
        stats_box_template.format(
            source=runs.source.iloc[0] if len(runs) > 0 else '',
            t_obs=runs.ontime.sum() / 3600,
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        va='top',
        ha='center',
    )

    if preliminary:
        add_preliminary(
            plot_config['preliminary_position'],
            size=plot_config['preliminary_size'],
            color=plot_config['preliminary_color'],
            ax=ax,
        )

    if ymax:
        ax.set_ylim(0, ymax)

    ax.set_xlim(0, bins.max())
    ax.set_xlabel(plot_config['xlabel'])
    ax.legend(loc=plot_config['legend_loc'])
    fig.tight_layout(pad=0)

    if output:
        fig.savefig(output, dpi=300)
    else:
        plt.show()
def theta_square_plot(theta2_cut=0.8, data_path=plotting_path, key='events', start=None, end=None, threshold=0.5, bins=40, alpha=0.2, output=False):
    '''
    Draw a theta² plot with on/off counts and Li&Ma significance.

    Parameters:
        theta2_cut: float
            Cut on theta², ignored (set to infinity) if the file contains
            the column 'gamma_prediction_off_1'.
        data_path: str
            Path to the hdf5 file.
        key: str
            Name of the hdf5 group holding the events.
        start, end:
            Optional limits applied to the event timestamps and run times.
        threshold: float
            Prediction threshold for the event selection.
        bins: int
            Number of histogram bins.
        alpha: float
            Weight of the off events, also used for excess and significance.
        output: str or False
            If truthy, the figure is saved to this path.
    '''
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import h5py
    from fact.io import read_h5py
    from fact.analysis import (
        li_ma_significance,
        split_on_off_source_dependent,
    )

    columns = [
        'gamma_prediction',
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
        'unix_time_utc',
    ]

    stats_box_template = (
        r'Source: {source}, $t_\mathrm{{obs}} = {t_obs:.2f}\,\mathrm{{h}}$' '\n'
        r'$N_\mathrm{{On}} = {n_on}$, $N_\mathrm{{Off}} = {n_off}$, $\alpha = {alpha}$' '\n'
        r'$N_\mathrm{{Exc}} = {n_excess:.1f} \pm {n_excess_err:.1f}$, $S_\mathrm{{Li&Ma}} = {significance:.1f}\,\sigma$' '\n'
    )

    theta_cut = np.sqrt(theta2_cut)

    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f[key].keys()

    if source_dependent:
        print('Separation was using source dependent features')
        columns.extend('gamma_prediction_off_' + str(i) for i in range(1, 6))
        theta_cut = np.inf
        theta2_cut = np.inf

    # read from the same group that was inspected above
    events = read_h5py(data_path, key=key, columns=columns)
    events['timestamp'] = pd.to_datetime(
        events['unix_time_utc_0'] * 1e6 + events['unix_time_utc_1'],
        unit='us',
    )

    # NOTE(review): the run table is read and filtered but the stats box
    # below uses a fixed source name and observation time - confirm.
    runs = read_h5py(data_path, key='runs')
    runs['run_start'] = pd.to_datetime(runs['run_start'])
    runs['run_stop'] = pd.to_datetime(runs['run_stop'])

    if start is not None:
        events = events.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')

    if end is not None:
        events = events.query('timestamp <= @end')
        runs = runs.query('run_stop <= @end')

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(events, threshold)
        theta_on = on_data.theta_deg
        theta_off = off_data.theta_deg
    else:
        selected = events.query('gamma_prediction >= {}'.format(threshold))
        theta_on = selected.theta_deg
        theta_off = pd.concat(
            [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])

    del events

    if source_dependent:
        limits = [
            0,
            max(
                np.percentile(theta_on, 99)**2,
                np.percentile(theta_off, 99)**2,
            ),
        ]
    else:
        limits = [0, 0.3]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    h_on, bin_edges = np.histogram(
        theta_on.apply(lambda x: x**2).values,
        bins=bins,
        range=limits,
    )
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        range=limits,
        # scale the off events with the alpha that is also used for the
        # significance and the excess, instead of a fixed 0.2
        weights=np.full(len(theta_off), alpha),
        histtype='stepfilled',
        color='lightgray',
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='Off',
    )

    if not source_dependent:
        ax.axvline(theta_cut**2, color='gray', linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    ax.text(
        0.5, 0.95,
        stats_box_template.format(
            source='Crab',
            t_obs=83.656,
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        fontsize=12,
        va='top',
        ha='center',
    )

    ax.set_xlabel(r'$(\theta / {}^\circ )^2$')
    ax.legend()
    fig.tight_layout()
    plt.xlim(0.0, 0.3)

    if output:
        fig.savefig(output, dpi=300)
def main(data_path, gamma_path, corsika_path, config_template, output_base, threshold, theta2_cut, gamma_fraction, title, start, end, zd_min, zd_max):
    '''
    Write on/off data and selected MC gammas to ROOT files plus a config file.

    The outputs are `<output_base>_on.root`, `<output_base>_off.root`,
    `<output_base>_mc.root` and `<output_base>.config`, the latter being the
    filled-in content of `config_template`.

    Parameters:
        data_path: str
            hdf5 file with the groups 'events' and 'runs'.
        gamma_path: str
            hdf5 file with the simulated gammas in the group 'events'.
        corsika_path: str
            hdf file read with key 'table', needs a 'zenith' column.
        config_template: str
            Path to the template text formatted with the results.
        output_base: str
            Common prefix of all output files.
        threshold: float
            Prediction threshold for the event selection.
        theta2_cut: float
            Cut on theta², ignored for source dependent separation.
        gamma_fraction:
            Passed to the template as `selection_fraction`.
        title:
            Passed to the template as `title`.
        start, end:
            Optional limits applied to event timestamps and run starts.
        zd_min, zd_max:
            Optional limits narrowing the zenith range found in the runs.
    '''
    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f['events'].keys()

    # Local copies, so the module-level column lists are not modified.
    extra_columns = list(other_columns)
    if source_dependent:
        extra_columns.extend(bg_prediction_columns)
        theta2_cut = np.inf
        print('Source dependent separation, ignoring theta cut')

    theta_cut = np.sqrt(theta2_cut)

    data = read_h5py(
        data_path,
        key='events',
        columns=data_columns + output_columns + extra_columns,
    )
    gammas = read_h5py(
        gamma_path,
        key='events',
        columns=mc_columns + output_columns + extra_columns,
    )
    gammas.rename(
        columns={'corsika_evt_header_total_energy': 'true_energy'},
        inplace=True,
    )

    runs = read_h5py(data_path, key='runs')

    data['timestamp'] = pd.to_datetime(
        data['unix_time_utc_0'] * 1e6 + data['unix_time_utc_1'],
        unit='us',
    )

    if start:
        data = data.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end:
        # NOTE(review): runs are cut on run_start here, not on run_stop -
        # confirm this is intended.
        data = data.query('timestamp <= @end')
        runs = runs.query('run_start <= @end')

    min_zenith = runs.zenith.min()
    max_zenith = runs.zenith.max()

    if zd_min:
        min_zenith = max(min_zenith, zd_min)
    if zd_max:
        max_zenith = min(max_zenith, zd_max)

    print('Zenith range of the input data:', min_zenith, max_zenith)

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(data, threshold)
        on_gammas = gammas.query('gamma_prediction >= {}'.format(threshold))
    else:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction >= {}'.format(threshold)),
            theta2_cut=theta2_cut,
        )
        on_gammas = gammas.query(
            '(theta_deg <= {}) & (gamma_prediction >= {})'.format(
                theta_cut, threshold,
            ))

    query = '(zd_tracking >= {}) and (zd_tracking <= {})'.format(
        min_zenith, max_zenith)
    on_gammas = on_gammas.query(query).copy()

    written_columns = output_columns + ['theta_deg']

    # Copies, so adding the weight column never writes into a view.
    on_gammas = on_gammas.loc[:, written_columns + ['true_energy']].copy()
    on_data = on_data.loc[:, written_columns + data_columns].copy()
    off_data = off_data.loc[:, written_columns + data_columns].copy()

    off_data['weight'] = 0.2
    on_data['weight'] = 1.0
    on_gammas['weight'] = 1.0

    rpd.to_root(on_data, output_base + '_on.root', key='events')
    rpd.to_root(off_data, output_base + '_off.root', key='events')
    rpd.to_root(on_gammas, output_base + '_mc.root', key='events')

    print('N_on: {}'.format(len(on_data)))
    print('N_off: {}'.format(len(off_data)))
    print('S(Li&Ma): {}'.format(
        li_ma_significance(len(on_data), len(off_data), 0.2)))
    print('N_mc: {}'.format(len(on_gammas)))

    n_excess = len(on_data) - 0.2 * len(off_data)
    fraction = n_excess / len(on_gammas)

    print('N_excess:', n_excess)
    print('Fraction: {:1.4f}'.format(fraction))

    with open(config_template) as f:
        template = f.read()

    t_obs = runs.ontime.sum()

    try:
        corsika = pd.read_hdf(corsika_path, key='table')
    except KeyError:
        # Only list the available keys; the context manager makes sure the
        # file handle is closed again.
        with h5py.File(corsika_path, 'r') as f:
            print("given key not in file: possible keys are: {}".format(
                list(f.keys())))
        return

    corsika['zenith'] = np.rad2deg(corsika['zenith'])
    corsika = corsika.query('(zenith >= {}) and (zenith <= {})'.format(
        min_zenith, max_zenith))
    print('Simulated events after zenith cut: {}'.format(len(corsika)))

    config = template.format(
        t_obs=t_obs,
        selection_fraction=gamma_fraction,
        n_gamma=len(corsika),
        source_file_on=output_base + '_on.root',
        source_file_off=output_base + '_off.root',
        source_file_mc=output_base + '_mc.root',
        tree_name='events',
        output_file=output_base + '_result.root',
        fraction=fraction,
        min_zenith=min_zenith,
        max_zenith=max_zenith,
        title=title,
    )

    with open(output_base + '.config', 'w') as f:
        f.write(config)
mc_Tree.fit(mc_data.drop('label', axis=1), mc_data.label) mc_xgbc.fit(mc_data.drop('label', axis=1), mc_data.label) pred_mess_tree = mess_Tree.predict_proba(eval_data[feature])[:, 1] pred_mess_xgbc = mess_xgbc.predict_proba(eval_data[feature])[:, 1] pred_mc_tree = mc_Tree.predict_proba(eval_data[feature])[:, 1] pred_mc_xgbc = mc_xgbc.predict_proba(eval_data[feature])[:, 1] sig_mess_tree = [] sig_mess_xgbc = [] sig_mc_tree = [] sig_mc_xgbc = [] for threshold in np.linspace(0.01, 0.99, 99): on_data, off_data = split_on_off_source_independent( eval_data[threshold <= pred_mess_tree], theta2_cut=0.03) sig_mess_tree.append(li_ma_significance(len(on_data), len(off_data), 0.2)) on_data, off_data = split_on_off_source_independent( eval_data[threshold <= pred_mess_xgbc], theta2_cut=0.03) sig_mess_xgbc.append(li_ma_significance(len(on_data), len(off_data), 0.2)) on_data, off_data = split_on_off_source_independent( eval_data[threshold <= pred_mc_tree], theta2_cut=0.03) sig_mc_tree.append(li_ma_significance(len(on_data), len(off_data), 0.2)) on_data, off_data = split_on_off_source_independent( eval_data[threshold <= pred_mc_xgbc], theta2_cut=0.03) sig_mc_xgbc.append(li_ma_significance(len(on_data), len(off_data), 0.2)) data = pd.DataFrame({ 'sig_mess_tree': np.transpose(sig_mess_tree), 'sig_mess_xgbc': np.transpose(sig_mess_xgbc), 'sig_mc_tree': np.transpose(sig_mc_tree), 'sig_mc_xgbc': np.transpose(sig_mc_xgbc)
from fact.analysis import li_ma_significance, split_on_off_source_independent
from fact.io import read_data

# Write the Li&Ma significance of the selected events as a LaTeX \SI macro.
df = read_data('crab_gammas_dl3.hdf5', key='events')

on, off = split_on_off_source_independent(
    df.query('gamma_prediction > 0.85'),
    0.025,
)

# Compute the value first, so a failure here does not leave behind a
# truncated output file.
significance = li_ma_significance(len(on), len(off), 0.2)

# The unit contains a non-ASCII character, so the encoding is fixed instead
# of depending on the locale of the machine.
with open('build/significance.tex', 'w', encoding='utf-8') as f:
    f.write(r'\SI{' + '{:.1f}'.format(significance) + r'}{σ}')
def _target(scaling_factor, n_signal, n_background, alpha=0.2, sigma=5):
    '''
    Squared difference between `sigma` and the Li&Ma significance obtained
    when the signal is scaled by `scaling_factor`.

    The on counts are the alpha-weighted background plus the scaled signal,
    the off counts are the unscaled background.
    '''
    scaled_signal = n_signal * scaling_factor
    n_on = n_background * alpha + scaled_signal
    significance = li_ma_significance(n_on, n_background, alpha=alpha)
    residual = sigma - significance
    return residual**2