def main(path, cuts, threshold):
    config = 'configs/data_mc.yaml'
    if cuts:
        precuts = '_precuts'
    else:
        precuts = ''

    print('Read in data...')
    gamma = read_h5py(path + '/gamma{}.hdf5'.format(precuts), key='events')
    proton = read_h5py(path + '/proton{}.hdf5'.format(precuts), key='events')
    data = read_h5py(path + '/crab_data{}.hdf5'.format(precuts), key='events')
    print('Done...')

    if config is not None:
        with open(config) as f:
            config = yaml.safe_load(f)
    else:
        config = {}
    print(config)

    fig = plt.figure()
    ax_hist = fig.add_subplot(1, 1, 1)

    keys = set(data.columns).intersection(set(proton.columns))
    wech = [
        'run',
        'event',
        'pointing_position_az',
        'pointing_position_zd',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
    ]
    for key in wech:
        if key in keys and key in set(gamma.columns):
            keys.remove(key)

    dfs = OrderedDict([
        ('data', data),
        ('proton', proton),
        ('gamma', gamma),
    ])

    # Dict for histograms
    h = {}

    print('Plotting keys...')
    with PdfPages('{}/pdf/feature_comp_cuts.pdf'.format(path)) as pdf:
        for key in sorted(list(keys)):
            print(key)
            kwargs = config.get(key, {})
            if 'transform' in kwargs:
                kwargs['transform'] = eval(kwargs['transform'])

            ax_hist.cla()
            plot_histograms(dfs, key, h, threshold, ax=ax_hist, **kwargs)
            pdf.savefig(fig)

def main(infile):
    with h5py.File(infile, mode='r') as f:
        is_simulation = 'corsika_runs' in f

    if is_simulation:
        df = read_h5py(infile, key='events', columns=columns_sim)
        obstime = None
    else:
        df = read_h5py(infile, key='events', columns=columns_obs)
        obstime = Time(df.dragon_time, format='unix')

    altaz = AltAz(obstime=obstime, location=location)
    pointing = SkyCoord(
        alt=u.Quantity(df.alt_tel.values, u.rad, copy=False),
        az=u.Quantity(df.az_tel.values, u.rad, copy=False),
        frame=altaz,
    )

    camera_frame = CameraFrame(
        telescope_pointing=pointing,
        location=location,
        obstime=obstime,
        focal_length=28 * u.m,
    )
    prediction_cam = SkyCoord(
        x=u.Quantity(df.source_x_prediction.values, u.m, copy=False),
        y=u.Quantity(df.source_y_prediction.values, u.m, copy=False),
        frame=camera_frame,
    )
    prediction_altaz = prediction_cam.transform_to(altaz)

    append_column_to_hdf5(infile, prediction_altaz.alt.rad, 'events', 'source_alt_prediction')
    append_column_to_hdf5(infile, prediction_altaz.az.rad, 'events', 'source_az_prediction')

    if not is_simulation:
        prediction_icrs = prediction_altaz.transform_to('icrs')
        pointing_icrs = pointing.transform_to('icrs')
        append_column_to_hdf5(infile, prediction_icrs.ra.rad, 'events', 'source_ra_prediction')
        append_column_to_hdf5(infile, prediction_icrs.dec.rad, 'events', 'source_dec_prediction')
        append_column_to_hdf5(infile, pointing_icrs.ra.rad, 'events', 'pointing_ra')
        append_column_to_hdf5(infile, pointing_icrs.dec.rad, 'events', 'pointing_dec')

def theta_cut(
    path_gamma,
    path_hadron,
    theta_cut,
    length=None,
    path_feature='/home/msackel/Desktop/gammaClassification/config/feature.yaml'
):
    '''
    Read the feature list from the given *.yaml file and load the data from
    the given files. For the hadron file, theta_deg is read as well, so that
    a theta cut can be applied to the data.
    '''
    with open(path_feature) as f:
        feature = yaml.safe_load(f)

    gamma_data = pd.read_hdf(path_gamma, key='events')[feature]
    hadron_data = read_h5py(path_hadron, key='events', columns=feature + ['theta_deg'])

    # Theta cut on the hadron data, then label both datasets.
    hadron_data = hadron_data[hadron_data['theta_deg']**2 >= theta_cut]
    hadron_data['label'] = 0
    gamma_data['label'] = 1

    # If no length is given, use the size of the smaller of the two datasets,
    # then return the concatenated dataset.
    if length is None:
        length = min(len(hadron_data), len(gamma_data))

    return pd.concat(
        [hadron_data.drop('theta_deg', axis=1)[:length], gamma_data[:length]])

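# Hypothetical usage sketch for theta_cut above; the paths and the cut value
# are placeholders, not taken from the original code.
train_data = theta_cut(
    'data/gamma_precuts.hdf5',
    'data/crab_precuts.hdf5',
    theta_cut=0.1,  # keep hadron events with theta_deg**2 >= 0.1 as background
)
X, y = train_data.drop('label', axis=1), train_data['label']
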
def main(data_path, key):
    events = read_h5py(data_path, key='events', columns=columns)

    theta2_cuts = np.arange(0.1, 0.0, -0.001)
    prediction_thresholds = np.arange(0.75, 1, 0.001)

    max_significance = 0
    selected = events
    for threshold in tqdm(prediction_thresholds):
        # thresholds are ascending, so the same selection can be filtered further
        selected = selected.query('gamma_prediction >= {}'.format(threshold))

        theta2_on = selected.theta_deg**2
        theta2_off = pd.concat(
            [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])**2

        for theta2_cut in theta2_cuts:
            # cuts are descending, so the arrays can be filtered in place as well
            theta2_on = theta2_on[theta2_on <= theta2_cut]
            theta2_off = theta2_off[theta2_off <= theta2_cut]

            n_on = len(theta2_on)
            n_off = len(theta2_off)

            sig = li_ma_significance(n_on, n_off, 0.2)
            if sig >= max_significance:
                max_significance = sig
                best_threshold = threshold
                best_theta2_cut = theta2_cut

    print('Threshold:', best_threshold)
    print('θ² cut:   ', best_theta2_cut)
    print('Li&Ma:    ', max_significance)

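# For reference: li_ma_significance used above comes from fact.analysis and
# implements Eq. 17 of Li & Ma (1983). A minimal standalone sketch of that
# formula (not the library implementation), valid when the on counts exceed
# the scaled background expectation:
import numpy as np


def li_ma_significance_sketch(n_on, n_off, alpha=0.2):
    '''Detection significance for n_on on-events and n_off off-events,
    where alpha is the ratio of on to off exposure.'''
    if n_on == 0 or n_off == 0:
        return 0.0
    term_on = n_on * np.log((1 + alpha) / alpha * n_on / (n_on + n_off))
    term_off = n_off * np.log((1 + alpha) * n_off / (n_on + n_off))
    return np.sqrt(2 * (term_on + term_off))
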
def read_timestamp(path):
    try:
        timestamp = read_h5py(path, key='events', columns=['timestamp'])
        timestamp = pd.to_datetime(timestamp['timestamp'])
    except KeyError:
        try:
            col = 'unix_time_utc'
            unix_time_utc = read_h5py(path, key='events', columns=[col])
            # seconds and microseconds are stored in two separate columns
            timestamp = pd.to_datetime(
                unix_time_utc[col + '_0'] * 1e6 + unix_time_utc[col + '_1'],
                unit='us',
            )
        except KeyError:
            raise KeyError(
                'File contains neither "timestamp" nor "unix_time_utc"')

    return timestamp

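# Hypothetical usage sketch (the path is a placeholder): read_timestamp hides
# whether the file stores a 'timestamp' column directly or the two legacy
# 'unix_time_utc' columns.
timestamps = read_timestamp('crab_dl2.hdf5')
print(timestamps.min(), timestamps.max())
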
def test_to_h5py():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')

        with h5py.File(f.name, 'r') as hf:
            assert 'test' in hf.keys()
            g = hf['test']
            assert 'x' in g.keys()
            assert 'N' in g.keys()

        df2 = read_h5py(f.name, key='test')
        df2.sort_index(1, inplace=True)
        df.sort_index(1, inplace=True)

        assert all(df.dtypes == df2.dtypes)
        assert all(df['x'] == df2['x'])
        assert all(df['N'] == df2['N'])

def load_gamma_subset(sourcefile,
                      theta2_cut=0.0,
                      conf_cut=0.9,
                      num_off_positions=1,
                      analysis_type='classic',
                      with_runs=False):
    events = read_h5py(sourcefile, key='events')

    selection_columns = ['theta_deg', 'gamma_prediction', 'zd_tracking', 'conc_core']
    # the range ends at num_off_positions + 1 so all off positions are included
    theta_off_columns = [
        'theta_deg_off_{}'.format(i) for i in range(1, num_off_positions + 1)
    ]
    bg_prediction_columns = [
        'gamma_prediction_off_{}'.format(i) for i in range(1, num_off_positions + 1)
    ]

    if analysis_type == 'source':
        log.info('\tSelecting events for the source dependent analysis')
        log.info('\t\tgamma_pred_cut={0:.2f}'.format(conf_cut))
        on_data, off_data = split_on_off_source_dependent(
            events=events,
            prediction_threshold=conf_cut,
            on_prediction_key='gamma_prediction',
            off_prediction_keys=bg_prediction_columns)
        on_mc = events.query('gamma_prediction >= {}'.format(conf_cut))
    elif analysis_type == 'classic':
        log.info('\tSelecting events for the source independent analysis')
        log.info('\t\tgamma_pred_cut={0:.2f}'.format(conf_cut))
        log.info('\t\ttheta2_cut={0:.2f}'.format(theta2_cut))
        on_data, off_data = split_on_off_source_independent(
            events=events.query('gamma_prediction >= {}'.format(conf_cut)),
            theta2_cut=theta2_cut,
            theta_key='theta_deg',
            theta_off_keys=theta_off_columns)
        # the cut is given in theta², so compare theta_deg**2 against it
        on_mc = events.query(
            '(theta_deg**2 <= {}) & (gamma_prediction >= {})'.format(
                theta2_cut, conf_cut))

    log.info('\t{} Data Events (on region)'.format(len(on_data)))
    log.info('\t\t{} Data Events ({} off regions)'.format(
        len(off_data), num_off_positions))
    log.info('\t{} MC gammas after selection'.format(len(on_mc)))

    if with_runs:
        runs = read_h5py(sourcefile, key='runs')
        t_obs = runs.ontime.sum()

    n_events_per_off_region = len(off_data) / num_off_positions
    n_events_on_region = len(on_data)
    n_events_expected_signal = n_events_on_region - n_events_per_off_region

    return on_mc, on_data, off_data

def create_mask(input_file, mask_config):
    columns = list(mask_config.keys())
    df = read_h5py(input_file, key='events', columns=columns)

    mask = np.ones(len(df), dtype='bool')
    for key, (op, val) in mask_config.items():
        mask &= OPERATORS[op](df[key], val)

    return mask

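# create_mask relies on a module-level OPERATORS mapping that is not shown
# here. A plausible sketch of it (an assumption, not the original definition):
import operator

OPERATORS = {
    '<': operator.lt,
    '<=': operator.le,
    '==': operator.eq,
    '>=': operator.ge,
    '>': operator.gt,
}

# Example (file name and values are placeholders): keep events with
# gamma_prediction >= 0.8 and theta_deg < 0.1:
# mask = create_mask('events.hdf5', {'gamma_prediction': ('>=', 0.8),
#                                    'theta_deg': ('<', 0.1)})
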
def test_write_lists_h5py():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({'x': [[1.0, 2.0], [3.0, 4.0]]})

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        to_h5py(df, f.name)

        df = read_h5py(f.name, columns=['x'])
        assert df['x_0'].iloc[0] == 1.0

def test_hdf5():
    from erna.io import Writer

    with tempfile.NamedTemporaryFile(prefix='erna_test_', suffix='.hdf5') as f:
        with Writer(f.name) as writer:
            assert writer.fmt == 'hdf5'
            for i in range(n_dfs):
                writer.append(random_df(n_rows))

        df = read_h5py(f.name, key='events')
        assert len(df) == n_rows * n_dfs

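# The test above references module-level helpers that are not shown here.
# A minimal sketch of what they might look like (an assumption):
import numpy as np
import pandas as pd

n_dfs = 10
n_rows = 100


def random_df(n):
    '''Return a DataFrame with n rows of random event-like data.'''
    return pd.DataFrame({'x': np.random.normal(size=n)})
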
def read_dfs_for_column(datasets, column, masks=None):
    dfs = []

    for d, dataset in enumerate(datasets):
        if 'parts' in dataset:
            parts = []
            for p, part in enumerate(dataset['parts']):
                df = read_h5py(part['path'], key='events', columns=[column])
                if masks is not None:
                    mask = masks[d][p]
                    df = df.loc[mask].copy()
                parts.append(df)
            dfs.append(parts)
        else:
            df = read_h5py(dataset['path'], key='events', columns=[column])
            if masks is not None:
                mask = masks[d]
                df = df.loc[mask].copy()
            dfs.append(df)

    return dfs

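# read_dfs_for_column expects each dataset to be a mapping with either a
# single 'path' or a list of 'parts'. A hypothetical configuration (the
# paths are placeholders):
datasets = [
    {'path': 'gamma.hdf5'},
    {'parts': [{'path': 'crab_run1.hdf5'}, {'path': 'crab_run2.hdf5'}]},
]
dfs = read_dfs_for_column(datasets, column='gamma_prediction')
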
def read_file(infile):
    log.debug(f"Reading {infile}")
    events = read_h5py(infile, key='events', columns=list(COLUMN_MAP.keys()))
    sim_runs = read_h5py(infile, key='corsika_runs')
    events.rename(columns=COLUMN_MAP, inplace=True)

    n_showers = np.sum(sim_runs.num_showers * sim_runs.shower_reuse)
    log.debug(f"Number of events from corsika_runs: {n_showers}")

    sim_info = SimulatedEventsInfo(
        n_showers=n_showers,
        energy_min=u.Quantity(sim_runs["energy_range_min"][0], u.TeV),
        energy_max=u.Quantity(sim_runs["energy_range_max"][0], u.TeV),
        max_impact=u.Quantity(sim_runs["max_scatter_range"][0], u.m),
        spectral_index=sim_runs["spectral_index"][0],
        viewcone=u.Quantity(
            sim_runs["max_viewcone_radius"][0] - sim_runs["min_viewcone_radius"][0],
            u.deg,
        ),
    )

    return table.QTable.from_pandas(events, units=UNIT_MAP), sim_info

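# COLUMN_MAP and UNIT_MAP are module-level constants that are not shown here.
# A plausible sketch of their structure (an assumption, not the originals):
import astropy.units as u

COLUMN_MAP = {
    'gamma_energy_prediction': 'reco_energy',
    'mc_energy': 'true_energy',
}
UNIT_MAP = {
    'reco_energy': u.TeV,
    'true_energy': u.TeV,
}
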
def test_to_h5py_string():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'name': ['Mrk 501', 'Mrk 421', 'Crab'],
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        assert all(df.dtypes == df2.dtypes)
        assert all(df['name'] == df2['name'])

def test_to_h5py_append_second_group():
    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='g1', index=False)
        to_h5py(df2, f.name, key='g2', index=False)

        df_g1 = read_h5py(f.name, key='g1')
        df_g2 = read_h5py(f.name, key='g2')

        for col in df_g1.columns:
            assert all(df_g1[col] == df1[col])

        for col in df_g2.columns:
            assert all(df_g2[col] == df2[col])

def read_run_calculate_thetas(run, columns, threshold, source: SkyCoord, n_offs):
    df = read_h5py(run, key='events', columns=columns)
    ontime = calc_ontime(df).to(u.hour)

    if isinstance(threshold, float):
        df_selected = df.query(f'gammaness > {threshold}')
    else:
        # threshold is an energy-binned g/h cut table instead of a single value
        df['selected_gh'] = evaluate_binned_cut(
            df.gammaness.to_numpy(),
            df.gamma_energy_prediction.to_numpy() * u.TeV,
            threshold,
            operator.ge,
        )
        df_selected = df.query('selected_gh')

    # ORM site on La Palma
    location = EarthLocation.from_geodetic(-17.89139 * u.deg, 28.76139 * u.deg, 2184 * u.m)
    obstime = Time(df_selected.dragon_time, format='unix')
    altaz = AltAz(obstime=obstime, location=location)

    pointing = SkyCoord(
        alt=u.Quantity(df_selected.alt_tel.values, u.rad, copy=False),
        az=u.Quantity(df_selected.az_tel.values, u.rad, copy=False),
        frame=altaz,
    )
    pointing_icrs = pointing.transform_to('icrs')

    prediction_icrs = SkyCoord(
        df_selected.source_ra_prediction.values * u.rad,
        df_selected.source_dec_prediction.values * u.rad,
        frame='icrs',
    )

    theta, theta_off = calc_theta_off(
        source_coord=source,
        reco_coord=prediction_icrs,
        pointing_coord=pointing_icrs,
        n_off=n_offs,
    )

    # generate df containing the corresponding energies etc. for theta_off
    df_selected5 = df_selected
    for i in range(n_offs - 1):
        df_selected5 = df_selected5.append(df_selected)

    return df_selected, ontime, theta, df_selected5, theta_off

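# Hypothetical usage sketch for read_run_calculate_thetas; the run file,
# column list, threshold and number of off positions are placeholders.
source = SkyCoord.from_name('Crab')
df_sel, ontime, theta, df_sel5, theta_off = read_run_calculate_thetas(
    'dl2_LST-1.Run01832.h5',
    columns=['gammaness', 'dragon_time', 'alt_tel', 'az_tel',
             'source_ra_prediction', 'source_dec_prediction'],
    threshold=0.6,
    source=source,
    n_offs=5,
)
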
def init(self):
    self.dl2_file = read_h5py(
        self.dl2_file,
        key="events",
        columns=[
            "event_num",
            "run_id",
            "night",
            "source_position_az",
            "source_position_zd",
            "source_position_x",
            "source_position_y",
            "cog_x",
            "cog_y",
            "timestamp",
            "pointing_position_az",
            "pointing_position_zd",
        ],
    )

def test_to_h5py_append():
    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='test', index=False)
        to_h5py(df2, f.name, key='test', mode='a', index=False)

        df_read = read_h5py(f.name, key='test')
        df_written = pd.concat([df1, df2], ignore_index=True)

        for col in df_written.columns:
            assert all(df_read[col] == df_written[col])

def test_to_h5py_datetime():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        't_ns': pd.date_range('2017-01-01', freq='1ns', periods=100),
        't_us': pd.date_range('2017-01-01', freq='1us', periods=100),
        't_ms': pd.date_range('2017-01-01', freq='1ms', periods=100),
        't_s': pd.date_range('2017-01-01', freq='1s', periods=100),
        't_d': pd.date_range('2017-01-01', freq='1d', periods=100),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        for col in df.columns:
            assert all(df[col] == df2[col])

def read_data(file_path, key=None, columns=None, first=None, last=None, **kwargs):
    """
    This is similar to the read_data function in fact.io:

    - pandas hdf5: pd.HDFStore
    - h5py hdf5: fact.io.read_h5py
    """
    _, extension = os.path.splitext(file_path)

    if extension in [".hdf", ".hdf5", ".h5"]:
        try:
            df = pd.read_hdf(
                file_path, key=key, columns=columns, start=first, stop=last, **kwargs
            )
        except (TypeError, ValueError):
            df = read_h5py(
                file_path, key=key, columns=columns, first=first, last=last, **kwargs
            )
        return df
    else:
        raise NotImplementedError(
            f"AICT tools cannot handle data with extension {extension} yet."
        )

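# Hypothetical usage sketch for read_data (the file name is a placeholder):
# a pandas-written file is read via pd.read_hdf, and a fact.io-written file
# falls back to read_h5py.
df = read_data('events.hdf5', key='events',
               columns=['gamma_prediction'], first=0, last=1000)
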
    ax.errorbar(
        binned['center'],
        binned[key],
        xerr=0.5 * binned['width'],
        label=label,
        linestyle='',
    )
    ax.legend()
    ax.set_xscale('log')
    ax.set_xlabel(
        rf'$\log_{{10}}(E_{{\mathrm{{MC}}}} \,\, / \,\, \mathrm{{{energy_unit}}})$'
    )
    return ax


gamma_2_150 = read_h5py(
    '../build/dl2_gamma_south_pointing_20200706_v0.5.2_local_DL1_testing.h5',
    key='events')
gamma_2_300 = read_h5py(
    '../HDD/build_scaling_300/dl2_gamma_south_pointing_20200706_v0.5.2_local_DL1_testing.h5',
    key='events')
gamma_1_150 = read_h5py(
    '../HDD/build_noscaling/dl2_gamma_south_pointing_20200514_v0.5.1_v01_DL1_testing.h5',
    key='events')
gamma_1_300 = read_h5py(
    '../HDD/build_noscaling_300/dl2_gamma_south_pointing_20200514_v0.5.1_v01_DL1_testing.h5',
    key='events')

gammaness_threshold = 0.6

figures = []
figures.append(plt.figure())
ax = figures[-1].add_subplot(1, 1, 1)
plotting.angular_res(gamma_1_150, 'mc_energy', ax, label='v0.5.1 and intensity > 150')
plotting.angular_res(gamma_1_300, 'mc_energy', ax, label='v0.5.1 and intensity > 300')
plotting.angular_res(gamma_2_150, 'mc_energy', ax, label='v0.5.2 and intensity > 150')
plotting.angular_res(gamma_2_300, 'mc_energy', ax, label='v0.5.2 and intensity > 300')
#ax.set_title('All events')

def main(outdir, gamma_diff_file, gamma_file, output):
    offs = [
        f'{outdir}/dl2_v0.5.1_LST-1.Run01837.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01840.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01841.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01842.h5'
    ]
    ons = [
        f'{outdir}/dl2_v0.5.1_LST-1.Run01832.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01833.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01834.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01835.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01836.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01843.h5',
        f'{outdir}/dl2_v0.5.1_LST-1.Run01844.h5'
    ]

    df_off = pd.DataFrame()
    for i, run in enumerate(offs):
        df_off = pd.concat(
            [df_off, read_h5py(run, key='events', columns=columns)],
            ignore_index=True)

    df_on = pd.DataFrame()
    for i, run in enumerate(ons):
        df_on = pd.concat(
            [df_on, read_h5py(run, key='events', columns=columns)],
            ignore_index=True)

    gamma_diff = read_h5py(gamma_diff_file, key='events')
    gamma = read_h5py(gamma_file, key='events')

    figures = []
    theta2_cut = 0.04
    gammaness_threshold = 0.6

    # theta2 relative to the camera center
    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on, theta2_cut, gammaness_threshold, df_off, ax)
    #ax.set_title('Crab camera center, total-time scaling')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on, theta2_cut, gammaness_threshold, df_off, ax, alpha='manuel')
    #ax.set_title('Crab camera center, furthest $50\%$ scaling')

    # Crab coordinates
    on_pointing = []
    for i, run in enumerate(ons):
        df = read_h5py(run, key='events', columns=columns)
        on_pointing.append(df)

    #figures.append(plt.figure())
    #ax = figures[-1].add_subplot(1, 1, 1)
    #plotting.plot2D_runs(on_pointing, ons, 'crab', gammaness_threshold, ax)
    #
    #figures.append(plt.figure())
    #ax = figures[-1].add_subplot(1, 1, 1)
    #plotting.plot2D(df_on, gammaness_threshold, ax)

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on, 0.1, gammaness_threshold, df_off, ax, coord='crab')
    ax.set_title('Crab coordinates, total-time scaling')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.theta2(df_on, 0.1, gammaness_threshold, df_off, ax, alpha='manuel', coord='crab')
    ax.set_title('Crab coordinates, furthest $50\%$ scaling')

    # test plots
    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    ax.hist(gamma_diff.disp_prediction, bins=100, histtype='step')
    ax.set_xlabel('disp prediction')
    ax.set_title('gamma-diffuse testing')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    ax.hist(gamma_diff.gammaness, bins=100, histtype='step')
    ax.set_xlabel('gammaness')
    ax.set_title('gamma-diffuse testing')

    #figures.append(plt.figure())
    #ax = figures[-1].add_subplot(1, 1, 1)
    #plotting.theta2(gamma_diff, theta2_cut, gammaness_threshold, ax=ax, range=None)
    #ax.set_title('gamma-diffuse testing')

    # angular resolution
    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.angular_res(gamma, 'mc_energy', ax)
    ax.set_title('Angular resolution (no cuts)')

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    gamma['sign_prediction'] = np.sign(gamma.disp_prediction)
    gamma_cuts = gamma.query('sign_prediction == disp_sign')
    gamma_cuts = gamma_cuts.query(f'gammaness > {gammaness_threshold}')
    plotting.angular_res(gamma_cuts, 'mc_energy', ax)
    ax.set_title(
        f'Angular resolution (correct sign prediction & gammaness > {gammaness_threshold})'
    )

    figures.append(plt.figure())
    ax = figures[-1].add_subplot(1, 1, 1)
    plotting.angular_res(gamma, 'mc_energy', ax, label='All events')
    plotting.angular_res(
        gamma_cuts, 'mc_energy', ax,
        label=rf'correct sign and $p_\gamma > {gammaness_threshold}$')

    # saving
    with PdfPages(output) as pdf:
        for fig in figures:
            fig.tight_layout()
            pdf.savefig(fig)

def main(
    config,
    gamma_file,
    corsika_file,
    output_file,
    obstime,
    seed,
    label,
    threshold,
    theta2_cut,
):
    '''
    unfold fact simulations
    '''
    setup_logging()
    log = logging.getLogger('fact_funfolding')
    log.setLevel(logging.INFO)

    random_state = np.random.RandomState(seed)
    np.random.set_state(random_state.get_state())

    config = Config.from_yaml(config)
    e_ref = config.e_ref
    threshold = threshold or config.threshold
    theta2_cut = theta2_cut or config.theta2_cut
    log.info(f'Using threshold {threshold}')
    log.info(f'Using theta2 cut {theta2_cut}')

    # define binning in e_est and e_true
    bins_obs = logspace_binning(
        config.e_est_low, config.e_est_high, e_ref, config.n_bins_est
    )
    bins_true = logspace_binning(
        config.e_true_low, config.e_true_high, e_ref, config.n_bins_true
    )

    # read in files
    query = 'gamma_prediction > {} and theta_deg**2 < {}'.format(
        threshold, theta2_cut)
    gammas = read_h5py(gamma_file, key='events').query(query)
    with h5py.File(gamma_file, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)
        log.info('Using sampling fraction of {:.3f}'.format(sample_fraction))

    query = 'gamma_prediction > {}'.format(threshold)

    corsika_events = read_h5py(
        corsika_file,
        key='corsika_events',
        columns=['total_energy'],
    )
    simulated_spectrum = read_simulated_spectrum(corsika_file)

    weights = calc_weights_powerlaw(
        u.Quantity(gammas['corsika_event_header_total_energy'].values,
                   u.GeV, copy=False),
        obstime=obstime,
        n_events=simulated_spectrum['n_showers'],
        e_min=simulated_spectrum['energy_min'],
        e_max=simulated_spectrum['energy_max'],
        simulated_index=simulated_spectrum['energy_spectrum_slope'],
        scatter_radius=simulated_spectrum['x_scatter'],
        target_index=HEGRA_INDEX,
        flux_normalization=HEGRA_NORM,
        e_ref=HEGRA_E_REF,
        sample_fraction=sample_fraction,
    )

    # calculate effective area in the given binning
    a_eff, bin_center, bin_width, a_eff_low, a_eff_high = collection_area(
        corsika_events.total_energy.values,
        gammas[E_TRUE].values,
        impact=simulated_spectrum['x_scatter'],
        bins=bins_true.to_value(u.GeV),
        sample_fraction=sample_fraction,
    )

    gammas['bin'] = np.digitize(gammas[E_TRUE], bins_true.to(u.GeV).value)

    # split dataframes in train / test set
    gammas['test'] = False
    n_test = np.random.poisson(weights.sum())
    idx = np.random.choice(gammas.index, n_test, p=weights / weights.sum())
    gammas.loc[idx, 'test'] = True

    df_test = gammas[gammas.test]
    df_model = gammas[~gammas.test]

    X_model = df_model[E_PRED].values
    y_model = df_model[E_TRUE].values

    X_test = df_test[E_PRED].values
    y_test = df_test[E_TRUE].values

    g_model = np.digitize(X_model, bins_obs.to(u.GeV).value)
    f_model = np.digitize(y_model, bins_true.to(u.GeV).value)

    g_test = np.digitize(X_test, bins_obs.to(u.GeV).value)
    f_test = np.digitize(y_test, bins_true.to(u.GeV).value)

    model = ff.model.LinearModel(random_state=random_state)
    model.initialize(digitized_obs=g_model, digitized_truth=f_model)

    vec_g_test, vec_f_test = model.generate_vectors(
        digitized_obs=g_test, digitized_truth=f_test)
    vec_g_model, vec_f_model = model.generate_vectors(
        digitized_obs=g_model, digitized_truth=f_model)

    llh = ff.solution.StandardLLH(
        tau=config.tau,
        log_f=True,
        reg_factor_f=1 / a_eff.value[1:-1] if config.tau else None,
    )
    llh.initialize(
        vec_g=vec_g_test,
        model=model,
        ignore_n_bins_low=1,
        ignore_n_bins_high=1,
    )

    sol_mcmc = ff.solution.LLHSolutionMCMC(
        n_burn_steps=config.n_burn_steps,
        n_used_steps=config.n_used_steps,
        random_state=random_state,
    )
    sol_mcmc.initialize(llh=llh, model=model)
    sol_mcmc.set_x0_and_bounds(x0=np.random.poisson(vec_f_test))

    vec_f_est, sigma_vec_f, sample, probs, autocorr_time = sol_mcmc.fit()

    additional_features_to_save = dict()
    additional_features_to_save['a_eff'] = a_eff
    additional_features_to_save['a_eff_low'] = a_eff_low
    additional_features_to_save['a_eff_high'] = a_eff_high

    save_spectrum(
        output_file,
        bins_true,
        vec_f_est / a_eff / bin_width / u.GeV / obstime,
        sigma_vec_f / a_eff / bin_width / u.GeV / obstime,
        counts=vec_f_est,
        counts_err=sigma_vec_f,
        tau=config.tau,
        label=label or config.label,
        add_features=additional_features_to_save,
    )

def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply given model to data. Two columns are added to the file,
    energy_prediction and energy_prediction_std.

    CONFIGURATION_PATH: Path to the config yaml file
    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output
    SEPARATOR_MODEL_PATH: Path to the pickled separation model.
    ENERGY_MODEL_PATH: Path to the pickled energy regression model.
    DISP_MODEL_PATH: Path to the pickled disp model.
    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading model')
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)

    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source')
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = prediction_x
        df['source_y_prediction'] = prediction_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(
                pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))
            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:
            if random_source:
                zd, az = calc_random_source(
                    df['pointing_position_zd'],
                    df['pointing_position_az'],
                    wobble_distance,
                )
                df['source_position_zd'] = zd
                df['source_position_az'] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    with h5py.File(data_path, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_runs_group(data_path, output)

def main(gamma_path, std, n_bins, threshold, theta2_cut, preliminary, config, output):
    df = read_h5py(
        gamma_path,
        key='events',
        columns=[
            'gamma_energy_prediction',
            'corsika_event_header_total_energy',
            'gamma_prediction',
            'theta_deg',
        ],
    )

    if config:
        with open(config) as f:
            plot_config.update(yaml.safe_load(f))

    if threshold:
        df = df.query('gamma_prediction >= @threshold').copy()
    if theta2_cut:
        df = df.query('theta_deg**2 <= @theta2_cut').copy()

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='5%', pad=0.025)

    ax.set_aspect(1)
    ax.set_xscale('log')
    ax.set_yscale('log')

    e_min = min(
        df.gamma_energy_prediction.min(),
        df.corsika_event_header_total_energy.min()
    )
    e_max = max(
        df.gamma_energy_prediction.max(),
        df.corsika_event_header_total_energy.max()
    )

    limits = np.log10([e_min, e_max])
    bins = np.logspace(limits[0], limits[1], n_bins + 1)

    hist, xedges, yedges = np.histogram2d(
        df.corsika_event_header_total_energy.values,
        df.gamma_energy_prediction.values,
        bins=bins,
    )

    plot = ax.pcolormesh(
        xedges,
        yedges,
        hist.T,
        norm=LogNorm() if plot_config['logz'] else None,
        cmap=plot_config['cmap'],
    )
    plot.set_rasterized(True)
    fig.colorbar(plot, cax=cax)

    if preliminary:
        add_preliminary(
            plot_config['preliminary_position'],
            size=plot_config['preliminary_size'],
            color=plot_config['preliminary_color'],
            ax=ax,
        )

    ax.set_xlabel(plot_config['xlabel'])
    ax.set_ylabel(plot_config['ylabel'])

    fig.tight_layout(pad=0)

    if output:
        fig.savefig(output, dpi=300)
    else:
        plt.show()

def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    """
    Apply given model to data. Two columns are added to the file,
    energy_prediction and energy_prediction_std.

    CONFIGURATION_PATH: Path to the config yaml file
    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output
    SEPARATOR_MODEL_PATH: Path to the pickled separation model.
    ENERGY_MODEL_PATH: Path to the pickled energy regression model.
    DISP_MODEL_PATH: Path to the pickled disp model.
    SIGN_MODEL_PATH: Path to the pickled sign model.
    """
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                "Outputfile {} exists. Overwrite?".format(output),
                abort=True,
            )
        open(output, "w").close()

    log.info("Loading model")
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info("Done")

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ("separator", "energy", "disp"):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)

    try:
        runs = read_h5py(data_path, key="runs")
        sources = runs["source"].unique()
        if len(sources) > 1:
            raise click.ClickException(
                "to_dl3 only supports files with a single source")
        source = SkyCoord.from_name(sources[0])
        columns.update(["timestamp", "night"])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info("Predicting on data...")
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df["gamma_prediction"] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df["gamma_energy_prediction"] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df["source_x_prediction"] = prediction_x
        df["source_y_prediction"] = prediction_y
        df["disp_prediction"] = disp

        if source:
            obstime = Time(df["timestamp"].to_numpy().astype("U"))
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))
            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                obstime,
                n_jobs=n_jobs,
            )
        else:
            if random_source:
                zd, az = calc_random_source(
                    df["pointing_position_zd"],
                    df["pointing_position_az"],
                    wobble_distance,
                )
                df["source_position_zd"] = zd
                df["source_position_az"] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df["source_position_zd"].to_numpy(),
                df["source_position_az"].to_numpy(),
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                df["cog_x"].to_numpy(),
                df["cog_y"].to_numpy(),
                df["delta"].to_numpy(),
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key="events", mode="a")
        else:
            to_h5py(df[dl3_columns_sim], output, key="events", mode="a")

    with h5py.File(data_path, "r") as f:
        sample_fraction = f.attrs.get("sample_fraction", 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_group(data_path, output, "runs")
    copy_group(data_path, output, "corsika_runs")

def theta_square_plot(theta2_cut=0.8,
                      data_path=plotting_path,
                      key='events',
                      start=None,
                      end=None,
                      threshold=0.5,
                      bins=40,
                      alpha=0.2,
                      output=False):
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import h5py
    from dateutil.parser import parse as parse_date
    from fact.io import read_h5py
    from fact.analysis import (
        li_ma_significance,
        split_on_off_source_dependent,
    )
    import click

    columns = [
        'gamma_prediction',
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
        'unix_time_utc',
    ]

    stats_box_template = r'''Source: {source}, $t_\mathrm{{obs}} = {t_obs:.2f}\,\mathrm{{h}}$
$N_\mathrm{{On}} = {n_on}$, $N_\mathrm{{Off}} = {n_off}$, $\alpha = {alpha}$
$N_\mathrm{{Exc}} = {n_excess:.1f} \pm {n_excess_err:.1f}$, $S_\mathrm{{Li&Ma}} = {significance:.1f}\,\sigma$
'''

    theta_cut = np.sqrt(theta2_cut)

    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f[key].keys()

    if source_dependent:
        print('Separation was using source dependent features')
        columns.extend('gamma_prediction_off_' + str(i) for i in range(1, 6))
        theta_cut = np.inf
        theta2_cut = np.inf

    events = read_h5py(data_path, key='events', columns=columns)
    events['timestamp'] = pd.to_datetime(
        events['unix_time_utc_0'] * 1e6 + events['unix_time_utc_1'],
        unit='us',
    )

    runs = read_h5py(data_path, key='runs')
    runs['run_start'] = pd.to_datetime(runs['run_start'])
    runs['run_stop'] = pd.to_datetime(runs['run_stop'])

    if start is not None:
        events = events.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end is not None:
        events = events.query('timestamp <= @end')
        runs = runs.query('run_stop <= @end')

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(events, threshold)
        theta_on = on_data.theta_deg
        theta_off = off_data.theta_deg
    else:
        selected = events.query('gamma_prediction >= {}'.format(threshold))
        theta_on = selected.theta_deg
        theta_off = pd.concat(
            [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])

    del events

    if source_dependent:
        limits = [
            0,
            max(np.percentile(theta_on, 99)**2,
                np.percentile(theta_off, 99)**2),
        ]
    else:
        limits = [0, 0.3]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    h_on, bin_edges = np.histogram(
        theta_on.apply(lambda x: x**2).values, bins=bins, range=limits)
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        range=limits,
        weights=np.full(len(theta_off), 0.2),
        histtype='stepfilled',
        color='lightgray',
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='Off',
    )

    if not source_dependent:
        ax.axvline(theta_cut**2, color='gray', linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    ax.text(
        0.5,
        0.95,
        stats_box_template.format(
            source='Crab',
            t_obs=83.656,
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        fontsize=12,
        va='top',
        ha='center',
    )

    ax.set_xlabel(r'$(\theta / {}^\circ )^2$')
    ax.legend()
    fig.tight_layout()
    plt.xlim(0.0, 0.3)

    if output:
        fig.savefig(output, dpi=300)
    else:
        #plt.show()
        pass

def main(
    config,
    observation_file,
    gamma_file,
    corsika_file,
    output_file,
    seed,
    label,
    threshold,
    theta2_cut,
):
    '''
    unfold fact data
    '''
    setup_logging()
    log = logging.getLogger('fact_funfolding')
    log.setLevel(logging.INFO)

    random_state = np.random.RandomState(seed)
    np.random.set_state(random_state.get_state())

    config = Config.from_yaml(config)
    e_ref = config.e_ref
    threshold = threshold or config.threshold
    theta2_cut = theta2_cut or config.theta2_cut
    log.info(f'Using threshold {threshold}')
    log.info(f'Using theta2 cut {theta2_cut}')

    # define binning in e_est and e_true
    bins_obs = logspace_binning(
        config.e_est_low, config.e_est_high, e_ref, config.n_bins_est
    )
    bins_true = logspace_binning(
        config.e_true_low, config.e_true_high, e_ref, config.n_bins_true
    )

    # read in files
    query = 'gamma_prediction > {} and theta_deg**2 < {}'.format(
        threshold, theta2_cut)

    log.info('Reading simulated gammas')
    gammas = read_h5py(gamma_file, key='events').query(query)
    with h5py.File(gamma_file, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)
        log.info('Using sampling fraction of {:.3f}'.format(sample_fraction))

    query = 'gamma_prediction > {}'.format(threshold)

    log.info('Reading observations')
    observations = read_h5py(observation_file, key='events').query(query)
    on, off = split_on_off_source_independent(observations, theta2_cut=theta2_cut)

    observation_runs = read_h5py(observation_file, key='runs')
    obstime = observation_runs.ontime.sum() * u.s

    corsika_events = read_h5py(
        corsika_file,
        key='corsika_events',
        columns=['total_energy'],
    )
    simulated_spectrum = read_simulated_spectrum(corsika_file)

    a_eff, bin_center, bin_width, a_eff_low, a_eff_high = collection_area(
        corsika_events.total_energy.values,
        gammas[E_TRUE].values,
        impact=simulated_spectrum['x_scatter'],
        bins=bins_true.to_value(u.GeV),
        sample_fraction=sample_fraction,
    )

    # unfold using funfolding
    X_model = gammas[E_PRED].values
    y_model = gammas[E_TRUE].values
    X_data = on[E_PRED].values

    g_model = np.digitize(X_model, bins_obs.to(u.GeV).value)
    f_model = np.digitize(y_model, bins_true.to(u.GeV).value)

    g_data = np.digitize(X_data, bins_obs.to(u.GeV).value)

    model = ff.model.LinearModel(random_state=random_state)
    model.initialize(digitized_obs=g_model, digitized_truth=f_model)

    vec_g_data, _ = model.generate_vectors(digitized_obs=g_data)
    vec_g_model, vec_f_model = model.generate_vectors(
        digitized_obs=g_model, digitized_truth=f_model)

    if config.background:
        X_bg = off[E_PRED].values
        g_bg = np.digitize(X_bg, bins_obs.to(u.GeV).value)
        vec_g_bg, _ = model.generate_vectors(digitized_obs=g_bg)
        # scale by alpha = 0.2: the background is estimated from five off regions
        model.add_background(vec_g_bg * 0.2)

    llh = ff.solution.StandardLLH(
        tau=config.tau,
        log_f=True,
        reg_factor_f=1 / a_eff.value[1:-1] if config.tau else None,
    )
    llh.initialize(
        vec_g=vec_g_data,
        model=model,
        ignore_n_bins_low=1,
        ignore_n_bins_high=1,
    )

    sol_mcmc = ff.solution.LLHSolutionMCMC(
        n_burn_steps=config.n_burn_steps,
        n_used_steps=config.n_used_steps,
        random_state=random_state,
    )
    sol_mcmc.initialize(llh=llh, model=model)
    sol_mcmc.set_x0_and_bounds(
        x0=np.random.poisson(vec_f_model * vec_g_data.sum() / vec_g_model.sum())
    )

    vec_f_est, sigma_vec_f, sample, probs, autocorr_time = sol_mcmc.fit()

    additional_features_to_save = dict()
    additional_features_to_save['a_eff'] = a_eff
    additional_features_to_save['a_eff_low'] = a_eff_low
    additional_features_to_save['a_eff_high'] = a_eff_high

    save_spectrum(
        output_file,
        bins_true,
        vec_f_est / a_eff / obstime / bin_width / u.GeV,
        sigma_vec_f / a_eff / obstime / bin_width / u.GeV,
        counts=vec_f_est,
        counts_err=sigma_vec_f,
        g=vec_g_data,
        bg=vec_g_bg,
        tau=config.tau,
        label=label or config.label,
        add_features=additional_features_to_save,
    )

def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply given model to data. Two columns are added to the file,
    energy_prediction and energy_prediction_std.

    CONFIGURATION_PATH: Path to the config yaml file
    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output
    SEPARATOR_MODEL_PATH: Path to the pickled separation model.
    ENERGY_MODEL_PATH: Path to the pickled energy regression model.
    DISP_MODEL_PATH: Path to the pickled disp model.
    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading model')
    separator_model = joblib.load(separator_model_path)
    energy_model = joblib.load(energy_model_path)
    disp_model = joblib.load(disp_model_path)
    sign_model = joblib.load(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)

    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source'
            )
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError) as e:
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features], disp_model, sign_model
        )

        source_x = df.cog_x + disp * np.cos(df.delta)
        source_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = source_x
        df['source_y_prediction'] = source_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(parallelize_array_computation(
                partial(to_altaz, source=source),
                obstime,
                n_jobs=n_jobs,
            ))
            result = parallelize_array_computation(
                calc_source_features_obs,
                source_x,
                source_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:
            result = parallelize_array_computation(
                calc_source_features_sim,
                source_x,
                source_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    if source:
        log.info('Copying "runs" group')
        to_h5py(runs, output, key='runs', mode='a')

def main(data_path, threshold, theta2_cut, key, bins, alpha, start, end,
         preliminary, ymax, config, output):
    '''
    Given the DATA_PATH to a data hdf5 file (e.g. the output of ERNA's gather
    scripts) this script will create the infamous theta square plot.

    This plot shows the (selected gamma-like) events which have been
    reconstructed as coming from the source region and the ones coming from a
    (more or less arbitrary) off region.

    In a traditional IACT analysis this plot is used to calculate the
    significance of a detection.

    The HDF files are expected to have a group called 'runs' and a group
    called 'events'. The events group has to have the columns:
    'theta',
    'theta_deg_off_1',
    'theta_deg_off_2',
    'theta_deg_off_3',
    'theta_deg_off_4',
    'theta_deg_off_5'.

    If a prediction threshold is to be used, 'gamma_prediction' must also be
    in the group. The 'gamma_prediction' column can be added to the data
    using 'klaas_apply_separation_model' for example.
    '''
    if config:
        with open(config) as f:
            plot_config.update(yaml.safe_load(f))

    theta_cut = np.sqrt(theta2_cut)

    if threshold > 0.0:
        columns.append('gamma_prediction')

    events = read_h5py(data_path, key='events', columns=columns)

    if start or end:
        events['timestamp'] = read_timestamp(data_path)

    try:
        runs = read_h5py(data_path, key='runs')
        runs['run_start'] = pd.to_datetime(runs['run_start'])
        runs['run_stop'] = pd.to_datetime(runs['run_stop'])
    except IOError:
        runs = pd.DataFrame(
            columns=['run_start', 'run_stop', 'ontime', 'source'])

    if start is not None:
        events = events.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end is not None:
        events = events.query('timestamp <= @end')
        runs = runs.query('run_stop <= @end')

    if threshold > 0:
        selected = events.query('gamma_prediction >= {}'.format(threshold))
    else:
        selected = events

    theta_on = selected.theta_deg
    theta_off = pd.concat(
        [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])
    del events

    max_theta2 = 0.3
    width = max_theta2 / bins
    # adjust the bin width so that the theta2 cut lies exactly on a bin edge
    rounded_width = theta2_cut / np.round(theta2_cut / width)
    bins = np.arange(0, max_theta2 + 0.1 * rounded_width, rounded_width)
    print('Using {} bins to get theta_cut on a bin edge'.format(len(bins) - 1))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    h_on, bin_edges = np.histogram(
        theta_on.apply(lambda x: x**2).values,
        bins=bins,
    )
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        weights=np.full(len(theta_off), 0.2),
        histtype='stepfilled',
        color='lightgray',
        zorder=0,
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on),
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off),
        xerr=bin_width / 2,
        linestyle='',
        label='Off',
        zorder=1,
    )

    ax.axvline(theta_cut**2, color='black', alpha=0.3, linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    print('N_on', n_on)
    print('N_off', n_off)
    print('Li&Ma: {}'.format(significance))

    ax.text(
        0.5,
        0.95,
        stats_box_template.format(
            source=runs.source.iloc[0] if len(runs) > 0 else '',
            t_obs=runs.ontime.sum() / 3600,
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        va='top',
        ha='center',
    )

    if preliminary:
        add_preliminary(
            plot_config['preliminary_position'],
            size=plot_config['preliminary_size'],
            color=plot_config['preliminary_color'],
            ax=ax,
        )

    if ymax:
        ax.set_ylim(0, ymax)

    ax.set_xlim(0, bins.max())
    ax.set_xlabel(plot_config['xlabel'])
    ax.legend(loc=plot_config['legend_loc'])
    fig.tight_layout(pad=0)

    if output:
        fig.savefig(output, dpi=300)
    else:
        plt.show()

from fact.io import read_h5py
from fact.analysis import li_ma_significance, split_on_off_source_independent
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

crab_data = read_h5py(
    '/home/msackel/Desktop/gammaClassification/data/raw_data/crab_precuts.hdf5',
    key='events',
    columns=[
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
    ])

theta_on = crab_data['theta_deg']
theta_off = pd.concat(
    [crab_data['theta_deg_off_' + str(i)] for i in range(1, 6)])

# plt.style.use('msackel')
plt.figure(figsize=(4.5, 3.375))
plt.hist(theta_on**2, range=[0, 0.2], bins=50, histtype='step', label='On')
plt.hist(theta_off**2, range=[0, 0.2], bins=50, alpha=0.6, label='Off')

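# A hypothetical continuation sketch: compute the Li & Ma significance for
# some theta2 cut, mirroring the other scripts in this collection. The cut
# value below is a placeholder.
theta2_cut = 0.026
n_on = np.sum(theta_on**2 < theta2_cut)
n_off = np.sum(theta_off**2 < theta2_cut)
# alpha = 0.2 because the background is estimated from five off regions
print(li_ma_significance(n_on, n_off, alpha=0.2))
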
from fact.io import read_h5py
import yaml
from sklearn.ensemble import RandomForestClassifier

exec(open('/home/msackel/Desktop/gammaClassification/programm/theta_cut/theta_cut.py').read())
exec(open('/home/msackel/Desktop/gammaClassification/programm/model_significance/model_significance.py').read())

Tree = RandomForestClassifier(
    max_depth=15,
    max_features=7,
    criterion='entropy',
    n_estimators=100,
    n_jobs=10,
)

with open('/home/msackel/Desktop/gammaClassification/config/feature.yaml') as f:
    feature = yaml.safe_load(f)

eval_data = read_h5py(
    '/home/msackel/Desktop/gammaClassification/data/raw_data/mrk501_2014_precuts.hdf5',
    key='events',
    columns=list(feature) + [
        'theta_deg',
        'theta_deg_off_1',
        'theta_deg_off_2',
        'theta_deg_off_3',
        'theta_deg_off_4',
        'theta_deg_off_5',
    ]
)

print('---Theta**2 = 0.5')
train_data = theta_cut(
    '/home/msackel/Desktop/gammaClassification/data/raw_data/gamma_precuts.hdf5',
    '/home/msackel/Desktop/gammaClassification/data/raw_data/mrk501_2014_precuts.hdf5',
    0.5,
)
Tree.fit(train_data.drop('label', axis=1), train_data.label)
plot_significance(Tree, eval_data, path='plots/significance_mrk.pdf')