def get_config_flux(config):
    sim_config = data_config_to_sim_config(config)
    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    energybins = comp.analysis.get_energybins()

    # Load simulation and training features
    df_sim_train, df_sim_test = comp.load_sim(config=sim_config, verbose=False)
    feature_list, feature_labels = comp.analysis.get_training_features()

    # Load data
    df_data = comp.load_data(config=config)
    X_data = comp.dataframe_functions.dataframe_to_array(
        df_data, feature_list + ['lap_log_energy'])
    log_energy = X_data[:, -1]
    X_data = X_data[:, :-1]

    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    data_predictions = pipeline.predict(X_data)

    # Get composition masks
    data_labels = np.array([
        comp.dataframe_functions.label_to_comp(pred)
        for pred in data_predictions
    ])
    data_light_mask = data_labels == 'light'
    data_heavy_mask = data_labels == 'heavy'

    # Get number of identified comp in each energy bin
    df_flux = {}
    comp_list = ['light', 'heavy']
    for composition in comp_list:
        comp_mask = data_labels == composition
        df_flux['counts_' + composition] = np.histogram(
            log_energy[comp_mask], bins=energybins.log_energy_bins)[0]
        df_flux['counts_' + composition + '_err'] = np.sqrt(
            df_flux['counts_' + composition])

    df_flux['counts_total'] = np.histogram(
        log_energy, bins=energybins.log_energy_bins)[0]
    df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])

    # Solid angle
    max_zenith_rad = df_sim_train['lap_zenith'].max()
    solid_angle = 2 * np.pi * (1 - np.cos(max_zenith_rad))
    df_flux['solid_angle'] = solid_angle

    # Livetime
    livetime, livetime_err = comp.get_detector_livetime(config=config)
    df_flux['livetime'] = livetime
    df_flux['livetime_err'] = livetime_err

    return df_flux
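
# A minimal sketch (not part of the original script) of how the quantities
# collected in df_flux combine into a differential flux dN/dE. The helper
# name `counts_to_flux` and the effective area `eff_area` are assumptions;
# the original analysis obtains the effective area from simulation elsewhere.
def counts_to_flux(df_flux, log_energy_bins, eff_area, composition='light'):
    # Linear-energy bin widths from the log10(E/GeV) bin edges
    energy_bin_widths = 10**log_energy_bins[1:] - 10**log_energy_bins[:-1]
    # Normalize per-bin counts by energy bin width, effective area,
    # solid angle, and detector livetime
    norm = (energy_bin_widths * eff_area *
            df_flux['solid_angle'] * df_flux['livetime'])
    flux = df_flux['counts_' + composition] / norm
    flux_err = df_flux['counts_' + composition + '_err'] / norm
    return flux, flux_err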
                    choices=comp.simfunctions.get_sim_configs(),
                    help='Detector configuration')
parser.add_argument('--n_jobs', dest='n_jobs', type=int, default=10,
                    help='Number of jobs to run in parallel')
args = parser.parse_args()

config = args.config
energybins = comp.get_energybins(config=config)
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max

print('Loading full pre-processed dataset for {} into memory...'.format(config))
df_data = comp.load_data(config=config,
                         processed=False,
                         energy_reco=True,
                         energy_cut_key='reco_log_energy',
                         log_energy_min=log_energy_min,
                         log_energy_max=log_energy_max,
                         n_jobs=args.n_jobs,
                         verbose=True)

outfile = os.path.join(comp.paths.comp_data_dir, config, 'data',
                       'data_dataframe_quality_cuts.hdf')
comp.check_output_dir(outfile)
print('Saving processed dataset to {}...'.format(outfile))
df_data.to_hdf(outfile, 'dataframe', format='table')
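
# The snippet above starts mid-call; a plausible parser head it attaches to,
# assuming a standard argparse setup (the description string and the
# '-c'/'--config' flag names are assumptions, not taken from the original file):
import argparse

parser = argparse.ArgumentParser(
    description='Saves pre-processed data DataFrame with quality cuts applied')
parser.add_argument('-c', '--config', dest='config',
                    choices=comp.simfunctions.get_sim_configs(),
                    help='Detector configuration')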
response, response_err = comp.response_matrix(
    true_energy=log_true_energy_sim_test,
    reco_energy=log_reco_energy_sim_test,
    true_target=true_target,
    pred_target=pred_target,
    efficiencies=efficiencies,
    efficiencies_err=efficiencies_err,
    energy_bins=energybins.log_energy_bins)

# Run analysis pipeline on data
print('Loading data into memory...')
df_data = comp.load_data(config=config,
                         energy_reco=False,
                         log_energy_min=None,
                         log_energy_max=None,
                         columns=feature_list,
                         n_jobs=10,
                         verbose=True)

print('Running energy and composition reconstructions...')
df_data['pred_comp_target'] = comp_pipeline.predict(
    df_data[feature_list].values)
df_data['reco_log_energy'] = energy_pipeline.predict(
    df_data[feature_list].values)

counts_observed = {}
counts_observed_err = {}
for idx, composition in enumerate(comp_list):
    # Filter out events that don't pass composition & energy mask
    pred_comp_mask = df_data['pred_comp_target'] == idx
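
# A hedged sketch of how the truncated loop above might continue, following
# the same counts-per-energy-bin pattern used elsewhere in this analysis
# (the exact masks and dictionary keys in the original may differ):
for idx, composition in enumerate(comp_list):
    pred_comp_mask = df_data['pred_comp_target'] == idx
    # Histogram reconstructed energies for events predicted as this composition
    energies = df_data.loc[pred_comp_mask, 'reco_log_energy']
    counts = np.histogram(energies, bins=energybins.log_energy_bins)[0]
    counts_observed[composition] = counts
    # Assume Poisson counting uncertainties, as in the flux calculation above
    counts_observed_err[composition] = np.sqrt(counts)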
df_eff = pd.read_hdf(eff_path)

# Format detection efficiencies for PyUnfold use
efficiencies = np.empty(num_groups * len(energybins.energy_midpoints))
efficiencies_err = np.empty(num_groups * len(energybins.energy_midpoints))
for idx, composition in enumerate(comp_list):
    efficiencies[idx::num_groups] = df_eff['eff_median_{}'.format(composition)]
    efficiencies_err[idx::num_groups] = df_eff['eff_err_low_{}'.format(composition)]

# Load data DataFrame
print('Loading data DataFrame...')
df_data = comp.load_data(config=config,
                         columns=feature_list,
                         energy_cut_key='reco_log_energy',
                         log_energy_min=log_energy_min,
                         log_energy_max=log_energy_max,
                         n_jobs=n_jobs,
                         verbose=True)
X_data = comp.io.dataframe_to_array(df_data,
                                    feature_list + ['reco_log_energy'])
log_energy_data = X_data[:, -1]
X_data = X_data[:, :-1]

print('Making composition predictions on data...')
# Apply pipeline.predict method in chunks for parallel predicting
X_da = da.from_array(X_data, chunks=(len(X_data) // 100, X_data.shape[1]))
# predict maps each 2D feature block to a 1D label block, so the feature
# axis is dropped from the output chunks
data_predictions = da.map_blocks(pipeline.predict, X_da, dtype=int,
                                 drop_axis=1)
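
# Illustration (not from the original script) of the interleaved layout built
# above: with num_groups = 2 and comp_list = ['light', 'heavy'], the slice
# assignment idx::num_groups places each composition at every other position,
# so compositions alternate within each energy bin:
#   [light_bin0, heavy_bin0, light_bin1, heavy_bin1, ...]
import numpy as np

num_groups = 2
eff_light = np.array([0.80, 0.82, 0.85])  # hypothetical per-bin efficiencies
eff_heavy = np.array([0.70, 0.73, 0.78])
efficiencies = np.empty(num_groups * len(eff_light))
efficiencies[0::num_groups] = eff_light
efficiencies[1::num_groups] = eff_heavy
print(efficiencies)  # [0.8  0.7  0.82 0.73 0.85 0.78]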
def save_data_MC_plots(config, june_july_only):
    df_sim = comp.load_sim(config='IC86.2012', test_size=0, verbose=False)
    # energy_mask_sim = (df_sim['lap_log_energy'] > 6.0)
    # energy_mask_sim = (df_sim['lap_log_energy'] > 6.4) & (df_sim['lap_log_energy'] < 8.0)
    # df_sim = df_sim[energy_mask_sim]

    df_data = comp.load_data(config=config, verbose=False)
    df_data = df_data[np.isfinite(df_data['log_dEdX'])]
    # energy_mask_data = (df_data['lap_log_energy'] > 6.4) & (df_data['lap_log_energy'] < 8.0)
    # df_data = df_data[energy_mask_data]

    if june_july_only:
        print('Masking out all data events not in June or July')

        def is_june_july(time):
            i3_time = dataclasses.I3Time(time)
            return i3_time.date_time.month in [6, 7]

        june_july_mask = df_data.end_time_mjd.apply(is_june_july)
        df_data = df_data[june_july_mask].reset_index(drop=True)

    months = (6, 7) if june_july_only else None
    livetime, livetime_err = comp.get_detector_livetime(config, months=months)

    weights = get_sim_weights(df_sim)
    df_sim['weights'] = flux(df_sim['MC_energy']) * weights

    MC_comp_mask = {}
    comp_list = ['PPlus', 'Fe56Nucleus']
    for composition in comp_list:
        MC_comp_mask[composition] = df_sim['MC_comp'] == composition
        # MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition

    # S125 data-MC plot
    log_s125_bins = np.linspace(-0.5, 3.5, 50)
    gs_s125 = plot_data_MC_comparison(df_sim, df_data, 'log_s125',
                                      log_s125_bins,
                                      r'$\mathrm{\log_{10}(S_{125})}$',
                                      livetime, ylim_ratio=(0, 2))
    s125_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                's125_{}.png'.format(config))
    plt.savefig(s125_outfile)

    # dE/dX data-MC plot
    log_dEdX_bins = np.linspace(-2, 4, 50)
    gs_dEdX = plot_data_MC_comparison(df_sim, df_data, 'log_dEdX',
                                      log_dEdX_bins,
                                      r'$\mathrm{\log_{10}(dE/dX)}$',
                                      livetime, ylim_ratio=(0, 5.5))
    dEdX_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                'dEdX_{}.png'.format(config))
    plt.savefig(dEdX_outfile)

    # cos(zenith) data-MC plot
    cos_zenith_bins = np.linspace(0.8, 1.0, 50)
    gs_zenith = plot_data_MC_comparison(df_sim, df_data, 'lap_cos_zenith',
                                        cos_zenith_bins,
                                        r'$\mathrm{\cos(\theta_{reco})}$',
                                        livetime, ylim_ratio=(0, 3))
    zenith_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                  'zenith_{}.png'.format(config))
    plt.savefig(zenith_outfile)

    # In-ice median radius data-MC plot
    inice_radius_bins = np.linspace(0, 200, 50)
    gs_inice_radius = plot_data_MC_comparison(
        df_sim, df_data, 'median_inice_radius', inice_radius_bins,
        'Median in-ice radius [m]',  # fixed: label was copied from the zenith plot
        livetime, ylim_ratio=(0, 3))
    inice_radius_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'median_inice_radius_{}.png'.format(config))
    plt.savefig(inice_radius_outfile)

    # log_d4r_peak_energy data-MC plot
    log_d4r_peak_energy_bins = np.linspace(-0.5, 3.5, 50)
    gs_d4R_peak_energy = plot_data_MC_comparison(
        df_sim, df_data, 'log_d4r_peak_energy', log_d4r_peak_energy_bins,
        r'$\mathrm{\log_{10}(E_{D4R}/GeV)}$',
        livetime, ylim_ratio=(0, 5.5))
    d4R_peak_energy_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_energy_{}.png'.format(config))
    plt.savefig(d4R_peak_energy_outfile)

    # log_d4r_peak_sigma data-MC plot
    log_d4r_peak_sigma_bins = np.linspace(-1, 3, 50)
    gs_d4R_peak_sigma = plot_data_MC_comparison(
        df_sim, df_data, 'log_d4r_peak_sigma', log_d4r_peak_sigma_bins,
        r'$\mathrm{\log_{10}(\sigma_{D4R})}$',  # fixed: label was copied from the peak-energy plot
        livetime, ylim_ratio=(0, 5.5))
    d4R_peak_sigma_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_sigma_{}.png'.format(config))
    plt.savefig(d4R_peak_sigma_outfile)
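
# A minimal sketch (an assumption, not the original implementation) of what
# plot_data_MC_comparison does for each variable: histogram the livetime-
# normalized data rate against the weighted simulation and show their ratio
# in a lower panel. The function and panel names here are hypothetical.
import matplotlib.pyplot as plt
import numpy as np

def plot_data_mc_sketch(df_sim, df_data, key, bins, xlabel, livetime):
    fig, (ax_dist, ax_ratio) = plt.subplots(2, 1, sharex=True)
    midpoints = (bins[:-1] + bins[1:]) / 2
    # Simulation is weighted to a flux-based rate; data counts become a rate
    # after dividing by the detector livetime
    sim_rate, _ = np.histogram(df_sim[key], bins=bins,
                               weights=df_sim['weights'])
    data_counts, _ = np.histogram(df_data[key], bins=bins)
    data_rate = data_counts / livetime
    ax_dist.step(midpoints, sim_rate, where='mid', label='Simulation')
    ax_dist.step(midpoints, data_rate, where='mid', label='Data')
    ax_dist.set_ylabel('Rate [Hz]')
    ax_dist.legend()
    # Ratio panel; empty simulation bins yield NaN and are simply not drawn
    with np.errstate(divide='ignore', invalid='ignore'):
        ax_ratio.step(midpoints, data_rate / sim_rate, where='mid')
    ax_ratio.set_ylabel('Data / MC')
    ax_ratio.set_xlabel(xlabel)
    return fig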