def get_config_flux(config):
    """Build a dict of per-composition counts and detector factors for a flux calculation.

    Trains a BDT composition classifier on simulation, classifies the data
    events for *config*, histograms the classified events in log-energy bins,
    and collects the solid angle and detector livetime needed to turn counts
    into a flux.

    Parameters
    ----------
    config : str
        Data detector configuration (e.g. 'IC86.2012').

    Returns
    -------
    dict
        Keys: 'counts_light', 'counts_heavy' (+ '_err' variants),
        'counts_total', 'counts_total_err', 'solid_angle',
        'livetime', 'livetime_err'.
    """
    sim_config = data_config_to_sim_config(config)
    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    energybins = comp.analysis.get_energybins()

    # Load simulation and training features
    df_sim_train, df_sim_test = comp.load_sim(config=sim_config, verbose=False)
    feature_list, feature_labels = comp.analysis.get_training_features()

    # Load data; last column of X_data is the reconstructed log-energy,
    # which is split off before classification.
    df_data = comp.load_data(config=config)
    X_data = comp.dataframe_functions.dataframe_to_array(
        df_data, feature_list + ['lap_log_energy'])
    log_energy = X_data[:, -1]
    X_data = X_data[:, :-1]

    # Train on simulation, predict composition labels for data
    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    data_predictions = pipeline.predict(X_data)
    data_labels = np.array([
        comp.dataframe_functions.label_to_comp(pred)
        for pred in data_predictions
    ])
    # FIX: removed unused data_light_mask / data_heavy_mask locals — the loop
    # below recomputes a per-composition mask anyway.

    # Number of identified events of each composition in each energy bin
    df_flux = {}
    comp_list = ['light', 'heavy']
    for composition in comp_list:
        comp_mask = data_labels == composition
        df_flux['counts_' + composition] = np.histogram(
            log_energy[comp_mask], bins=energybins.log_energy_bins)[0]
        # Poisson (sqrt(N)) counting errors
        df_flux['counts_' + composition + '_err'] = np.sqrt(
            df_flux['counts_' + composition])
    df_flux['counts_total'] = np.histogram(log_energy,
                                           bins=energybins.log_energy_bins)[0]
    df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])

    # Solid angle subtended by the accepted zenith range
    max_zenith_rad = df_sim_train['lap_zenith'].max()
    solid_angle = 2 * np.pi * (1 - np.cos(max_zenith_rad))
    df_flux['solid_angle'] = solid_angle

    # Detector livetime
    livetime, livetime_err = comp.get_detector_livetime(config=config)
    df_flux['livetime'] = livetime
    df_flux['livetime_err'] = livetime_err

    return df_flux
# --- Script setup: unpack CLI args and prepare simulation with reconstructed energy ---
config = args.config
num_groups = args.num_groups
p = args.prob_correct

comp_list = comp.get_comp_list(num_groups=num_groups)
energybins = comp.get_energybins(config)
num_ebins = len(energybins.log_energy_midpoints)

# Output directory for the unfolding data challenge
data_dir = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                        'datachallenge')

# Load simulation and train composition classifier.
# Energy cuts are deferred (reco energy is assigned below).
df_sim_train, df_sim_test = comp.load_sim(config=config,
                                          energy_reco=False,
                                          log_energy_min=None,
                                          log_energy_max=None,
                                          test_size=0.5,
                                          verbose=True)
feature_list, feature_labels = comp.get_training_features()

print('Loading energy regressor...')
energy_pipeline = comp.load_trained_model(
    'linearregression_energy_{}'.format(config))
# energy_pipeline = comp.load_trained_model('RF_energy_{}'.format(config))

# Attach reconstructed (log-)energy columns to both splits
for df in [df_sim_train, df_sim_test]:
    df['reco_log_energy'] = energy_pipeline.predict(df[feature_list].values)
    df['reco_energy'] = 10**df['reco_log_energy']

print('Loading or fitting composition classifier...')
# NOTE(review): whitespace-collapsed script fragment, truncated at BOTH ends —
# it opens mid-way through an argparse add_argument() call and is cut off
# inside comp.gridsearch_optimize(...). Left byte-identical; reformat only
# once the surrounding lines of the original script are available.
# What is visible: parses CLI args, loads a 50/50 sim train/test split with no
# energy cuts (the energy-cut kwargs are present but commented out), extracts
# training features/targets, builds a '<pipeline>_comp_<config>_<n>-groups'
# pipeline, and optionally grid-search-optimizes it.
'gridsearch. Ignored if gridsearch=False.') args = parser.parse_args() config = args.config num_groups = args.num_groups comp_list = comp.get_comp_list(num_groups=num_groups) energybins = comp.get_energybins(config=config) log_energy_min = energybins.log_energy_min log_energy_max = energybins.log_energy_max # Load training data and fit model df_sim_train, df_sim_test = comp.load_sim( config=config, energy_reco=False, log_energy_min=None, log_energy_max=None, # log_energy_min=log_energy_min, # log_energy_max=log_energy_max, test_size=0.5) feature_list, feature_labels = comp.get_training_features() X_train = df_sim_train[feature_list].values y_train = df_sim_train['comp_target_{}'.format(num_groups)].values # Load untrained model pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, config, num_groups) pipeline = comp.get_pipeline(pipeline_str) if args.gridsearch: param_grid = comp.get_param_grid(pipeline_name=pipeline_str) pipeline = comp.gridsearch_optimize(pipeline=pipeline,
# --- Setup for a Keras classifier trained on normalized LDF columns ---
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

import comptools as comp

color_dict = comp.get_color_dict()

config = 'IC79.2010'
num_groups = 4
comp_list = comp.get_comp_list(num_groups)
energybins = comp.get_energybins(config=config)

df_sim_train, df_sim_test = comp.load_sim(
    config=config,
    test_size=0.5,
    log_energy_min=energybins.log_energy_min,
    log_energy_max=energybins.log_energy_max,
    verbose=True)

# Columns holding lateral-distribution-function (LDF) values
ldf_cols = [col for col in df_sim_train.columns if 'ldf' in col]

# Rows with any missing LDF value are excluded from training/testing
isnull_mask_train = df_sim_train[ldf_cols].isnull().sum(axis=1).astype(bool)
isnull_mask_test = df_sim_test[ldf_cols].isnull().sum(axis=1).astype(bool)
zero_ldf = df_sim_train[ldf_cols].sum(axis=1) == 0

# Normalize each event's LDF row to unit sum
X_train = df_sim_train.loc[~isnull_mask_train, ldf_cols].values
X_train = X_train / X_train.sum(axis=1)[:, None]
y_train = df_sim_train.loc[~isnull_mask_train,
                           f'comp_target_{num_groups}'].values

# Carve out a validation split from the training data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.5,
                                                  random_state=2)

X_test = df_sim_test.loc[~isnull_mask_test, ldf_cols].values
def fit_efficiencies(df_file=None, config='IC86.2012', num_groups=2,
                     sigmoid='slant', n_samples=1000):
    """Calculate and fit detection efficiencies vs. log-energy per composition group.

    Efficiencies (passed/thrown showers per energy bin) are computed for each
    composition group and for 'total', scaled to a common thrown area, fitted
    with a sigmoid-like function, and the fit uncertainty is estimated by
    refitting ``n_samples`` Gaussian fluctuations of the best fit.

    Parameters
    ----------
    df_file : str, optional
        Path to the simulation DataFrame file passed to ``comp.load_sim``.
    config : str
        Detector configuration.
    num_groups : int
        Number of composition groups.
    sigmoid : str
        Currently unused — the fit function now comes from
        ``generate_fit_func``; kept for backward compatibility.
    n_samples : int
        Number of fluctuated refits used to estimate the fit uncertainty.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Efficiencies (restricted to the fit energy range) and the
        median/err_low/err_high of the fitted efficiency curves.
    """
    print('Loading df_file: {}'.format(df_file))
    comp_list = comp.get_comp_list(num_groups=num_groups)
    energybins = comp.get_energybins(config=config)

    # Include energy bins below the normal analysis range so the fit can
    # constrain how the detector efficiency turns on.
    low_energy_bins = np.arange(5.0, energybins.log_energy_min, 0.1)
    bins = np.concatenate((low_energy_bins, energybins.log_energy_bins))
    bin_midpoints = (bins[1:] + bins[:-1]) / 2

    df_sim = comp.load_sim(df_file=df_file,
                           config=config,
                           test_size=0,
                           log_energy_min=None,
                           log_energy_max=None)

    # Thrown areas differ per energy bin
    thrown_radii = comp.simfunctions.get_sim_thrown_radius(bin_midpoints)
    thrown_areas = np.pi * thrown_radii**2
    thrown_areas_max = thrown_areas.max()

    # Calculate efficiencies and effective areas for each composition group
    efficiencies = pd.DataFrame()
    effective_area, effective_area_err = {}, {}
    for composition in comp_list + ['total']:
        compositions = df_sim['comp_group_{}'.format(num_groups)]
        if composition == 'total':
            # NOTE(review): full_like on a (likely object-dtype) Series of
            # strings — produces an all-True mask; verify dtype is boolean-safe.
            comp_mask = np.full_like(compositions, True)
        else:
            comp_mask = compositions == composition
        # Simulation sets contributing to this composition determine the
        # number of thrown showers per energy bin
        sim_list = df_sim.loc[comp_mask, 'sim'].unique()
        thrown_showers = thrown_showers_per_ebin(sim_list,
                                                 log_energy_bins=bins)
        print('thrown_showers ({}) = {}'.format(composition, thrown_showers))
        passed_showers = np.histogram(df_sim.loc[comp_mask, 'MC_log_energy'],
                                      bins=bins)[0]
        efficiency, efficiency_err = comp.ratio_error(
            num=passed_showers,
            num_err=np.sqrt(passed_showers),
            den=thrown_showers,
            den_err=np.sqrt(thrown_showers))
        # Effective area = efficiency * thrown area
        effective_area[composition] = efficiency * thrown_areas
        effective_area_err[composition] = efficiency_err * thrown_areas
        # Scale efficiencies by a geometric factor to take into account
        # different simulated thrown radii
        thrown_radius_factor = thrown_areas / thrown_areas_max
        efficiencies['eff_{}'.format(
            composition)] = efficiency * thrown_radius_factor
        efficiencies['eff_err_{}'.format(
            composition)] = efficiency_err * thrown_radius_factor

    # Fit sigmoid-like function to the efficiency vs. energy distribution
    poly_degree = 1
    num_params = poly_degree + 3
    fit_func = generate_fit_func(degree=poly_degree)
    init_params = [8.5, 50.0, 7e4, 800]
    # NOTE(review): if num_params > len(init_params) the tail of p0 stays
    # uninitialized (np.empty) — fine for poly_degree=1, verify otherwise.
    p0 = np.empty(num_params)
    p0[:min(num_params, len(init_params))] = init_params[:num_params]

    efficiencies_fit = {}
    energy_min_fit, energy_max_fit = 5.8, energybins.log_energy_max
    midpoints_fitmask = np.logical_and(bin_midpoints > energy_min_fit,
                                       bin_midpoints < energy_max_fit)

    # Find best-fit function for each composition
    for composition in comp_list + ['total']:
        eff = efficiencies.loc[midpoints_fitmask,
                               'eff_{}'.format(composition)]
        eff_err = efficiencies.loc[midpoints_fitmask,
                                   'eff_err_{}'.format(composition)]
        popt, pcov = curve_fit(fit_func, bin_midpoints[midpoints_fitmask],
                               eff, p0=p0, sigma=eff_err)
        eff_fit = fit_func(bin_midpoints, *popt)
        efficiencies_fit[composition] = eff_fit
        chi2 = np.sum((eff - eff_fit[midpoints_fitmask])**2 / (eff_err)**2)
        ndof = len(eff_fit[midpoints_fitmask]) - len(p0)
        print('({}) chi2 / ndof = {} / {} = {}'.format(composition, chi2,
                                                       ndof, chi2 / ndof))

    # Perform many fits to random statistical fluctuations of the best fit
    # efficiency to estimate the uncertainty of the best-fit curve.
    np.random.seed(2)
    efficiencies_fit_samples = defaultdict(list)
    # FIX: xrange is Python-2-only (NameError on Python 3) — use range.
    for _ in range(n_samples):
        for composition in comp_list + ['total']:
            # Get new random sample to fit
            eff_err = efficiencies.loc[midpoints_fitmask,
                                       'eff_err_{}'.format(composition)]
            eff_sample = np.random.normal(
                efficiencies_fit[composition][midpoints_fitmask], eff_err)
            # Fit with error bars
            popt, pcov = curve_fit(fit_func,
                                   bin_midpoints[midpoints_fitmask],
                                   eff_sample, p0=p0, sigma=eff_err)
            eff_fit_sample = fit_func(bin_midpoints, *popt)
            efficiencies_fit_samples[composition].append(eff_fit_sample)

    # Median and 16/84-percentile errors of the fluctuated fits
    eff_fit = pd.DataFrame()
    for composition in comp_list + ['total']:
        fit_median, fit_err_low, fit_err_high = np.percentile(
            efficiencies_fit_samples[composition], (50, 16, 84), axis=0)
        fit_err_low = np.abs(fit_err_low - fit_median)
        fit_err_high = np.abs(fit_err_high - fit_median)
        eff_fit['eff_median_{}'.format(composition)] = fit_median
        eff_fit['eff_err_low_{}'.format(composition)] = fit_err_low
        eff_fit['eff_err_high_{}'.format(composition)] = fit_err_high

    return efficiencies.loc[midpoints_fitmask, :], eff_fit
# NOTE(review): whitespace-collapsed script fragment, cut off mid-way through
# the comp.analysis.get_median_std(...) call at the end. Left byte-identical;
# reformat only once the rest of the original script is available.
# What is visible: parses a multi-config CLI, then per config loads all
# simulation (test_size=0), builds light/heavy MC composition masks, and
# begins computing the energy resolution log10(lap_energy / MC_energy) and
# its medians/stds for the light sample.
parser = argparse.ArgumentParser( description= 'Extracts and saves desired information from simulation/data .i3 files' ) parser.add_argument('-c', '--config', dest='config', nargs='*', choices=comp.simfunctions.get_sim_configs(), help='Detector configuration') args = parser.parse_args() for config in args.config: df_sim = comp.load_sim(config=config, test_size=0) comp_list = ['light', 'heavy'] MC_comp_mask = {} for composition in comp_list: MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition light_mask = df_sim['MC_comp_class'] == 'light' heavy_mask = df_sim['MC_comp_class'] == 'heavy' energybins = comp.analysis.get_energybins() # Energy resolution energy_res = np.log10(df_sim['lap_energy'] / df_sim['MC_energy']) medians_light, stds_light, _ = comp.analysis.get_median_std( df_sim['MC_log_energy'][light_mask], energy_res[light_mask],
# NOTE(review): whitespace-collapsed script fragment, truncated at the start —
# it opens mid-way through an argparse add_argument() call. Left byte-identical.
# What is visible: parses CLI args (energy source selects 'MC_log_energy' vs
# 'reco_log_energy'), loads energy-cut simulation as a 50/50 split, then runs
# stratified k-fold CV computing get_frac_correct per fold for a
# 'BDT_comp_<config>_<n>-groups' pipeline; results accumulate in `folds`.
help='Energy that should be used.') args = parser.parse_args() config = args.config num_groups = args.num_groups n_splits = args.n_splits n_jobs = args.n_jobs energy_key = 'MC_log_energy' if args.energy == 'MC' else 'reco_log_energy' energybins = comp.get_energybins(config) comp_list = comp.get_comp_list(num_groups=num_groups) feature_list, feature_labels = comp.get_training_features() pipeline_str = 'BDT_comp_{}_{}-groups'.format(config, num_groups) df_train, df_test = comp.load_sim(config=config, log_energy_min=energybins.log_energy_min, log_energy_max=energybins.log_energy_max, test_size=0.5) skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2) folds = [] for train_index, test_index in skf.split( df_train, df_train['comp_target_{}'.format(num_groups)]): df_train_fold = df_train.iloc[train_index] df_test_fold = df_train.iloc[test_index] frac_correct = get_frac_correct(df_train_fold, df_test_fold, pipeline_str=pipeline_str, num_groups=num_groups, energy_key=energy_key) folds.append(frac_correct)
# NOTE(review): whitespace-collapsed script fragment, truncated at the start —
# it opens mid-way through an argparse add_argument() call. Left byte-identical.
# What is visible: parses CLI args, lazily loads the full unprocessed
# simulation dataset (compute=False — presumably a dask DataFrame; verify),
# then loads the 'linearregression_energy_<config>' energy-reconstruction
# model without metadata. A data-loading alternative is commented out.
help='Number of jobs to run in parallel') args = parser.parse_args() config = args.config n_jobs = args.n_jobs energybins = comp.get_energybins(config=config) log_energy_min = energybins.log_energy_min log_energy_max = energybins.log_energy_max feature_list, feature_labels = comp.get_training_features() print('Loading full non-processed dataset for {} into memory...'.format( config)) ddf = comp.load_sim( config=config, # processed=False, test_size=0, energy_reco=False, log_energy_min=None, log_energy_max=None, compute=False) # ddf = comp.load_data(config=config, # processed=False, # energy_reco=False, # log_energy_min=None, # log_energy_max=None, # compute=False) # Energy reconstruction model energy_pipeline = comp.load_trained_model( 'linearregression_energy_{}'.format(config), return_metadata=False)
# NOTE(review): whitespace-collapsed script fragment, truncated at BOTH ends —
# it opens mid-way through an argparse add_argument() call and is cut off
# inside sklearn's learning_curve(...) call. Left byte-identical.
# What is visible: parses CLI args, loads an energy-cut sim train/test split,
# builds a '<pipeline>_comp_<config>_<n>-groups' pipeline, and starts a
# learning-curve scan over 10 training-set fractions from 0.1 to 1.0.
default=10, type=int, help='Number CV folds to run') parser.add_argument('--n_jobs', dest='n_jobs', default=20, type=int, help='Number of jobs to run in parallel') args = parser.parse_args() comp_list = comp.get_comp_list(num_groups=args.num_groups) energybins = comp.get_energybins(args.config) # Load simulation data and pipeline df_sim_train, df_sim_test = comp.load_sim( config=args.config, log_energy_min=energybins.log_energy_min, log_energy_max=energybins.log_energy_max) feature_list, feature_labels = comp.get_training_features() # pipeline_str = 'LinearSVC_comp_{}_{}-groups'.format(args.config, args.num_groups) # pipeline_str = 'BDT_comp_{}_{}-groups'.format(args.config, args.num_groups) pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, args.config, args.num_groups) pipeline = comp.get_pipeline(pipeline_str) # Get learning curve scores X = df_sim_train[feature_list] y = df_sim_train['comp_target_{}'.format(args.num_groups)] train_sizes = np.linspace(0.1, 1.0, 10) train_sizes, train_scores, test_scores = learning_curve( estimator=pipeline,
# NOTE(review): whitespace-collapsed script fragment, cut off mid-way through
# the comp.data_functions.get_median_std(...) call at the end. Left
# byte-identical. Near-duplicate of the L8 fragment but uses
# comp.get_energybins() / comp.data_functions (different comptools API
# vintage — presumably a later refactor; verify).
# What is visible: per CLI-selected config, loads all simulation, builds
# light/heavy MC masks, and begins the Laputop energy-resolution calculation.
if __name__ == '__main__': description = 'Makes performance plots for IceTop Laputop reconstruction' parser = argparse.ArgumentParser(description=description) parser.add_argument('-c', '--config', dest='config', nargs='*', choices=comp.simfunctions.get_sim_configs(), help='Detector configuration') args = parser.parse_args() for config in args.config: df_sim = comp.load_sim(config=config, test_size=0, verbose=True) comp_list = ['light', 'heavy'] MC_comp_mask = {} for composition in comp_list: MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition light_mask = df_sim['MC_comp_class'] == 'light' heavy_mask = df_sim['MC_comp_class'] == 'heavy' energybins = comp.get_energybins() # Energy resolution energy_res = np.log10(df_sim['lap_energy'] / df_sim['MC_energy']) medians_light, stds_light, _ = comp.data_functions.get_median_std( df_sim['MC_log_energy'][light_mask], energy_res[light_mask],
import comptools.analysis.plotting as plotting

color_dict = comp.analysis.get_color_dict()

if __name__ == '__main__':
    # CLI: pick the detector configuration to plot feature importances for
    parser = argparse.ArgumentParser(
        description='Makes and saves feature importance plot')
    parser.add_argument('-c', '--config', dest='config',
                        choices=comp.simfunctions.get_sim_configs(),
                        help='Detector configuration')
    args = parser.parse_args()

    # Train the BDT pipeline on the simulation training split
    df_sim_train, df_sim_test = comp.load_sim(config=args.config)
    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    feature_list, feature_labels = comp.analysis.get_training_features()
    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])

    # Rank features by classifier importance (descending) and print them
    num_features = len(feature_list)
    importances = pipeline.named_steps['classifier'].feature_importances_
    indices = np.argsort(importances)[::-1]
    for rank in range(num_features):
        print('{}) {}'.format(rank + 1, importances[indices[rank]]))

    # Make feature importance plot
    fig, ax = plt.subplots()
def save_data_MC_plots(config, june_july_only):
    """Create and save a series of data/MC comparison plots for *config*.

    Loads IC86.2012 simulation and the data for *config*, optionally keeps
    only June/July data events, weights simulation by the flux model, and
    saves comparison plots (S125, dE/dX, zenith, in-ice radius, D4R peak
    energy/sigma) under the 'data-MC-comparison' figures directory.
    """
    df_sim = comp.load_sim(config='IC86.2012', test_size=0, verbose=False)
    df_data = comp.load_data(config=config, verbose=False)
    # Keep only events with a finite energy-loss measurement
    df_data = df_data[np.isfinite(df_data['log_dEdX'])]

    if june_july_only:
        print('Masking out all data events not in June or July')

        def is_june_july(time):
            i3_time = dataclasses.I3Time(time)
            return i3_time.date_time.month in [6, 7]

        june_july_mask = df_data.end_time_mjd.apply(is_june_july)
        df_data = df_data[june_july_mask].reset_index(drop=True)

    # Livetime must match the (possibly month-restricted) data selection
    months = (6, 7) if june_july_only else None
    livetime, livetime_err = comp.get_detector_livetime(config, months=months)

    # Weight simulation events by the assumed flux model
    weights = get_sim_weights(df_sim)
    df_sim['weights'] = flux(df_sim['MC_energy']) * weights

    MC_comp_mask = {}
    comp_list = ['PPlus', 'Fe56Nucleus']
    for composition in comp_list:
        MC_comp_mask[composition] = df_sim['MC_comp'] == composition

    # S125 data-MC plot
    log_s125_bins = np.linspace(-0.5, 3.5, 50)
    gs_s125 = plot_data_MC_comparison(df_sim, df_data, 'log_s125',
                                      log_s125_bins,
                                      '$\mathrm{\log_{10}(S_{125})}$',
                                      livetime, ylim_ratio=(0, 2))
    s125_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                's125_{}.png'.format(config))
    plt.savefig(s125_outfile)

    # dE/dX data-MC plot
    log_dEdX_bins = np.linspace(-2, 4, 50)
    gs_dEdX = plot_data_MC_comparison(df_sim, df_data, 'log_dEdX',
                                      log_dEdX_bins,
                                      '$\mathrm{\log_{10}(dE/dX)}$',
                                      livetime, ylim_ratio=(0, 5.5))
    dEdX_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                'dEdX_{}.png'.format(config))
    plt.savefig(dEdX_outfile)

    # cos(zenith) data-MC plot
    cos_zenith_bins = np.linspace(0.8, 1.0, 50)
    gs_zenith = plot_data_MC_comparison(df_sim, df_data, 'lap_cos_zenith',
                                        cos_zenith_bins,
                                        '$\mathrm{\cos(\\theta_{reco})}$',
                                        livetime, ylim_ratio=(0, 3))
    zenith_outfile = os.path.join(comp.paths.figures_dir,
                                  'data-MC-comparison',
                                  'zenith_{}.png'.format(config))
    plt.savefig(zenith_outfile)

    # InIce median radius data-MC plot
    # NOTE(review): x-label below duplicates the zenith label — looks
    # copy-pasted; confirm intended label before changing.
    inice_radius_bins = np.linspace(0, 200, 50)
    gs_inice_radius = plot_data_MC_comparison(
        df_sim, df_data, 'median_inice_radius', inice_radius_bins,
        '$\mathrm{\cos(\\theta_{reco})}$', livetime, ylim_ratio=(0, 3))
    inice_radius_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'median_inice_radius_{}.png'.format(config))
    plt.savefig(inice_radius_outfile)

    # log_d4r_peak_energy data-MC plot
    log_d4r_peak_energy_bins = np.linspace(-0.5, 3.5, 50)
    gs_d4R_peak_energy = plot_data_MC_comparison(
        df_sim, df_data, 'log_d4r_peak_energy', log_d4r_peak_energy_bins,
        '$\mathrm{\log_{10}(E_{D4R}/GeV)}$', livetime, ylim_ratio=(0, 5.5))
    d4R_peak_energy_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_energy_{}.png'.format(config))
    plt.savefig(d4R_peak_energy_outfile)

    # log_d4r_peak_sigma data-MC plot
    # NOTE(review): x-label below duplicates the D4R peak-energy label —
    # looks copy-pasted; confirm intended label before changing.
    log_d4r_peak_sigma_bins = np.linspace(-1, 3, 50)
    gs_d4R_peak_sigma = plot_data_MC_comparison(
        df_sim, df_data, 'log_d4r_peak_sigma', log_d4r_peak_sigma_bins,
        '$\mathrm{\log_{10}(E_{D4R}/GeV)}$', livetime, ylim_ratio=(0, 5.5))
    d4R_peak_sigma_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_sigma_{}.png'.format(config))
    plt.savefig(d4R_peak_sigma_outfile)