def get_composition_pipeline(pipeline, p, use_sample_weights):
    """Return a composition-classification pipeline, optionally re-fit with
    per-event sample weights.

    Relies on module-level names defined elsewhere in this script:
    ``config``, ``num_groups``, ``df_sim_train``, ``feature_list``,
    ``calculate_sample_weights`` and ``comp``.

    Parameters
    ----------
    pipeline : str or fitted model
        Pipeline name prefix used to build the trained-model identifier
        ('<pipeline>_comp_<config>_<num_groups>-groups').
    p :
        If None, a pre-trained model is loaded from disk.
    use_sample_weights :
        If not None, passed as the ``model`` argument to
        ``calculate_sample_weights`` and a fresh pipeline is fit on
        ``df_sim_train`` with those weights.

    Returns
    -------
    pipeline
        The loaded, or freshly constructed and fitted, pipeline.
    """
    if p is None:
        pipeline_str = '{}_comp_{}_{}-groups'.format(pipeline, config, num_groups)
        pipeline = comp.load_trained_model(pipeline_str)
    if use_sample_weights is not None:
        model = use_sample_weights
        # NOTE(review): if the branch above ran, `pipeline` has been rebound to
        # the loaded model object (not the original name string), so this
        # format call embeds the object's repr in pipeline_str — confirm this
        # is intended.
        pipeline_str = '{}_comp_{}_{}-groups'.format(pipeline, config, num_groups)
        pipeline = comp.get_pipeline(pipeline_str)
        compositions = df_sim_train['comp_group_{}'.format(num_groups)].values
        energies = df_sim_train['reco_energy'].values
        sample_weight = calculate_sample_weights(compositions, energies,
                                                 model=model)
        X = df_sim_train[feature_list].values
        y = df_sim_train['comp_target_{}'.format(num_groups)].values
        # Sample weights are forwarded to the pipeline's final
        # 'classifier' step via the step-prefixed fit parameter.
        fit_params = {'classifier__sample_weight': sample_weight}
        pipeline.fit(X, y, **fit_params)
    return pipeline
data_dir = os.path.join(comp.paths.comp_data_dir, config, 'unfolding', 'datachallenge') # Load simulation and train composition classifier df_sim_train, df_sim_test = comp.load_sim(config=config, energy_reco=False, log_energy_min=None, log_energy_max=None, test_size=0.5, verbose=True) feature_list, feature_labels = comp.get_training_features() print('Loading energy regressor...') energy_pipeline = comp.load_trained_model( 'linearregression_energy_{}'.format(config)) # energy_pipeline = comp.load_trained_model('RF_energy_{}'.format(config)) for df in [df_sim_train, df_sim_test]: df['reco_log_energy'] = energy_pipeline.predict( df[feature_list].values) df['reco_energy'] = 10**df['reco_log_energy'] print('Loading or fitting composition classifier...') if any([ args.weights_model, args.energy_spectrum_weights, args.compositon_weights ]): model = args.weights_model energy_spectrum_weights = args.energy_spectrum_weights compositon_weights = args.compositon_weights
def get_classified_fractions(df_train, df_test, pipeline_str=None, num_groups=4,
                             energy_key='MC_log_energy'):
    '''Calculate the fraction of correctly identified samples per energy bin.

    For every (true composition, identified composition) pair in
    ``product(comp_list, comp_list)``, computes the per-energy-bin fraction
    of events of the true composition that the classifier labels as the
    identified composition, along with its statistical error from
    ``comp.ratio_error``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training sample (currently unused; kept for interface compatibility).
    df_test : pandas.DataFrame
        Test sample containing the training features, true composition-group
        columns, and the energy column named by ``energy_key``.
    pipeline_str : str, optional
        Name of the classification pipeline. Defaults to
        'BDT_comp_IC86.2012_<num_groups>-groups'.
    num_groups : int, optional
        Number of composition groups (default 4).
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Energy column used for the histogram binning.

    Returns
    -------
    dict
        Keys 'true_<a>_identified_<b>' map to the per-bin fraction arrays and
        'true_<a>_identified_<b>_err' to their statistical errors.

    Raises
    ------
    ValueError
        If ``energy_key`` is not 'MC_log_energy' or 'reco_log_energy'.
    '''
    # Input validation
    if energy_key not in ['MC_log_energy', 'reco_log_energy']:
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Load the pipeline and classify the test sample. The two adjacent
    # 'CustomClassifier' checks of the original are merged into one branch,
    # and the previously-unused comp_target_str now replaces the duplicated
    # inline format string.
    feature_list, feature_labels = comp.get_training_features()
    comp_target_str = 'comp_target_{}'.format(num_groups)
    if 'CustomClassifier' in pipeline_str:
        # Custom classifiers are constructed fresh and "predict" directly
        # from the true target column rather than from the features.
        pipeline = comp.get_pipeline(pipeline_str)
        test_predictions = pipeline.predict(df_test[comp_target_str])
    else:
        pipeline = comp.load_trained_model(pipeline_str)
        test_predictions = pipeline.predict(df_test[feature_list])
    pred_comp = np.array(
        comp.decode_composition_groups(test_predictions,
                                       num_groups=num_groups))

    data = {}
    for true_composition, identified_composition in product(
            comp_list, comp_list):
        true_comp_mask = df_test['comp_group_{}'.format(
            num_groups)] == true_composition
        ident_comp_mask = pred_comp == identified_composition
        # Number of events of the true composition in each energy bin
        num_true_comp, _ = np.histogram(df_test.loc[true_comp_mask, energy_key],
                                        bins=energybins.log_energy_bins)
        num_true_comp_err = np.sqrt(num_true_comp)
        # Number of those classified as identified_composition, per bin
        combined_mask = true_comp_mask & ident_comp_mask
        num_identified_comp, _ = np.histogram(
            df_test.loc[combined_mask, energy_key],
            bins=energybins.log_energy_bins)
        num_identified_comp_err = np.sqrt(num_identified_comp)
        # Identified fraction with propagated Poisson errors
        frac_identified, frac_identified_err = comp.ratio_error(
            num_identified_comp, num_identified_comp_err,
            num_true_comp, num_true_comp_err)
        data['true_{}_identified_{}'.format(
            true_composition, identified_composition)] = frac_identified
        data['true_{}_identified_{}_err'.format(
            true_composition, identified_composition)] = frac_identified_err

    return data
'unfolding-df_{}-groups.hdf'.format(num_groups))
df = pd.read_hdf(df_file)

# Load simulation and train composition classifier
df_sim_train, df_sim_test = comp.load_sim(config=config,
                                          energy_reco=False,
                                          log_energy_min=None,
                                          log_energy_max=None,
                                          test_size=0.5,
                                          n_jobs=10,
                                          verbose=True)
feature_list, feature_labels = comp.get_training_features()

print('Running energy reconstruction...')
energy_pipeline = comp.load_trained_model('RF_energy_{}'.format(config))
# NOTE(review): the loop variable shadows the `df` loaded from df_file above;
# if that DataFrame is needed later, rename one of the two.
for df in [df_sim_train, df_sim_test]:
    # NOTE(review): `X` is computed from df_sim_train (not the loop's `df`)
    # and is never used below — looks like dead/leftover code; confirm and
    # remove.
    X = df_sim_train[feature_list].values
    # Energy reconstruction
    df['reco_log_energy'] = energy_pipeline.predict(
        df[feature_list].values)
    df['reco_energy'] = 10**df['reco_log_energy']

# Detector efficiencies in PyUnfold format, with a slant-depth sigmoid fit
efficiencies, efficiencies_err = comp.get_detector_efficiencies(
    config=config,
    num_groups=num_groups,
    sigmoid='slant',
    pyunfold_format=True)

print('Running composition classifications...')
pipeline_str = 'xgboost_comp_{}_{}-groups'.format(config, num_groups)
# processed=False, test_size=0, energy_reco=False, log_energy_min=None, log_energy_max=None, compute=False) # ddf = comp.load_data(config=config, # processed=False, # energy_reco=False, # log_energy_min=None, # log_energy_max=None, # compute=False) # Energy reconstruction model energy_pipeline = comp.load_trained_model( 'linearregression_energy_{}'.format(config), return_metadata=False) for shift_type in ['up', 'down']: print('Processing VEM calibration {} shifted dataset...'.format( shift_type)) s125_scaling_factor = 1.03 if shift_type == 'up' else 0.97 # Process data: # - Shift S125 value to account for VEM calibration systematic uncertainty # - Energy reconstruction # - Energy range cut ddf_systematic = (ddf.assign( lap_s125=s125_scaling_factor * ddf.lap_s125).assign( log_s125=lambda x: da.log10(x.lap_s125)).map_partitions( add_reco_energy, energy_pipeline, feature_list).map_partitions(apply_energy_cut,