Code example #1
0
def calculate_ratio(flux, flux_err_stat, flux_err_sys, true_flux,
                    true_flux_err_stat, true_flux_err_sys):
    """Calculate the fractional difference of a flux w.r.t. a reference flux.

    Computes (flux - true_flux) / true_flux along with separately
    propagated statistical and systematic uncertainties on that ratio.

    Parameters
    ----------
    flux, flux_err_stat, flux_err_sys : array_like
        Measured flux and its statistical / systematic uncertainties.
    true_flux, true_flux_err_stat, true_flux_err_sys : array_like
        Reference flux and its statistical / systematic uncertainties.

    Returns
    -------
    frac_diff, frac_diff_stat, frac_diff_sys
        Fractional difference and its statistical / systematic errors.
    """
    diff = flux - true_flux
    # Uncertainties on the difference add in quadrature
    diff_err_sys = np.sqrt(flux_err_sys**2 + true_flux_err_sys**2)
    diff_err_stat = np.sqrt(flux_err_stat**2 + true_flux_err_stat**2)

    # The ratio itself is identical in both calls (same numerator and
    # denominator); only the propagated error differs. The original code
    # silently rebound frac_diff twice -- discard the first ratio
    # explicitly instead.
    _, frac_diff_sys = comp.ratio_error(diff, diff_err_sys, true_flux,
                                        true_flux_err_sys)
    frac_diff, frac_diff_stat = comp.ratio_error(diff, diff_err_stat,
                                                 true_flux,
                                                 true_flux_err_stat)

    return frac_diff, frac_diff_stat, frac_diff_sys
Code example #2
0
def get_frac_correct(df_train,
                     df_test,
                     pipeline_str=None,
                     num_groups=4,
                     energy_key='MC_log_energy'):
    """Compute the fraction of correctly classified events vs. energy.

    Fits the composition-classification pipeline on ``df_train``, predicts
    on ``df_test``, and, for every composition group (plus 'total'),
    histograms the correctly identified events in each energy bin. The
    statistical error on each fraction is propagated via
    ``comp.ratio_error``.

    Returns a dict mapping ``'frac_correct_<comp>'`` and
    ``'frac_correct_err_<comp>'`` to arrays binned in log-energy.
    """
    # Validate the requested energy variable
    if energy_key not in ('MC_log_energy', 'reco_log_energy'):
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Train the classifier and flag test events whose predicted
    # composition matches the true one
    feature_list, feature_labels = comp.get_training_features()
    pipeline = comp.get_pipeline(pipeline_str)
    comp_target_str = 'comp_target_{}'.format(num_groups)
    pipeline.fit(df_train[feature_list], df_train[comp_target_str])

    test_predictions = pipeline.predict(df_test[feature_list])
    correctly_identified_mask = (test_predictions == df_test[comp_target_str])

    log_energy_bins = energybins.log_energy_bins
    results = {}
    for composition in comp_list + ['total']:
        # NOTE(review): for 'total' this relies on the comp_group column
        # containing a 'total' label -- confirm (cf. the explicit all-True
        # mask used elsewhere in this file for the 'total' case).
        comp_mask = df_test['comp_group_{}'.format(num_groups)] == composition

        # All MC events of this composition, per energy bin
        num_MC_energy, _ = np.histogram(df_test.loc[comp_mask, energy_key],
                                        bins=log_energy_bins)

        # Correctly identified events of this composition, per energy bin
        correct_energies = df_test.loc[comp_mask & correctly_identified_mask,
                                       energy_key]
        num_reco_energy, _ = np.histogram(correct_energies,
                                          bins=log_energy_bins)

        # Fraction correct with Poisson (sqrt(N)) counting errors
        frac_correct, frac_correct_err = comp.ratio_error(
            num_reco_energy, np.sqrt(num_reco_energy),
            num_MC_energy, np.sqrt(num_MC_energy))
        results['frac_correct_{}'.format(composition)] = frac_correct
        results['frac_correct_err_{}'.format(composition)] = frac_correct_err

    return results
Code example #3
0
                    'livetime']
                # Total event rate of the 2012 reference configuration,
                # used as the denominator of the per-config rate ratio
                rate_2012 = df_flux_2012['counts_total'] / df_flux_2012[
                    'livetime']
                # NOTE(review): index 6 picks one specific energy bin for
                # the ratio -- confirm which bin this corresponds to
                ratio[config] = rate[6] / rate_2012[6]
        else:
            # No rescaling requested: unit ratio for every configuration
            ratio = {config: 1.0 for config in args.config}

        print(ratio)

        # Plot rate for each year on single plot
        fig, ax = plt.subplots()
        for composition in comp_list + ['total']:
            for config in args.config:
                df_flux_config = df_flux.loc[config]
                # Rate = counts / livetime, with errors propagated from
                # Poisson (sqrt(N)) counting errors and the livetime error
                rate, rate_err = comp.ratio_error(
                    df_flux_config['counts_' + composition],
                    np.sqrt(df_flux_config['counts_' + composition]),
                    df_flux_config['livetime'], df_flux_config['livetime_err'])
                plotting.plot_steps(energybins.log_energy_bins,
                                    rate,
                                    yerr=rate_err,
                                    ax=ax,
                                    color=df_flux_config[composition +
                                                         '_color'],
                                    label=config + ' ' + composition)
        # NOTE(review): 'nonposy' was renamed to 'nonpositive' in
        # matplotlib 3.3+ -- update if the pinned matplotlib requires it
        ax.set_yscale("log", nonposy='clip')
        # NOTE(review): these labels contain non-raw backslash escapes
        # (e.g. '\m'); prefer raw strings for LaTeX labels
        ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$')
        ax.set_ylabel('Rate $\mathrm{[s^{-1}]}$')
        ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
        # ax.set_ylim([10**3, 10**5])
        ax.grid(linestyle='dotted', which="both")
Code example #4
0
def fit_efficiencies(df_file=None,
                     config='IC86.2012',
                     num_groups=2,
                     sigmoid='slant',
                     n_samples=1000):
    """Calculate and fit detection efficiencies vs. energy per composition.

    Loads the simulation DataFrame, computes the per-energy-bin detection
    efficiency (passed / thrown showers) for each composition group plus
    'total', converts efficiencies to effective areas, fits a sigmoid-like
    function (from ``generate_fit_func``) to efficiency vs. log-energy,
    and estimates the fit uncertainty by refitting ``n_samples`` Gaussian
    fluctuations of the best-fit curve.

    Parameters
    ----------
    df_file : str, optional
        Path to the simulation DataFrame file passed to ``comp.load_sim``.
    config : str, optional
        Detector configuration (default 'IC86.2012').
    num_groups : int, optional
        Number of composition groups (default 2).
    sigmoid : str, optional
        Legacy fit-shape selector ('flat' / 'slant'); currently unused --
        the fit function now comes from ``generate_fit_func``. Kept for
        backward compatibility of the signature.
    n_samples : int, optional
        Number of random refits used to estimate the fit uncertainty.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The binned efficiencies restricted to the fit energy range, and a
        DataFrame with the median fit plus lower/upper (16%/84%) errors.
    """
    print('Loading df_file: {}'.format(df_file))

    comp_list = comp.get_comp_list(num_groups=num_groups)

    energybins = comp.get_energybins(config=config)
    # Want to include energy bins for energies below the normal analysis
    # energy range so we can get a better estimate of how the detector
    # efficiencies turn on
    low_energy_bins = np.arange(5.0, energybins.log_energy_min, 0.1)
    bins = np.concatenate((low_energy_bins, energybins.log_energy_bins))
    bin_midpoints = (bins[1:] + bins[:-1]) / 2

    df_sim = comp.load_sim(df_file=df_file,
                           config=config,
                           test_size=0,
                           log_energy_min=None,
                           log_energy_max=None)

    # Thrown areas are different for different energy bins
    thrown_radii = comp.simfunctions.get_sim_thrown_radius(bin_midpoints)
    thrown_areas = np.pi * thrown_radii**2
    thrown_areas_max = thrown_areas.max()

    # Calculate efficiencies and effective areas for each composition group
    efficiencies = pd.DataFrame()
    effective_area, effective_area_err = {}, {}
    for composition in comp_list + ['total']:
        compositions = df_sim['comp_group_{}'.format(num_groups)]
        # Need list of simulation sets for composition to get number of
        # thrown showers
        if composition == 'total':
            # All-True mask selecting every simulated event.
            # NOTE(review): np.full_like inherits the Series' dtype --
            # confirm this yields a boolean-usable mask here.
            comp_mask = np.full_like(compositions, True)
        else:
            comp_mask = compositions == composition
        sim_list = df_sim.loc[comp_mask, 'sim'].unique()
        thrown_showers = thrown_showers_per_ebin(sim_list,
                                                 log_energy_bins=bins)
        print('thrown_showers ({}) = {}'.format(composition, thrown_showers))
        passed_showers = np.histogram(df_sim.loc[comp_mask, 'MC_log_energy'],
                                      bins=bins)[0]

        # Efficiency = passed / thrown, with Poisson (sqrt(N)) errors
        efficiency, efficiency_err = comp.ratio_error(
            num=passed_showers,
            num_err=np.sqrt(passed_showers),
            den=thrown_showers,
            den_err=np.sqrt(thrown_showers))

        # Calculate effective area from efficiencies and thrown areas
        effective_area[composition] = efficiency * thrown_areas
        effective_area_err[composition] = efficiency_err * thrown_areas

        # Scale efficiencies by geometric factor to take into account
        # different simulated thrown radii
        thrown_radius_factor = thrown_areas / thrown_areas_max
        efficiencies['eff_{}'.format(
            composition)] = efficiency * thrown_radius_factor
        efficiencies['eff_err_{}'.format(
            composition)] = efficiency_err * thrown_radius_factor

    # Fit sigmoid function to efficiency vs. energy distribution
    poly_degree = 1
    num_params = poly_degree + 3
    fit_func = generate_fit_func(degree=poly_degree)
    # Initial guess for the fit parameters.
    # NOTE(review): if num_params ever exceeds len(init_params), the tail
    # of p0 stays uninitialized (np.empty) -- supply defaults if poly_degree
    # is ever raised above 1.
    init_params = [8.5, 50.0, 7e4, 800]
    p0 = np.empty(num_params)
    p0[:min(num_params, len(init_params))] = init_params[:num_params]

    efficiencies_fit = {}
    energy_min_fit, energy_max_fit = 5.8, energybins.log_energy_max
    midpoints_fitmask = np.logical_and(bin_midpoints > energy_min_fit,
                                       bin_midpoints < energy_max_fit)
    # Find best-fit sigmoid function
    for composition in comp_list + ['total']:
        eff = efficiencies.loc[midpoints_fitmask, 'eff_{}'.format(composition)]
        eff_err = efficiencies.loc[midpoints_fitmask,
                                   'eff_err_{}'.format(composition)]
        popt, pcov = curve_fit(fit_func,
                               bin_midpoints[midpoints_fitmask],
                               eff,
                               p0=p0,
                               sigma=eff_err)
        eff_fit = fit_func(bin_midpoints, *popt)
        efficiencies_fit[composition] = eff_fit

        # Goodness of fit evaluated over the fitted energy range only
        chi2 = np.sum((eff - eff_fit[midpoints_fitmask])**2 / (eff_err)**2)
        ndof = len(eff_fit[midpoints_fitmask]) - len(p0)
        print('({}) chi2 / ndof = {} / {} = {}'.format(composition, chi2, ndof,
                                                       chi2 / ndof))

    # Perform many fits to random statistical fluctuations of the best fit
    # efficiency. This will be used to estimate the uncertainty in the best
    # fit efficiency. Fixed seed keeps the error band reproducible.
    np.random.seed(2)
    efficiencies_fit_samples = defaultdict(list)
    # BUG FIX: xrange is Python-2-only; range is behaviorally equivalent
    # here and works on both Python 2 and 3.
    for _ in range(n_samples):
        for composition in comp_list + ['total']:
            # Get new random sample to fit
            eff_err = efficiencies.loc[midpoints_fitmask,
                                       'eff_err_{}'.format(composition)]
            eff_sample = np.random.normal(
                efficiencies_fit[composition][midpoints_fitmask], eff_err)
            # Fit with error bars
            popt, pcov = curve_fit(fit_func,
                                   bin_midpoints[midpoints_fitmask],
                                   eff_sample,
                                   p0=p0,
                                   sigma=eff_err)

            eff_fit_sample = fit_func(bin_midpoints, *popt)
            efficiencies_fit_samples[composition].append(eff_fit_sample)

    # Calculate median and 16%/84% percentile band of the efficiency fits
    eff_fit = pd.DataFrame()
    for composition in comp_list + ['total']:
        fit_median, fit_err_low, fit_err_high = np.percentile(
            efficiencies_fit_samples[composition], (50, 16, 84), axis=0)
        fit_err_low = np.abs(fit_err_low - fit_median)
        fit_err_high = np.abs(fit_err_high - fit_median)

        eff_fit['eff_median_{}'.format(composition)] = fit_median
        eff_fit['eff_err_low_{}'.format(composition)] = fit_err_low
        eff_fit['eff_err_high_{}'.format(composition)] = fit_err_high

    return efficiencies.loc[midpoints_fitmask, :], eff_fit
Code example #5
0
def get_classified_fractions(df_train,
                             df_test,
                             pipeline_str=None,
                             num_groups=4,
                             energy_key='MC_log_energy'):
    """Compute the classification-matrix fractions vs. energy.

    For every (true, identified) composition pair, calculates the fraction
    of test events with the given true composition that were classified as
    the identified composition in each energy bin, with the statistical
    error on that fraction. Note ``df_train`` is not used in this
    function: predictions come from a pre-trained model loaded by name, or
    from a CustomClassifier pipeline built fresh.

    Returns a dict keyed ``'true_<a>_identified_<b>'`` and
    ``'true_<a>_identified_<b>_err'``.
    """
    # Validate the requested energy variable
    if energy_key not in ('MC_log_energy', 'reco_log_energy'):
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    feature_list, feature_labels = comp.get_training_features()
    comp_target_str = 'comp_target_{}'.format(num_groups)

    # CustomClassifier pipelines are constructed fresh; anything else is
    # loaded as a previously trained model.
    use_custom = 'CustomClassifier' in pipeline_str
    if use_custom:
        pipeline = comp.get_pipeline(pipeline_str)
        test_predictions = pipeline.predict(df_test[comp_target_str])
    else:
        pipeline = comp.load_trained_model(pipeline_str)
        test_predictions = pipeline.predict(df_test[feature_list])

    # Map encoded predictions back to composition-group labels
    pred_comp = np.array(
        comp.decode_composition_groups(test_predictions,
                                       num_groups=num_groups))

    bins = energybins.log_energy_bins
    true_comp_col = 'comp_group_{}'.format(num_groups)
    data = {}
    for true_comp, ident_comp in product(comp_list, comp_list):
        true_mask = df_test[true_comp_col] == true_comp
        ident_mask = pred_comp == ident_comp

        # All events with this true composition, per energy bin
        num_true, _ = np.histogram(df_test.loc[true_mask, energy_key],
                                   bins=bins)

        # Events with this true composition classified as ident_comp
        num_ident, _ = np.histogram(
            df_test.loc[true_mask & ident_mask, energy_key], bins=bins)

        # Fraction with Poisson (sqrt(N)) counting errors
        frac, frac_err = comp.ratio_error(num_ident, np.sqrt(num_ident),
                                          num_true, np.sqrt(num_true))
        data['true_{}_identified_{}'.format(true_comp, ident_comp)] = frac
        data['true_{}_identified_{}_err'.format(true_comp,
                                                ident_comp)] = frac_err

    return data