Example #1
def get_config_flux(config):

    sim_config = data_config_to_sim_config(config)

    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    energybins = comp.analysis.get_energybins()
    # Load simulation and training features
    df_sim_train, df_sim_test = comp.load_sim(config=sim_config, verbose=False)
    feature_list, feature_labels = comp.analysis.get_training_features()
    # Load data
    df_data = comp.load_data(config=config)
    X_data = comp.dataframe_functions.dataframe_to_array(
        df_data, feature_list + ['lap_log_energy'])
    log_energy = X_data[:, -1]
    X_data = X_data[:, :-1]

    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    data_predictions = pipeline.predict(X_data)
    # Get composition masks
    data_labels = np.array([
        comp.dataframe_functions.label_to_comp(pred)
        for pred in data_predictions
    ])
    data_light_mask = data_labels == 'light'
    data_heavy_mask = data_labels == 'heavy'
    # Get number of identified comp in each energy bin
    df_flux = {}
    comp_list = ['light', 'heavy']
    for composition in comp_list:
        comp_mask = data_labels == composition
        df_flux['counts_' + composition] = np.histogram(
            log_energy[comp_mask], bins=energybins.log_energy_bins)[0]
        df_flux['counts_' + composition + '_err'] = np.sqrt(
            df_flux['counts_' + composition])

    df_flux['counts_total'] = np.histogram(log_energy,
                                           bins=energybins.log_energy_bins)[0]
    df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])
    # Solid angle
    max_zenith_rad = df_sim_train['lap_zenith'].max()
    solid_angle = 2 * np.pi * (1 - np.cos(max_zenith_rad))
    df_flux['solid_angle'] = solid_angle
    # Livetime
    livetime, livetime_err = comp.get_detector_livetime(config=config)
    df_flux['livetime'] = livetime
    df_flux['livetime_err'] = livetime_err

    return df_flux
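The dictionary above stops at raw counts, solid angle, and livetime. A minimal sketch of the remaining step to a differential flux, assuming an effective area (not provided by this snippet) and linear-energy bin edges:

import numpy as np


def counts_to_flux(counts, counts_err, energy_bin_edges,
                   effective_area, solid_angle, livetime):
    """Differential flux J(E) = N / (dE * A_eff * solid_angle * T).

    `energy_bin_edges` are linear energies (e.g. GeV) and `effective_area`
    is a scalar or per-bin area; both are assumptions of this sketch.
    """
    energy_bin_widths = np.diff(energy_bin_edges)
    norm = energy_bin_widths * effective_area * solid_angle * livetime
    return counts / norm, counts_err / norm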
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        dest='config',
                        choices=comp.simfunctions.get_sim_configs(),
                        help='Detector configuration')
    parser.add_argument('--n_jobs',
                        dest='n_jobs',
                        type=int,
                        default=10,
                        help='Number of jobs to run in parallel')
    args = parser.parse_args()

    config = args.config
    energybins = comp.get_energybins(config=config)
    log_energy_min = energybins.log_energy_min
    log_energy_max = energybins.log_energy_max

    print('Loading full pre-processed dataset for {} into memory...'.format(
        config))
    df_data = comp.load_data(config=config,
                             processed=False,
                             energy_reco=True,
                             energy_cut_key='reco_log_energy',
                             log_energy_min=log_energy_min,
                             log_energy_max=log_energy_max,
                             n_jobs=args.n_jobs,
                             verbose=True)

    outfile = os.path.join(comp.paths.comp_data_dir, config, 'data',
                           'data_dataframe_quality_cuts.hdf')
    comp.check_output_dir(outfile)
    print('Saving processed dataset to {}...'.format(outfile))
    df_data.to_hdf(outfile, 'dataframe', format='table')
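A short read-back sketch for downstream scripts, using the `outfile` path and the 'dataframe' key written above:

import pandas as pd

# Reload the quality-cut DataFrame saved above.
df_data = pd.read_hdf(outfile, 'dataframe')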
    response, response_err = comp.response_matrix(
        true_energy=log_true_energy_sim_test,
        reco_energy=log_reco_energy_sim_test,
        true_target=true_target,
        pred_target=pred_target,
        efficiencies=efficiencies,
        efficiencies_err=efficiencies_err,
        energy_bins=energybins.log_energy_bins)
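For context, a response matrix of this kind is roughly a 2D (reco, true) histogram whose true-energy columns are normalized and then scaled by the detection efficiencies. A simplified energy-only numpy sketch (not the project's `comp.response_matrix`, which also interleaves the composition groups):

import numpy as np


def simple_response_matrix(true_energy, reco_energy, energy_bins, efficiencies):
    """Column-normalized P(reco | true), scaled by per-true-bin efficiencies."""
    hist, _, _ = np.histogram2d(reco_energy, true_energy,
                                bins=[energy_bins, energy_bins])
    column_sums = hist.sum(axis=0)
    column_sums[column_sums == 0] = 1.0  # avoid dividing empty true-energy columns
    return (hist / column_sums) * efficiencies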

    # Run analysis pipeline on data
    print('Loading data into memory...')
    df_data = comp.load_data(config=config,
                             energy_reco=False,
                             log_energy_min=None,
                             log_energy_max=None,
                             columns=feature_list,
                             n_jobs=10,
                             verbose=True)

    print('Running energy and composition reconstructions...')
    df_data['pred_comp_target'] = comp_pipeline.predict(
        df_data[feature_list].values)
    df_data['reco_log_energy'] = energy_pipeline.predict(
        df_data[feature_list].values)

    counts_observed = {}
    counts_observed_err = {}
    for idx, composition in enumerate(comp_list):
        # Filter out events that don't pass composition & energy mask
        pred_comp_mask = df_data['pred_comp_target'] == idx
        comp_energies = df_data.loc[pred_comp_mask, 'reco_log_energy']
        counts_observed[composition] = np.histogram(
            comp_energies, bins=energybins.log_energy_bins)[0]
        counts_observed_err[composition] = np.sqrt(counts_observed[composition])
Example #4
    df_eff = pd.read_hdf(eff_path)
    # Format detection efficiencies for PyUnfold use
    efficiencies = np.empty(num_groups * len(energybins.energy_midpoints))
    efficiencies_err = np.empty(num_groups * len(energybins.energy_midpoints))
    for idx, composition in enumerate(comp_list):
        efficiencies[idx::num_groups] = df_eff['eff_median_{}'.format(
            composition)]
        efficiencies_err[idx::num_groups] = df_eff['eff_err_low_{}'.format(
            composition)]

    # Load data DataFrame
    print('Loading data DataFrame...')
    df_data = comp.load_data(config=config,
                             columns=feature_list,
                             energy_cut_key='reco_log_energy',
                             log_energy_min=log_energy_min,
                             log_energy_max=log_energy_max,
                             n_jobs=n_jobs,
                             verbose=True)

    X_data = comp.io.dataframe_to_array(df_data,
                                        feature_list + ['reco_log_energy'])
    log_energy_data = X_data[:, -1]
    X_data = X_data[:, :-1]

    print('Making composition predictions on data...')
    # Apply pipeline.predict method in chunks for parallel predicting
    X_da = da.from_array(X_data, chunks=(len(X_data) // 100, X_data.shape[1]))
    data_predictions = da.map_blocks(pipeline.predict,
                                     X_da,
                                     dtype=int,
                                     drop_axis=1)
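The map_blocks call above only builds a lazy task graph. A minimal, self-contained sketch (assuming a fitted scikit-learn-style pipeline) of running the chunked prediction and materializing the labels with the threaded scheduler:

import dask.array as da
from dask.diagnostics import ProgressBar


def chunked_predict(pipeline, X, n_chunks=100):
    """Evaluate pipeline.predict over row chunks of X in parallel."""
    chunk_rows = max(len(X) // n_chunks, 1)
    X_da = da.from_array(X, chunks=(chunk_rows, X.shape[1]))
    pred_da = da.map_blocks(pipeline.predict, X_da, dtype=int, drop_axis=1)
    with ProgressBar():
        return pred_da.compute(scheduler='threads')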
def save_data_MC_plots(config, june_july_only):

    df_sim = comp.load_sim(config='IC86.2012', test_size=0, verbose=False)
    # energy_mask_sim = (df_sim['lap_log_energy'] > 6.0)
    # energy_mask_sim = (df_sim['lap_log_energy'] > 6.4) & (df_sim['lap_log_energy'] < 8.0)
    # df_sim = df_sim[energy_mask_sim]

    df_data = comp.load_data(config=config, verbose=False)
    df_data = df_data[np.isfinite(df_data['log_dEdX'])]
    # energy_mask_data = (df_data['lap_log_energy'] > 6.4) & (df_data['lap_log_energy'] < 8.0)
    # df_data = df_data[energy_mask_data]

    if june_july_only:
        print('Masking out all data events not in June or July')

        def is_june_july(time):
            i3_time = dataclasses.I3Time(time)
            return i3_time.date_time.month in [6, 7]

        june_july_mask = df_data.end_time_mjd.apply(is_june_july)
        df_data = df_data[june_july_mask].reset_index(drop=True)

    months = (6, 7) if june_july_only else None
    livetime, livetime_err = comp.get_detector_livetime(config, months=months)

    weights = get_sim_weights(df_sim)
    df_sim['weights'] = flux(df_sim['MC_energy']) * weights

    MC_comp_mask = {}
    comp_list = ['PPlus', 'Fe56Nucleus']
    for composition in comp_list:
        MC_comp_mask[composition] = df_sim['MC_comp'] == composition
    #     MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition

    # S125 data-MC plot
    log_s125_bins = np.linspace(-0.5, 3.5, 50)
    gs_s125 = plot_data_MC_comparison(df_sim,
                                      df_data,
                                      'log_s125',
                                      log_s125_bins,
                                       r'$\mathrm{\log_{10}(S_{125})}$',
                                      livetime,
                                      ylim_ratio=(0, 2))
    s125_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                's125_{}.png'.format(config))
    plt.savefig(s125_outfile)

    # dE/dX data-MC plot
    log_dEdX_bins = np.linspace(-2, 4, 50)
    gs_dEdX = plot_data_MC_comparison(df_sim,
                                      df_data,
                                      'log_dEdX',
                                      log_dEdX_bins,
                                       r'$\mathrm{\log_{10}(dE/dX)}$',
                                      livetime,
                                      ylim_ratio=(0, 5.5))
    dEdX_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                'dEdX_{}.png'.format(config))
    plt.savefig(dEdX_outfile)

    # cos(zenith) data-MC plot
    cos_zenith_bins = np.linspace(0.8, 1.0, 50)
    gs_zenith = plot_data_MC_comparison(df_sim,
                                        df_data,
                                        'lap_cos_zenith',
                                        cos_zenith_bins,
                                         r'$\mathrm{\cos(\theta_{reco})}$',
                                        livetime,
                                        ylim_ratio=(0, 3))
    zenith_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                  'zenith_{}.png'.format(config))
    plt.savefig(zenith_outfile)

    # InIce median radius data-MC plot
    inice_radius_bins = np.linspace(0, 200, 50)
    gs_inice_radius = plot_data_MC_comparison(
        df_sim,
        df_data,
        'median_inice_radius',
        inice_radius_bins,
        'Median in-ice radius',
        livetime,
        ylim_ratio=(0, 3))
    inice_radius_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'median_inice_radius_{}.png'.format(config))
    plt.savefig(inice_radius_outfile)

    # log_d4r_peak_energy data-MC plot
    log_d4r_peak_energy_bins = np.linspace(-0.5, 3.5, 50)
    gs_d4R_peak_energy = plot_data_MC_comparison(
        df_sim,
        df_data,
        'log_d4r_peak_energy',
        log_d4r_peak_energy_bins,
        r'$\mathrm{\log_{10}(E_{D4R}/GeV)}$',
        livetime,
        ylim_ratio=(0, 5.5))
    d4R_peak_energy_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_energy_{}.png'.format(config))
    plt.savefig(d4R_peak_energy_outfile)

    # log_d4r_peak_sigma data-MC plot
    log_d4r_peak_sigma_bins = np.linspace(-1, 3, 50)
    gs_d4R_peak_sigma = plot_data_MC_comparison(
        df_sim,
        df_data,
        'log_d4r_peak_sigma',
        log_d4r_peak_sigma_bins,
        r'$\mathrm{\log_{10}(\sigma_{D4R})}$',
        livetime,
        ylim_ratio=(0, 5.5))
    d4R_peak_sigma_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_sigma_{}.png'.format(config))
    plt.savefig(d4R_peak_sigma_outfile)
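The weighting step in save_data_MC_plots relies on helpers (get_sim_weights, flux) that are not shown. A toy sketch assuming a simple power-law spectrum, with placeholder normalization and spectral index rather than the analysis values:

import numpy as np


def power_law_flux(energy, phi0=1e-6, gamma=2.7):
    """Toy differential flux dN/dE = phi0 * E**-gamma (placeholder constants)."""
    return phi0 * np.asarray(energy, dtype=float) ** -gamma


# Hypothetical usage mirroring the weighting above:
# df_sim['weights'] = power_law_flux(df_sim['MC_energy']) * get_sim_weights(df_sim)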