Example #1
def main():
    # Read arguments
    parser = argparse.ArgumentParser(description='Make performance files')
    parser.add_argument('--config_file', type=str, required=True,
                        help='Configuration file')
    parser.add_argument(
        '--obs_time',
        type=str,
        required=True,
        help='Observation time, given as a string with the value and the '
        'astropy unit separated by a space (e.g. "50 h")')
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument('--wave',
                            dest="mode",
                            action='store_const',
                            const="wave",
                            default="tail",
                            help="if set, use wavelet cleaning")
    mode_group.add_argument(
        '--tail',
        dest="mode",
        action='store_const',
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets")
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Add obs. time in configuration file
    str_obs_time = args.obs_time.split()
    cfg['analysis']['obs_time'] = {
        'value': float(str_obs_time[0]),
        'unit': str(str_obs_time[-1])
    }
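    # e.g. (hypothetical CLI value) --obs_time "50 h" yields
    # {'value': 50.0, 'unit': 'h'}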

    # Create output directory if necessary
    outdir = os.path.join(
        cfg['general']['outdir'], 'irf_{}_ThSq_{}_Time{:.2f}{}'.format(
            args.mode, cfg['analysis']['thsq_opt']['type'],
            cfg['analysis']['obs_time']['value'],
            cfg['analysis']['obs_time']['unit']))
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    indir = cfg['general']['indir']
    template_input_file = cfg['general']['template_input_file']

    # Load data
    particles = ['gamma', 'electron', 'proton']
    evt_dict = dict()  # Contains the DL2 events for each type of particle
    for particle in particles:
        # template looks like dl2_{}_{}_merged.h5
        infile = os.path.join(indir,
                              template_input_file.format(args.mode, particle))
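        # e.g. (assuming mode="tail", particle="gamma") -> dl2_tail_gamma_merged.h5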
        evt_dict[particle] = pd.read_hdf(infile, key='reco_events')

    # Apply offset cut to proton and electron
    for particle in ['electron', 'proton']:
        # print('Initial stat: {} {}'.format(len(evt_dict[particle]), particle))
        evt_dict[particle] = evt_dict[particle].query('offset <= {}'.format(
            cfg['particle_information'][particle]['offset_cut']))

    # Add required data in configuration file for future computation
    for particle in particles:
        cfg['particle_information'][particle]['n_files'] = \
            len(np.unique(evt_dict[particle]['obs_id']))
        cfg['particle_information'][particle]['n_simulated'] = \
            cfg['particle_information'][particle]['n_files'] * cfg['particle_information'][particle]['n_events_per_file']

    # Define model for the particles
    model_dict = {
        'gamma': CrabSpectrum('hegra').model,
        'proton': cosmic_ray_flux,
        'electron': cosmic_ray_flux
    }

    # Reco energy binning
    cfg_binning = cfg['analysis']['ereco_binning']
    ereco = np.logspace(np.log10(cfg_binning['emin']),
                        np.log10(cfg_binning['emax']),
                        cfg_binning['nbin'] + 1) * u.TeV
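    # e.g. (hypothetical config) emin=0.02, emax=200, nbin=4 gives the 5
    # log-spaced edges [0.02, 0.2, 2, 20, 200] TeV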

    # Handle theta square cut optimisation
    # (compute 68 % containment radius PSF if necessary)
    thsq_opt_type = cfg['analysis']['thsq_opt']['type']
    if thsq_opt_type == 'fixed':
        thsq_values = np.array([cfg['analysis']['thsq_opt']['value']]) * u.deg
        print('Using fixed theta cut: {}'.format(thsq_values))
    elif thsq_opt_type == 'opti':
        thsq_values = np.arange(0.05, 0.40, 0.01) * u.deg
        print('Optimising theta cut for: {}'.format(thsq_values))
    elif thsq_opt_type == 'r68':
        print('Using R68% theta cut')
        print('Computing...')
        cfg_binning = cfg['analysis']['ereco_binning']
        ereco = np.logspace(np.log10(cfg_binning['emin']),
                            np.log10(cfg_binning['emax']),
                            cfg_binning['nbin'] + 1) * u.TeV
        radius = 68

        thsq_values = list()
        for ibin in range(len(ereco) - 1):
            emin = ereco[ibin]
            emax = ereco[ibin + 1]

            energy_query = 'reco_energy > {} and reco_energy <= {}'.format(
                emin.value, emax.value)
            data = evt_dict['gamma'].query(energy_query).copy()

            min_stat = 0
            if len(data) <= min_stat:
                print('  ==> Not enough statistics: falling back to 0.3 deg')
                thsq_values.append(0.3)
                continue

            psf = np.percentile(data['offset'], radius)
            psf_err = psf / np.sqrt(len(data))
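            # psf_err is a rough 1/sqrt(N) uncertainty on the containment
            # radius; it is computed for reference but not used below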

            thsq_values.append(psf)
        thsq_values = np.array(thsq_values) * u.deg
        # Enforce 0.05 deg as a lower bound
        idx = np.where(thsq_values.value < 0.05)
        thsq_values[idx] = 0.05 * u.deg
        print('Using theta cut: {}'.format(thsq_values))

    # Cuts optimisation
    print('### Finding best cuts...')
    cut_optimiser = CutsOptimisation(config=cfg,
                                     evt_dict=evt_dict,
                                     verbose_level=0)

    # Weight events
    print('- Weighting events...')
    cut_optimiser.weight_events(
        model_dict=model_dict,
        colname_mc_energy=cfg['column_definition']['mc_energy'])

    # Find best cutoff to reach best sensitivity
    print('- Estimating cutoffs...')
    cut_optimiser.find_best_cutoff(energy_values=ereco,
                                   angular_values=thsq_values)

    # Save results and auxiliary data for diagnostic
    print('- Saving results to disk...')
    cut_optimiser.write_results(outdir,
                                '{}.fits'.format(
                                    cfg['general']['output_table_name']),
                                format='fits')

    # Cuts diagnostic
    print('### Building cut diagnostics...')
    cut_diagnostic = CutsDiagnostic(config=cfg, indir=outdir)
    cut_diagnostic.plot_optimisation_summary()
    cut_diagnostic.plot_diagnostics()

    # Apply cuts and save data
    print('### Applying cuts to data...')
    cut_applicator = CutsApplicator(config=cfg,
                                    evt_dict=evt_dict,
                                    outdir=outdir)
    cut_applicator.apply_cuts()

    # Irf Maker
    print('### Building IRF...')
    irf_maker = IrfMaker(config=cfg, evt_dict=evt_dict, outdir=outdir)
    irf_maker.build_irf()

    # Sensitivity maker
    print('### Estimating sensitivity...')
    sensitivity_maker = SensitivityMaker(config=cfg, outdir=outdir)
    sensitivity_maker.load_irf()
    sensitivity_maker.estimate_sensitivity()
Example #2
def main():

    # INITIALIZE CLI arguments
    args = initialize_script_arguments()

    # LOAD CONFIGURATION FILE
    cfg = load_config(args.config_file)

    # INPUT CONFIGURATION

    # Import parameters
    if args.indir is None:
        data_dir = cfg["General"]["data_dir"]
    else:
        data_dir = args.indir

    if args.outdir is None:
        outdir = cfg["General"]["outdir"]
    else:
        outdir = args.outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Get file containing gammas (signal)
    if args.infile_signal is None:
        data_sig_file = cfg["General"]["data_sig_file"].format(args.mode)
    else:
        data_sig_file = args.infile_signal

    filename_sig = path.join(data_dir, data_sig_file)

    print(f"INPUT SIGNAL FILE PATH= {filename_sig}")

    # Cameras to use
    if args.cameras_from_config:
        print("GETTING CAMERAS FROM CONFIGURATION FILE")
        cam_ids = cfg["General"]["cam_id_list"]
    elif args.cameras_from_file:
        print("GETTING CAMERAS FROM SIGNAL TRAINING FILE")
        # in the same analysis all particle types are analyzed in the
        # same way so we can just use gammas
        cam_ids = get_camera_names(filename_sig)
    else:
        print("GETTING CAMERAS FROM CLI")
        cam_ids = args.cam_id_lists.split()

    # The names of the tables inside the HDF5 file are the cameras' names
    table_name = list(cam_ids)

    # Dataset split train-test fraction
    train_fraction = cfg["Split"]["train_fraction"]
    # Name of target quantity
    target_name = cfg["Method"]["target_name"]

    # Get list of features
    features_basic = cfg["FeatureList"]["Basic"]
    features_derived = cfg["FeatureList"]["Derived"]
    feature_list = features_basic + list(features_derived)
    print("Going to use the following features to train the model:")
    print(feature_list)
    # sort features_to_use alphabetically to ensure order
    # preservation with model.predict in protopipe.scripts
    feature_list = sorted(feature_list)

    # GridSearchCV
    use_GridSearchCV = cfg["GridSearchCV"]["use"]
    scoring = cfg["GridSearchCV"]["scoring"]
    cv = cfg["GridSearchCV"]["cv"]

    # Hyper-parameters of the main model
    tuned_parameters = cfg["Method"]["tuned_parameters"]

    # Initialize the model dynamically

    # There is always at least one (main) model to initialize
    model_to_use = cfg['Method']['name']
    module_name = '.'.join(model_to_use.split('.', 2)[:-1])
    class_name = model_to_use.split('.')[-1]
    module = importlib.import_module(module_name)  # sklearn.XXX
    model = getattr(module, class_name)
    print(f"Going to use {module_name}.{class_name}...")

    # Check for any base estimator if main model is a meta-estimator
    if "base_estimator" in cfg['Method']:
        base_estimator_cfg = cfg['Method']['base_estimator']
        base_estimator_name = base_estimator_cfg['name']
        base_estimator_pars = base_estimator_cfg['parameters']
        base_estimator_module_name = '.'.join(
            base_estimator_name.split('.', 2)[:-1])
        base_estimator_class_name = base_estimator_name.split('.')[-1]
        base_estimator_module = importlib.import_module(
            base_estimator_module_name)  # sklearn.XXX
        base_estimator_model = getattr(base_estimator_module,
                                       base_estimator_class_name)
        initialized_base_estimator = base_estimator_model(
            **base_estimator_pars)
        print(
            f"...based on {base_estimator_module_name}.{base_estimator_class_name}"
        )
        initialized_model = model(base_estimator=initialized_base_estimator,
                                  **cfg['Method']['tuned_parameters'])
    else:
        initialized_model = model(**cfg['Method']['tuned_parameters'])
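    # For reference (hypothetical config values): a meta-estimator such as
    # 'sklearn.ensemble.AdaBoostRegressor' with a base estimator
    # 'sklearn.tree.DecisionTreeRegressor' takes the first branch above,
    # while a plain 'sklearn.ensemble.RandomForestRegressor' takes the second.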

    # Map model types to the models supported by the script
    model_types = {
        "regressor": ["RandomForestRegressor", "AdaBoostRegressor"],
        "classifier": ["RandomForestClassifier"]
    }

    if class_name in model_types["regressor"]:

        # Get the selection cuts
        cuts = make_cut_list(cfg["SigFiducialCuts"])

    elif class_name in model_types["classifier"]:

        # read background file from either config file or CLI
        if args.infile_background is None:
            data_bkg_file = cfg["General"]["data_bkg_file"].format(args.mode)
        else:
            data_bkg_file = args.infile_background

        filename_bkg = path.join(data_dir, data_bkg_file)


        # Get the selection cuts
        sig_cuts = make_cut_list(cfg["SigFiducialCuts"])
        bkg_cuts = make_cut_list(cfg["BkgFiducialCuts"])

        use_same_number_of_sig_and_bkg_for_training = cfg["Split"][
            "use_same_number_of_sig_and_bkg_for_training"]

    else:
        raise ValueError("ERROR: not a supported model")

    print("### Using {} for model construction".format(model_to_use))

    print(f"LIST OF CAMERAS TO USE = {cam_ids}")

    models = dict()
    for idx, cam_id in enumerate(cam_ids):

        print("### Building model for {}".format(cam_id))

        if class_name in model_types["regressor"]:

            # Load data
            data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r")
            # Add any derived feature and apply fiducial cuts
            data_sig = prepare_data(ds=data_sig,
                                    derived_features=features_derived,
                                    select_data=True,
                                    cuts=cuts)

            if args.max_events:
                data_sig = data_sig[0:args.max_events]

            print(f"Going to split {len(data_sig)} SIGNAL images...")

            # Initialize the model
            factory = TrainModel(case="regressor",
                                 target_name=target_name,
                                 feature_name_list=feature_list)

            # Split the TRAINING dataset in a train and test sub-datasets
            # Useful to test the models before using them for DL2 production
            factory.split_data(data_sig=data_sig,
                               train_fraction=train_fraction)
            print("Training sample: sig {}".format(len(factory.data_train)))
            print("Test sample: sig {}".format(len(factory.data_test)))

        else:  # if it's not a regressor it's a classifier

            # Load data
            data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r")
            data_bkg = pd.read_hdf(filename_bkg, table_name[idx], mode="r")

            # Add label
            data_sig = prepare_data(ds=data_sig,
                                    label=1,
                                    cuts=sig_cuts,
                                    select_data=True,
                                    derived_features=features_derived)
            data_bkg = prepare_data(ds=data_bkg,
                                    label=0,
                                    cuts=bkg_cuts,
                                    select_data=True,
                                    derived_features=features_derived)

            if args.max_events:
                data_sig = data_sig[0:args.max_events]
                data_bkg = data_bkg[0:args.max_events]

            print(
                f"Going to split {len(data_sig)} SIGNAL images and {len(data_bkg)} BACKGROUND images"
            )

            # Initialize the model
            factory = TrainModel(case="classifier",
                                 target_name=target_name,
                                 feature_name_list=feature_list)

            # Split the TRAINING dataset in a train and test sub-datasets
            # Useful to test the models before using them for DL2 production
            factory.split_data(
                data_sig=data_sig,
                data_bkg=data_bkg,
                train_fraction=train_fraction,
                force_same_nsig_nbkg=
                use_same_number_of_sig_and_bkg_for_training,
            )

            print("Training sample: sig {} and bkg {}".format(
                len(factory.data_train.query("label==1")),
                len(factory.data_train.query("label==0")),
            ))
            print("Test sample: sig {} and bkg {}".format(
                len(factory.data_test.query("label==1")),
                len(factory.data_test.query("label==0")),
            ))

        if use_GridSearchCV:
            # Apply optimization of the hyper-parameters via grid search
            # and return best model
            best_model = factory.get_optimal_model(initialized_model,
                                                   tuned_parameters,
                                                   scoring=scoring,
                                                   cv=cv)
        else:  # otherwise use directly the initial model
            best_model = initialized_model

            # Fit the chosen model on the train data
            best_model.fit(
                factory.data_scikit["X_train"],
                factory.data_scikit["y_train"],
                sample_weight=factory.data_scikit["w_train"],
            )

        if class_name in model_types["classifier"]:

            print(
                classification_report(
                    factory.data_scikit["y_test"],
                    best_model.predict(factory.data_scikit["X_test"]),
                ))

            # Calibrate model if necessary on test data (GridSearchCV)
            if use_GridSearchCV and cfg["Method"]["calibrate_output"]:
                print("==> Calibrate classifier...")

                best_model = CalibratedClassifierCV(best_model,
                                                    method="sigmoid",
                                                    cv="prefit")

                best_model.fit(factory.data_scikit["X_test"],
                               factory.data_scikit["y_test"])

        save_output(models, cam_id, factory, best_model, model_types,
                    class_name, outdir)
Example #3
def main():
    # Read arguments
    parser = argparse.ArgumentParser(description="Make diagnostic plot")
    parser.add_argument("--config_file", type=str, required=True)
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--wave",
        dest="mode",
        action="store_const",
        const="wave",
        default="tail",
        help="if set, use wavelet cleaning",
    )
    mode_group.add_argument(
        "--tail",
        dest="mode",
        action="store_const",
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets",
    )
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    model_type = cfg["General"]["model_type"]

    # Import parameters
    indir = cfg["General"]["outdir"]

    cam_ids = cfg["General"]["cam_id_list"]

    # Model
    method_name = cfg["Method"]["name"]
    target_name = cfg["Method"]["target_name"]
    if model_type in "classifier":
        use_proba = cfg["Method"]["use_proba"]

    # Diagnostic
    nbins = cfg["Diagnostic"]["energy"]["nbins"]
    energy_edges = np.logspace(
        np.log10(cfg["Diagnostic"]["energy"]["min"]),
        np.log10(cfg["Diagnostic"]["energy"]["max"]),
        nbins + 1,
        True,
    )
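    # e.g. (hypothetical config) min=0.02, max=200, nbins=4 gives the 5
    # log-spaced edges [0.02, 0.2, 2, 20, 200]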

    # Will be further used to get model output of events
    diagnostic = dict()

    for idx, cam_id in enumerate(cam_ids):
        print("### Model diagnostic for {}".format(cam_id))

        # Load data
        data_scikit = load_obj(
            path.join(
                indir,
                "data_scikit_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id
                ),
            )
        )
        data_train = pd.read_pickle(
            path.join(
                indir,
                "data_train_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id
                ),
            )
        )
        data_test = pd.read_pickle(
            path.join(
                indir,
                "data_test_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id
                ),
            )
        )

        # Load model
        outname = "{}_{}_{}_{}.pkl.gz".format(
            model_type, args.mode, cam_id, method_name
        )
        model = joblib.load(path.join(indir, outname))

        outdir = os.path.join(
            indir,
            "diagnostic_{}_{}_{}_{}".format(model_type, method_name, args.mode, cam_id),
        )
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        if model_type in "regressor":
            diagnostic[cam_id] = RegressorDiagnostic(
                model=model,
                feature_name_list=cfg["FeatureList"],
                target_name=target_name,
                data_train=data_train,
                data_test=data_test,
                output_name="reco_energy",
            )
        elif model_type in "classifier":

            if use_proba is True:
                output_model_name = "gammaness"
            else:
                output_model_name = "score"

            diagnostic[cam_id] = ClassifierDiagnostic(
                model=model,
                feature_name_list=cfg["FeatureList"],
                target_name=target_name,
                data_train=data_train,
                data_test=data_test,
                model_output_name=output_model_name,
                is_output_proba=use_proba,
            )

        # Image-level diagnostic - feature importance
        plt.figure(figsize=(5, 5))
        ax = plt.gca()
        ax = diagnostic[cam_id].plot_feature_importance(
            ax,
            **{"alpha": 0.7, "edgecolor": "black", "linewidth": 2, "color": "darkgreen"}
        )
        ax.set_ylabel("Feature importance")
        ax.grid()
        plt.title(cam_id)
        plt.tight_layout()
        save_fig(outdir, "feature_importances")

        # Diagnostic for regressor
        if model_type in "regressor":

            # Image-level diagnostic[cam_id] - features
            fig, axes = diagnostic[cam_id].plot_features(
                data_list=[data_train, data_test],
                nbin=30,
                hist_kwargs_list=[
                    {
                        "edgecolor": "blue",
                        "color": "blue",
                        "label": "Gamma training",
                        "alpha": 0.2,
                        "fill": True,
                        "ls": "-",
                        "lw": 2,
                    },
                    {
                        "edgecolor": "blue",
                        "color": "blue",
                        "label": "Gamma test",
                        "alpha": 1,
                        "fill": False,
                        "ls": "--",
                        "lw": 2,
                    },
                ],
                error_kw_list=[
                    dict(ecolor="blue", lw=2, capsize=2, capthick=2, alpha=0.2),
                    dict(ecolor="blue", lw=2, capsize=2, capthick=2, alpha=0.2),
                ],
                ncols=3,
            )
            plt.title(cam_id)
            fig.tight_layout()
            save_fig(outdir, "features", fig=fig)

            # Compute averaged energy
            print("Process test sample...")
            data_test_evt = get_evt_subarray_model_output(
                data_test,
                weight_name="sum_signal_cam",
                keep_cols=["mc_energy"],
                model_output_name="reco_energy_img",
                model_output_name_evt="reco_energy",
            )

            ncols = 5
            nrows = int(np.ceil(nbins / ncols))
            if nrows == 0:
                nrows = 1
                ncols = 1
            fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * 5, 10))
            try:
                axes = axes.flatten()
            except AttributeError:  # a single Axes object, not an array
                axes = [axes]

            bias = []
            resolution = []
            energy_centres = []

            for ibin in range(len(energy_edges) - 1):
                ax = axes[ibin]

                data = data_test_evt.query(
                    "mc_energy >= {} and mc_energy < {}".format(
                        energy_edges[ibin], energy_edges[ibin + 1]
                    )
                )
                print("Estimate energy for {} evts".format(len(data)))

                er = data["reco_energy"]
                emc = data["mc_energy"]

                opt_hist = {
                    "edgecolor": "black",
                    "color": "darkgreen",
                    "label": "data",
                    "alpha": 0.7,
                    "fill": True,
                }
                opt_fit = {"c": "red", "lw": 2, "label": "Best fit"}
                ax, fit_param, cov = diagnostic[cam_id].plot_resolution_distribution(
                    ax=ax,
                    y_true=emc,
                    y_reco=er,
                    nbin=50,
                    fit_range=[-2, 2],
                    hist_kwargs=opt_hist,
                    fit_kwargs=opt_fit,
                )
                if fit_param[2] < 0:  # negative value are allowed for the fit
                    fit_param[2] *= -1

                label = "[{:.2f},{:.2f}] TeV\n#Evts={}\nmean={:.2f}\nstd={:.2f}".format(
                    energy_edges[ibin],
                    energy_edges[ibin + 1],
                    len(er),
                    fit_param[1],
                    fit_param[2],
                )

                ax.set_ylabel("# Evts")
                ax.set_xlabel("(ereco-emc) / emc")
                ax.set_xlim([-2, 2])
                ax.grid()

                evt_patch = mpatches.Patch(color="white", label=label)
                data_patch = mpatches.Patch(color="blue", label="data")
                fit_patch = mpatches.Patch(color="red", label="best fit")
                ax.legend(loc="best", handles=[evt_patch, data_patch, fit_patch])
                plt.tight_layout()

                print(
                    " Fit results: ({:.3f},{:.3f} TeV)".format(
                        energy_edges[ibin], energy_edges[ibin + 1]
                    )
                )

                try:
                    print(" - A    : {:.3f} +/- {:.3f}".format(fit_param[0], cov[0][0]))
                    print(" - mean : {:.3f} +/- {:.3f}".format(fit_param[1], cov[1][1]))
                    print(" - std  : {:.3f} +/- {:.3f}".format(fit_param[2], cov[2][2]))
                except:
                    print(" ==> Problem with fit, no covariance...".format())
                    continue

                bias.append(fit_param[1])
                resolution.append(fit_param[2])
                energy_centres.append(
                    (energy_edges[ibin] + energy_edges[ibin + 1]) / 2.0
                )

            save_fig(outdir, "migration_distribution", fig=fig)

            plt.figure(figsize=(5, 5))
            ax = plt.gca()
            ax.plot(
                energy_centres,
                resolution,
                marker="s",
                color="darkorange",
                label="Resolution",
            )
            ax.plot(energy_centres, bias, marker="s", color="darkgreen", label="Bias")
            ax.set_xlabel("True energy [TeV]")
            ax.set_ylabel("Energy resolution")
            ax.set_xscale("log")
            ax.grid()
            ax.legend()
            ax.set_ylim([-0.2, 1.2])
            plt.title(cam_id)
            plt.tight_layout()
            save_fig(outdir, "energy_resolution")

            # Write results
            t = Table()
            t["ENERGY"] = Column(
                energy_centres, unit="TeV", description="Energy centers"
            )
            t["BIAS"] = Column(bias, unit="", description="Bias from gauusian fit")
            t["RESOL"] = Column(
                bias, unit="", description="Resolution from gauusian fit"
            )
            t.write(
                os.path.join(outdir, "energy_resolution.fits"),
                format="fits",
                overwrite=True,
            )

        elif model_type in "classifier":

            # Image-level diagnostic - features
            fig, axes = diagnostic[cam_id].plot_features(
                data_list=[
                    data_train.query("label==1"),
                    data_test.query("label==1"),
                    data_train.query("label==0"),
                    data_test.query("label==0"),
                ],
                nbin=30,
                hist_kwargs_list=[
                    {
                        "edgecolor": "blue",
                        "color": "blue",
                        "label": "Gamma training sample",
                        "alpha": 0.2,
                        "fill": True,
                        "ls": "-",
                        "lw": 2,
                    },
                    {
                        "edgecolor": "blue",
                        "color": "blue",
                        "label": "Gamma test sample",
                        "alpha": 1,
                        "fill": False,
                        "ls": "--",
                        "lw": 2,
                    },
                    {
                        "edgecolor": "red",
                        "color": "red",
                        "label": "Proton training sample",
                        "alpha": 0.2,
                        "fill": True,
                        "ls": "-",
                        "lw": 2,
                    },
                    {
                        "edgecolor": "red",
                        "color": "red",
                        "label": "Proton test sample",
                        "alpha": 1,
                        "fill": False,
                        "ls": "--",
                        "lw": 2,
                    },
                ],
                error_kw_list=[
                    dict(ecolor="blue", lw=2, capsize=3, capthick=2, alpha=0.2),
                    dict(ecolor="blue", lw=2, capsize=3, capthick=2, alpha=1),
                    dict(ecolor="red", lw=2, capsize=3, capthick=2, alpha=0.2),
                    dict(ecolor="red", lw=2, capsize=3, capthick=2, alpha=1),
                ],
                ncols=3,
            )
            plt.title(cam_id)
            fig.tight_layout()
            save_fig(outdir, "features", fig=fig)

            if method_name in "AdaBoostClassifier":
                # Image-level diagnostic - method
                plt.figure(figsize=(5, 5))
                ax = plt.gca()
                opt = {"color": "darkgreen", "ls": "-", "lw": 2}
                BoostedDecisionTreeDiagnostic.plot_error_rate(
                    ax, model, data_scikit, **opt
                )
                plt.title(cam_id)
                plt.tight_layout()
                save_fig(path, outdir, "bdt_diagnostic_error_rate")

                plt.figure(figsize=(5, 5))
                ax = plt.gca()
                BoostedDecisionTreeDiagnostic.plot_tree_error_rate(ax, model, **opt)
                plt.title(cam_id)
                plt.tight_layout()
                save_fig(path, outdir, "bdt_diagnostic_tree_error_rate")

            # Image-level diagnostic - model output
            fig, ax = diagnostic[cam_id].plot_image_model_output_distribution(nbin=50)
            ax[0].set_xlim([0, 1])
            plt.title(cam_id)
            fig.tight_layout()
            save_fig(outdir, "image_distribution", fig=fig)

            # Image-level diagnostic - ROC curve on train and test samples
            plt.figure(figsize=(5, 5))
            ax = plt.gca()
            plot_roc_curve(
                ax,
                diagnostic[cam_id].data_train[diagnostic[cam_id].model_output_name],
                diagnostic[cam_id].data_train["label"],
                **dict(color="darkgreen", lw=2, label="Training sample")
            )
            plot_roc_curve(
                ax,
                data_test[diagnostic[cam_id].model_output_name],
                diagnostic[cam_id].data_test["label"],
                **dict(color="darkorange", lw=2, label="Test sample")
            )
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
            ax.legend(loc="lower right")
            plt.title(cam_id)
            plt.tight_layout()
            save_fig(outdir, "image_roc_curve")

            # Parameters for energy variation
            cut_list = [
                "reco_energy >= {:.2f} and reco_energy <= {:.2f}".format(
                    energy_edges[i], energy_edges[i + 1]
                )
                for i in range(len(energy_edges) - 1)
            ]
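            # e.g. with (hypothetical) edges [0.02, 0.2, 2] TeV the first cut
            # reads "reco_energy >= 0.02 and reco_energy <= 0.20"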

            hist_kwargs_list = [
                {
                    "edgecolor": "blue",
                    "color": "blue",
                    "label": "Gamma training sample",
                    "alpha": 0.2,
                    "fill": True,
                    "ls": "-",
                    "lw": 2,
                },
                {
                    "edgecolor": "blue",
                    "color": "blue",
                    "label": "Gamma test sample",
                    "alpha": 1,
                    "fill": False,
                    "ls": "--",
                    "lw": 2,
                },
                {
                    "edgecolor": "red",
                    "color": "red",
                    "label": "Proton training sample",
                    "alpha": 0.2,
                    "fill": True,
                    "ls": "-",
                    "lw": 2,
                },
                {
                    "edgecolor": "red",
                    "color": "red",
                    "label": "Proton test sample",
                    "alpha": 1,
                    "fill": False,
                    "ls": "--",
                    "lw": 2,
                },
            ]

            error_kw_list = [
                dict(ecolor="blue", lw=2, capsize=3, capthick=2, alpha=0.2),
                dict(ecolor="blue", lw=2, capsize=3, capthick=2, alpha=1),
                dict(ecolor="red", lw=2, capsize=3, capthick=2, alpha=0.2),
                dict(ecolor="red", lw=2, capsize=3, capthick=2, alpha=1),
            ]

            # Image-level diagnostic - model output distribution variation
            n_feature = len(cut_list)
            ncols = 2
            nrows = (
                int(n_feature / ncols)
                if n_feature % ncols == 0
                else int((n_feature + 1) / ncols)
            )
            fig, axes = plt.subplots(
                nrows=nrows, ncols=ncols, figsize=(5 * ncols, 3 * nrows)
            )
            if nrows == 1 and ncols == 1:
                axes = [axes]
            else:
                axes = axes.flatten()

            data_list = [
                data_train.query("label==1"),
                data_test.query("label==1"),
                data_train.query("label==0"),
                data_test.query("label==0"),
            ]

            for i, cut in enumerate(cut_list):
                ax = axes[i]

                # Range for binning
                the_range = [0, 1]

                for j, data in enumerate(data_list):
                    if len(data) == 0:
                        continue

                    ax = plot_hist(
                        ax=ax,
                        data=data.query(cut)[output_model_name],
                        nbin=30,
                        limit=the_range,
                        norm=True,
                        yerr=True,
                        hist_kwargs=hist_kwargs_list[j],
                        error_kw=error_kw_list[j],
                    )

                ax.set_xlim(the_range)
                ax.set_xlabel(output_model_name)
                ax.set_ylabel("Arbitrary units")
                ax.legend(loc="best", fontsize="x-small")
                ax.set_title(cut)
                ax.grid()
            fig.tight_layout()
            save_fig(outdir, "image_distribution_variation", fig=fig)

            # Image-level diagnostic - ROC curve variation on test sample
            plt.figure(figsize=(5, 5))
            ax = plt.gca()

            color = 1.0
            step_color = 1.0 / (len(cut_list))
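            # one grey shade per energy bin, from light to dark; matplotlib
            # interprets a float passed as a string (e.g. "0.75") as a grey level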
            for i, cut in enumerate(cut_list):
                c = color - (i + 1) * step_color

                data = data_test.query(cut)
                if len(data) == 0:
                    continue

                opt = dict(
                    color=str(c),
                    lw=2,
                    label="{}".format(cut.replace("reco_energy", "E")),
                )
                plot_roc_curve(ax, data[output_model_name], data["label"], **opt)
            ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
            ax.set_title(cam_id)
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.legend(loc="lower right", fontsize="x-small")
            plt.tight_layout()
            save_fig(outdir, "image_roc_curve_variation")
Example #4
def main():

    # Argument parser
    parser = make_argparser()
    parser.add_argument("--regressor_dir",
                        default="./",
                        help="regressors directory")
    parser.add_argument("--classifier_dir",
                        default="./",
                        help="regressors directory")
    parser.add_argument(
        "--force_tailcut_for_extended_cleaning",
        type=str2bool,
        default=False,
        help="For tailcut cleaning for energy/score estimation",
    )
    parser.add_argument(
        "--save_images",
        action="store_true",
        help="Save images in images.h5 (one file testing)",
    )
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Read site layout
    site = cfg["General"]["site"]
    array = cfg["General"]["array"]
    cameras = cfg["General"]["cam_id_list"]

    # Add force_tailcut_for_extended_cleaning in configuration
    cfg["General"][
        "force_tailcut_for_extended_cleaning"] = args.force_tailcut_for_extended_cleaning
    cfg["General"]["force_mode"] = "tail"
    force_mode = args.mode
    if cfg["General"]["force_tailcut_for_extended_cleaning"] is True:
        force_mode = "tail"
    print("force_mode={}".format(force_mode))
    print("mode={}".format(args.mode))

    filenamelist = []
    if args.infile_list:
        for f in args.infile_list:
            filenamelist += glob("{}/{}".format(args.indir, f))
        filenamelist.sort()

    if not filenamelist:
        print("no files found; check indir: {}".format(args.indir))
        exit(-1)

    # keeping track of events and where they were rejected
    evt_cutflow = CutFlow("EventCutFlow")
    img_cutflow = CutFlow("ImageCutFlow")

    # Event preparer
    preparer = EventPreparer(config=cfg,
                             mode=args.mode,
                             event_cutflow=evt_cutflow,
                             image_cutflow=img_cutflow)

    # Regressor and classifier methods
    regressor_method = cfg["EnergyRegressor"]["method_name"]
    classifier_method = cfg["GammaHadronClassifier"]["method_name"]
    use_proba_for_classifier = cfg["GammaHadronClassifier"]["use_proba"]

    if regressor_method in ["None", "none", None]:
        use_regressor = False
    else:
        use_regressor = True

    if classifier_method in ["None", "none", None]:
        use_classifier = False
    else:
        use_classifier = True

    # Classifiers
    if use_classifier:
        classifier_files = (args.classifier_dir +
                            "/classifier_{mode}_{cam_id}_{classifier}.pkl.gz")
        clf_file = classifier_files.format(
            **{
                "mode": force_mode,
                "wave_args": "mixed",
                "classifier": classifier_method,
                "cam_id": "{cam_id}",
            })
        classifier = EventClassifier.load(clf_file, cam_id_list=cameras)

    # Regressors
    if use_regressor:
        regressor_files = (args.regressor_dir +
                           "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz")
        reg_file = regressor_files.format(
            **{
                "mode": force_mode,
                "wave_args": "mixed",
                "regressor": regressor_method,
                "cam_id": "{cam_id}",
            })
        regressor = EnergyRegressor.load(reg_file, cam_id_list=cameras)

    # catch ctrl-c signal to exit current loop and still display results
    signal_handler = SignalHandler()
    signal.signal(signal.SIGINT, signal_handler)

    # Declaration of the column descriptor for the (possible) images file
    class StoredImages(tb.IsDescription):
        event_id = tb.Int32Col(dflt=1, pos=0)
        tel_id = tb.Int16Col(dflt=1, pos=1)
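        # shape=(1855): LSTCam and NectarCam, the largest cameras used here,
        # both have 1855 pixels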
        dl1_phe_image = tb.Float32Col(shape=(1855), pos=2)
        mc_phe_image = tb.Float32Col(shape=(1855), pos=3)

    # this class defines the reconstruction parameters to keep track of
    class RecoEvent(tb.IsDescription):
        obs_id = tb.Int16Col(dflt=-1, pos=0)
        event_id = tb.Int32Col(dflt=-1, pos=1)
        NTels_trig = tb.Int16Col(dflt=0, pos=2)
        NTels_reco = tb.Int16Col(dflt=0, pos=3)
        NTels_reco_lst = tb.Int16Col(dflt=0, pos=4)
        NTels_reco_mst = tb.Int16Col(dflt=0, pos=5)
        NTels_reco_sst = tb.Int16Col(dflt=0, pos=6)
        mc_energy = tb.Float32Col(dflt=np.nan, pos=7)
        reco_energy = tb.Float32Col(dflt=np.nan, pos=8)
        reco_alt = tb.Float32Col(dflt=np.nan, pos=9)
        reco_az = tb.Float32Col(dflt=np.nan, pos=10)
        offset = tb.Float32Col(dflt=np.nan, pos=11)
        xi = tb.Float32Col(dflt=np.nan, pos=12)
        ErrEstPos = tb.Float32Col(dflt=np.nan, pos=13)
        ErrEstDir = tb.Float32Col(dflt=np.nan, pos=14)
        gammaness = tb.Float32Col(dflt=np.nan, pos=15)
        success = tb.BoolCol(dflt=False, pos=16)
        score = tb.Float32Col(dflt=np.nan, pos=17)
        h_max = tb.Float32Col(dflt=np.nan, pos=18)
        reco_core_x = tb.Float32Col(dflt=np.nan, pos=19)
        reco_core_y = tb.Float32Col(dflt=np.nan, pos=20)
        mc_core_x = tb.Float32Col(dflt=np.nan, pos=21)
        mc_core_y = tb.Float32Col(dflt=np.nan, pos=22)

    reco_outfile = tb.open_file(
        mode="w",
        # if no outfile name is given (i.e. we don't want to write the event
        # list to disk), we need to specify two "driver" arguments
        **({
            "filename": args.outfile
        } if args.outfile else {
            "filename": "no_outfile.h5",
            "driver": "H5FD_CORE",
            "driver_core_backing_store": False,
        }))
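    # With the in-memory H5FD_CORE driver and backing_store disabled, the
    # "file" lives only in RAM and is discarded on close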

    reco_table = reco_outfile.create_table("/", "reco_events", RecoEvent)
    reco_event = reco_table.row

    # Create the images file only if the user wants to store the images
    if args.save_images is True:
        images_outfile = tb.open_file("images.h5", mode="w")
        images_table = {}
        images_phe = {}

    # Telescopes in analysis
    allowed_tels = set(prod3b_tel_ids(array, site=site))
    for i, filename in enumerate(filenamelist):

        source = event_source(input_url=filename,
                              allowed_tels=allowed_tels,
                              max_events=args.max_events)
        # loop that cleans and parametrises the images and performs the reconstruction
        for (
                event,
                dl1_phe_image,
                mc_phe_image,
                n_pixel_dict,
                hillas_dict,
                hillas_dict_reco,
                n_tels,
                tot_signal,
                max_signals,
                n_cluster_dict,
                reco_result,
                impact_dict,
        ) in preparer.prepare_event(source):

            # Angular quantities
            run_array_direction = event.mcheader.run_array_direction

            # Angular separation between true and reco direction
            xi = angular_separation(event.mc.az, event.mc.alt, reco_result.az,
                                    reco_result.alt)

            # Angular separation between the center of the camera and the reco direction.
            offset = angular_separation(
                run_array_direction[0],  # az
                run_array_direction[1],  # alt
                reco_result.az,
                reco_result.alt,
            )

            # Height of shower maximum
            h_max = reco_result.h_max

            if hillas_dict is not None:

                # Estimate particle energy
                if use_regressor is True:
                    energy_tel = np.zeros(len(hillas_dict.keys()))
                    weight_tel = np.zeros(len(hillas_dict.keys()))

                    for idx, tel_id in enumerate(hillas_dict.keys()):
                        cam_id = event.inst.subarray.tel[tel_id].camera.cam_id
                        moments = hillas_dict[tel_id]
                        model = regressor.model_dict[cam_id]

                        # Features to be fed in the regressor
                        features_img = np.array([
                            np.log10(moments.intensity),
                            np.log10(impact_dict[tel_id].value),
                            moments.width.value,
                            moments.length.value,
                            h_max.value,
                        ])
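                        # NOTE: the feature order must match the one used
                        # when the regressor was trained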

                        energy_tel[idx] = model.predict([features_img])
                        weight_tel[idx] = moments.intensity

                    reco_energy = np.sum(
                        weight_tel * energy_tel) / sum(weight_tel)
                else:
                    reco_energy = np.nan

                # Estimate particle score/gammaness
                if use_classifier is True:
                    score_tel = np.zeros(len(hillas_dict.keys()))
                    gammaness_tel = np.zeros(len(hillas_dict.keys()))
                    weight_tel = np.zeros(len(hillas_dict.keys()))

                    for idx, tel_id in enumerate(hillas_dict.keys()):
                        cam_id = event.inst.subarray.tel[tel_id].camera.cam_id
                        moments = hillas_dict[tel_id]
                        model = classifier.model_dict[cam_id]
                        # Features to be fed in the classifier
                        features_img = np.array([
                            np.log10(reco_energy),
                            moments.width.value,
                            moments.length.value,
                            moments.skewness,
                            moments.kurtosis,
                            h_max.value,
                        ])
                        # Output of classifier according to type of classifier
                        if use_proba_for_classifier is False:
                            score_tel[idx] = model.decision_function(
                                [features_img])
                        else:
                            gammaness_tel[idx] = model.predict_proba(
                                [features_img])[:, 1]
                        # Should test other weighting strategies (e.g. power of charge, impact, etc.)
                        # For now, weighting a la Mars
                        weight_tel[idx] = np.sqrt(moments.intensity)

                    # Weight the final decision/proba
                    if use_proba_for_classifier is True:
                        gammaness = np.sum(
                            weight_tel * gammaness_tel) / sum(weight_tel)
                    else:
                        score = np.sum(
                            weight_tel * score_tel) / sum(weight_tel)
                else:
                    score = np.nan
                    gammaness = np.nan

                # Regardless of whether energy or gammaness is estimated, if
                # the user wants to save the images of the run we do it here
                # (probably not the most efficient way, but ok for one file)
                if args.save_images is True:
                    for idx, tel_id in enumerate(hillas_dict.keys()):
                        cam_id = event.inst.subarray.tel[tel_id].camera.cam_id
                        if cam_id not in images_phe:
                            images_table[cam_id] = images_outfile.create_table(
                                "/", "_".join(["images", cam_id]),
                                StoredImages)
                            images_phe[cam_id] = images_table[cam_id].row

                shower = event.mc
                mc_core_x = shower.core_x
                mc_core_y = shower.core_y

                reco_core_x = reco_result.core_x
                reco_core_y = reco_result.core_y

                alt, az = reco_result.alt, reco_result.az

                # Fill table's attributes
                reco_event["NTels_trig"] = len(event.dl0.tels_with_data)
                reco_event["NTels_reco"] = len(hillas_dict)
                reco_event["NTels_reco_lst"] = n_tels["LST_LST_LSTCam"]
                reco_event["NTels_reco_mst"] = n_tels["MST_MST_NectarCam"]
                reco_event["NTels_reco_sst"] = n_tels["SST"]  # will change
                reco_event["reco_energy"] = reco_energy
                reco_event["reco_alt"] = alt.to("deg").value
                reco_event["reco_az"] = az.to("deg").value
                reco_event["offset"] = offset.to("deg").value
                reco_event["xi"] = xi.to("deg").value
                reco_event["h_max"] = h_max.to("m").value
                reco_event["reco_core_x"] = reco_core_x.to("m").value
                reco_event["reco_core_y"] = reco_core_y.to("m").value
                reco_event["mc_core_x"] = mc_core_x.to("m").value
                reco_event["mc_core_y"] = mc_core_y.to("m").value
                if use_proba_for_classifier is True:
                    reco_event["gammaness"] = gammaness
                else:
                    reco_event["score"] = score
                reco_event["success"] = True
                reco_event["ErrEstPos"] = np.nan
                reco_event["ErrEstDir"] = np.nan
            else:
                reco_event["success"] = False

            # save basic event infos
            reco_event["mc_energy"] = event.mc.energy.to("TeV").value
            reco_event["event_id"] = event.r1.event_id
            reco_event["obs_id"] = event.r1.obs_id

            if args.save_images is True:
                images_phe[cam_id]["event_id"] = event.r0.event_id
                images_phe[cam_id]["tel_id"] = tel_id
                images_phe[cam_id]["dl1_phe_image"] = dl1_phe_image
                images_phe[cam_id]["mc_phe_image"] = mc_phe_image

                images_phe[cam_id].append()

            # Fill the table: append the row, then flush the I/O buffer
            reco_event.append()
            reco_table.flush()

            if signal_handler.stop:
                break
        if signal_handler.stop:
            break

    # make sure everything gets written out nicely
    reco_table.flush()

    if args.save_images is True:
        for table in images_table.values():
            table.flush()

    # Add in meta-data's table?
    try:
        print()
        evt_cutflow()
        print()
        img_cutflow()

    except ZeroDivisionError:
        pass

    print("Job done!")
Example #5
def main():
    # Read arguments
    parser = argparse.ArgumentParser(description='Make diagnostic plot')
    parser.add_argument('--config_file', type=str, required=True)
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument('--wave',
                            dest="mode",
                            action='store_const',
                            const="wave",
                            default="tail",
                            help="if set, use wavelet cleaning")
    mode_group.add_argument(
        '--tail',
        dest="mode",
        action='store_const',
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets")
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    model_type = cfg['General']['model_type']

    # Import parameters
    indir = cfg['General']['outdir']

    cam_ids = cfg['General']['cam_id_list']

    # Model
    method_name = cfg['Method']['name']
    target_name = cfg['Method']['target_name']
    if model_type == 'classifier':
        use_proba = cfg['Method']['use_proba']

    # Diagnostic
    nbins = cfg['Diagnostic']['energy']['nbins']
    energy_edges = np.logspace(np.log10(cfg['Diagnostic']['energy']['min']),
                               np.log10(cfg['Diagnostic']['energy']['max']),
                               nbins + 1, True)

    # Will be further used to get model output of events
    diagnostic = dict()

    for idx, cam_id in enumerate(cam_ids):
        print('### Model diagnostic for {}'.format(cam_id))

        # Load data
        data_scikit = load_obj(
            path.join(
                indir, 'data_scikit_{}_{}_{}_{}.pkl.gz'.format(
                    model_type, method_name, args.mode, cam_id)))
        data_train = pd.read_pickle(
            path.join(
                indir,
                'data_train_{}_{}_{}_{}.pkl.gz'.format(model_type, method_name,
                                                       args.mode, cam_id)))
        data_test = pd.read_pickle(
            path.join(
                indir,
                'data_test_{}_{}_{}_{}.pkl.gz'.format(model_type, method_name,
                                                      args.mode, cam_id)))

        # Load model
        outname = '{}_{}_{}_{}.pkl.gz'.format(model_type, args.mode, cam_id,
                                              method_name)
        model = joblib.load(path.join(indir, outname))

        outdir = os.path.join(
            indir, 'diagnostic_{}_{}_{}_{}'.format(model_type, method_name,
                                                   args.mode, cam_id))
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        if model_type == 'regressor':
            diagnostic[cam_id] = RegressorDiagnostic(
                model=model,
                feature_name_list=cfg['FeatureList'],
                target_name=target_name,
                data_train=data_train,
                data_test=data_test,
                output_name='reco_energy')
        elif model_type == 'classifier':

            if use_proba is True:
                ouput_model_name = 'gammaness'
            else:
                ouput_model_name = 'score'

            diagnostic[cam_id] = ClassifierDiagnostic(
                model=model,
                feature_name_list=cfg['FeatureList'],
                target_name=target_name,
                data_train=data_train,
                data_test=data_test,
                model_output_name=ouput_model_name,
                is_output_proba=use_proba)

        # Image-level diagnostic - feature importance
        plt.figure(figsize=(5, 5))
        ax = plt.gca()
        ax = diagnostic[cam_id].plot_feature_importance(
            ax, **{
                'alpha': 0.7,
                'edgecolor': 'black',
                'linewidth': 2,
                'color': 'darkgreen'
            })
        ax.set_ylabel('Feature importance')
        ax.grid()
        plt.title(cam_id)
        plt.tight_layout()
        plt.savefig(path.join(outdir, 'feature_importances.pdf'))

        # Diagnostic for regressor
        if model_type == 'regressor':

            # Image-level diagnostic[cam_id] - features
            fig, axes = diagnostic[cam_id].plot_features(
                data_list=[data_train, data_test],
                nbin=30,
                hist_kwargs_list=[{
                    'edgecolor': 'blue',
                    'color': 'blue',
                    'label': 'Gamma training',
                    'alpha': 0.2,
                    'fill': True,
                    'ls': '-',
                    'lw': 2
                }, {
                    'edgecolor': 'blue',
                    'color': 'blue',
                    'label': 'Gamma test',
                    'alpha': 1,
                    'fill': False,
                    'ls': '--',
                    'lw': 2
                }],
                error_kw_list=[
                    dict(ecolor='blue', lw=2, capsize=2, capthick=2,
                         alpha=0.2),
                    dict(ecolor='blue', lw=2, capsize=2, capthick=2, alpha=0.2)
                ],
                ncols=3)
            plt.title(cam_id)
            fig.tight_layout()
            fig.savefig(path.join(outdir, 'features.pdf'))

            # Compute averaged energy
            print('Process test sample...')
            data_test_evt = get_evt_subarray_model_output(
                data_test,
                weight_name='sum_signal_cam',
                keep_cols=['mc_energy'],
                model_output_name='reco_energy_img',
                model_output_name_evt='reco_energy')

            ncols = 5
            nrows = int(np.ceil(nbins / ncols))
            if nrows == 0:
                nrows = 1
                ncols = 1
            fig, axes = plt.subplots(nrows=nrows,
                                     ncols=ncols,
                                     figsize=(5 * 5, 10))
            try:
                axes = axes.flatten()
            except AttributeError:  # a single Axes object, not an array
                axes = [axes]

            bias = []
            resolution = []
            energy_centres = []

            for ibin in range(len(energy_edges) - 1):
                ax = axes[ibin]

                data = data_test_evt.query(
                    'mc_energy >= {} and mc_energy < {}'.format(
                        energy_edges[ibin], energy_edges[ibin + 1]))
                print('Estimate energy for {} evts'.format(len(data)))

                er = data['reco_energy']
                emc = data['mc_energy']

                opt_hist = {
                    'edgecolor': 'black',
                    'color': 'darkgreen',
                    'label': 'data',
                    'alpha': 0.7,
                    'fill': True
                }
                opt_fit = {'c': 'red', 'lw': 2, 'label': 'Best fit'}
                ax, fit_param, cov = diagnostic[
                    cam_id].plot_resolution_distribution(ax=ax,
                                                         y_true=emc,
                                                         y_reco=er,
                                                         nbin=50,
                                                         fit_range=[-2, 2],
                                                         hist_kwargs=opt_hist,
                                                         fit_kwargs=opt_fit)
                if fit_param[2] < 0:  # negative values are allowed by the fit; keep |sigma|
                    fit_param[2] *= -1

                label = '[{:.2f},{:.2f}] TeV\n#Evts={}\nmean={:.2f}\nstd={:.2f}'.format(
                    energy_edges[ibin], energy_edges[ibin + 1], len(er),
                    fit_param[1], fit_param[2])

                ax.set_ylabel('# Evts')
                ax.set_xlabel('(ereco-emc) / emc')
                ax.set_xlim([-2, 2])
                ax.grid()

                evt_patch = mpatches.Patch(color='white', label=label)
                data_patch = mpatches.Patch(color='blue', label='data')
                fit_patch = mpatches.Patch(color='red', label='best fit')
                ax.legend(loc='best',
                          handles=[evt_patch, data_patch, fit_patch])
                plt.tight_layout()

                print(' Fit results in [{:.3f},{:.3f}] TeV:'.format(
                    energy_edges[ibin], energy_edges[ibin + 1]))

                try:
                    print(' - A    : {:.3f} +/- {:.3f}'.format(
                        fit_param[0], cov[0][0]))
                    print(' - mean : {:.3f} +/- {:.3f}'.format(
                        fit_param[1], cov[1][1]))
                    print(' - std  : {:.3f} +/- {:.3f}'.format(
                        fit_param[2], cov[2][2]))
                except Exception:  # fit failed, no covariance available
                    print(' ==> Problem with fit, no covariance...')
                    continue

                bias.append(fit_param[1])
                resolution.append(fit_param[2])
                energy_centres.append(
                    (energy_edges[ibin] + energy_edges[ibin + 1]) / 2.)

            plt.savefig(path.join(outdir, 'migration_distribution.pdf'))

            plt.figure(figsize=(5, 5))
            ax = plt.gca()
            ax.plot(energy_centres,
                    resolution,
                    marker='s',
                    color='darkorange',
                    label='Resolution')
            ax.plot(energy_centres,
                    bias,
                    marker='s',
                    color='darkgreen',
                    label='Bias')
            ax.set_xlabel('True energy [TeV]')
            ax.set_ylabel('Energy resolution')
            ax.set_xscale('log')
            ax.grid()
            ax.legend()
            ax.set_ylim([-0.2, 1.2])
            plt.title(cam_id)
            plt.tight_layout()
            plt.savefig(path.join(outdir, 'energy_resolution.pdf'))

            # Write results
            t = Table()
            t['ENERGY'] = Column(energy_centres,
                                 unit='TeV',
                                 description='Energy centers')
            t['BIAS'] = Column(bias,
                               unit='',
                               description='Bias from gaussian fit')
            t['RESOL'] = Column(resolution,
                                unit='',
                                description='Resolution from gaussian fit')
            t.write(os.path.join(outdir, 'energy_resolution.fits'),
                    format='fits',
                    overwrite=True)

        elif model_type in 'classifier':

            # Image-level diagnostic - features
            fig, axes = diagnostic[cam_id].plot_features(
                data_list=[
                    data_train.query('label==1'),
                    data_test.query('label==1'),
                    data_train.query('label==0'),
                    data_test.query('label==0')
                ],
                nbin=30,
                hist_kwargs_list=[{
                    'edgecolor': 'blue',
                    'color': 'blue',
                    'label': 'Gamma training sample',
                    'alpha': 0.2,
                    'fill': True,
                    'ls': '-',
                    'lw': 2
                }, {
                    'edgecolor': 'blue',
                    'color': 'blue',
                    'label': 'Gamma test sample',
                    'alpha': 1,
                    'fill': False,
                    'ls': '--',
                    'lw': 2
                }, {
                    'edgecolor': 'red',
                    'color': 'red',
                    'label': 'Proton training sample',
                    'alpha': 0.2,
                    'fill': True,
                    'ls': '-',
                    'lw': 2
                }, {
                    'edgecolor': 'red',
                    'color': 'red',
                    'label': 'Proton test sample',
                    'alpha': 1,
                    'fill': False,
                    'ls': '--',
                    'lw': 2
                }],
                error_kw_list=[
                    dict(ecolor='blue', lw=2, capsize=3, capthick=2,
                         alpha=0.2),
                    dict(ecolor='blue', lw=2, capsize=3, capthick=2, alpha=1),
                    dict(ecolor='red', lw=2, capsize=3, capthick=2, alpha=0.2),
                    dict(ecolor='red', lw=2, capsize=3, capthick=2, alpha=1)
                ],
                ncols=3)
            plt.title(cam_id)
            fig.tight_layout()
            fig.savefig(path.join(outdir, 'features.pdf'))

            if method_name in 'AdaBoostClassifier':
                # Image-level diagnostic - method
                plt.figure(figsize=(5, 5))
                ax = plt.gca()
                opt = {'color': 'darkgreen', 'ls': '-', 'lw': 2}
                BoostedDecisionTreeDiagnostic.plot_error_rate(
                    ax, model, data_scikit, **opt)
                plt.title(cam_id)
                plt.tight_layout()
                plt.savefig(
                    os.path.join(outdir, 'bdt_diagnostic_error_rate.pdf'))

                plt.figure(figsize=(5, 5))
                ax = plt.gca()
                BoostedDecisionTreeDiagnostic.plot_tree_error_rate(
                    ax, model, **opt)
                plt.title(cam_id)
                plt.tight_layout()
                plt.savefig(
                    os.path.join(outdir, 'bdt_diagnostic_tree_error_rate.pdf'))

            # Image-level diagnostic - model output
            fig, ax = diagnostic[cam_id].plot_image_model_output_distribution(
                nbin=50)
            ax[0].set_xlim([0, 1])
            plt.title(cam_id)
            fig.tight_layout()
            fig.savefig(os.path.join(outdir, 'image_distribution.pdf'))

            # Image-level diagnostic - ROC curve on train and test samples
            plt.figure(figsize=(5, 5))
            ax = plt.gca()
            plot_roc_curve(
                ax, diagnostic[cam_id].data_train[
                    diagnostic[cam_id].model_output_name],
                diagnostic[cam_id].data_train['label'],
                **dict(color='darkgreen', lw=2, label='Training sample'))
            plot_roc_curve(
                ax, diagnostic[cam_id].data_test[
                    diagnostic[cam_id].model_output_name],
                diagnostic[cam_id].data_test['label'],
                **dict(color='darkorange', lw=2, label='Test sample'))
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            ax.legend(loc='lower right')
            plt.title(cam_id)
            plt.tight_layout()
            plt.savefig(os.path.join(outdir, 'image_roc_curve.pdf'))

            # Parameters for energy variation
            cut_list = [
                'reco_energy >= {:.2f} and reco_energy <= {:.2f}'.format(
                    energy_edges[i], energy_edges[i + 1])
                for i in range(len(energy_edges) - 1)
            ]
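            # Each entry is a pandas query string, e.g. (for a
            # hypothetical binning) 'reco_energy >= 0.05 and
            # reco_energy <= 0.08'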

            hist_kwargs_list = [{
                'edgecolor': 'blue',
                'color': 'blue',
                'label': 'Gamma training sample',
                'alpha': 0.2,
                'fill': True,
                'ls': '-',
                'lw': 2
            }, {
                'edgecolor': 'blue',
                'color': 'blue',
                'label': 'Gamma test sample',
                'alpha': 1,
                'fill': False,
                'ls': '--',
                'lw': 2
            }, {
                'edgecolor': 'red',
                'color': 'red',
                'label': 'Proton training sample',
                'alpha': 0.2,
                'fill': True,
                'ls': '-',
                'lw': 2
            }, {
                'edgecolor': 'red',
                'color': 'red',
                'label': 'Proton test sample',
                'alpha': 1,
                'fill': False,
                'ls': '--',
                'lw': 2
            }]

            error_kw_list = [
                dict(ecolor='blue', lw=2, capsize=3, capthick=2, alpha=0.2),
                dict(ecolor='blue', lw=2, capsize=3, capthick=2, alpha=1),
                dict(ecolor='red', lw=2, capsize=3, capthick=2, alpha=0.2),
                dict(ecolor='red', lw=2, capsize=3, capthick=2, alpha=1)
            ]

            # Image-level diagnostic - model output distribution variation
            n_feature = len(cut_list)
            ncols = 2
            nrows = int(np.ceil(n_feature / ncols))
            fig, axes = plt.subplots(nrows=nrows,
                                     ncols=ncols,
                                     figsize=(5 * ncols, 3 * nrows))
            if nrows == 1 and ncols == 1:
                axes = [axes]
            else:
                axes = axes.flatten()

            data_list = [
                data_train.query('label==1'),
                data_test.query('label==1'),
                data_train.query('label==0'),
                data_test.query('label==0')
            ]

            for i, cut in enumerate(cut_list):
                ax = axes[i]

                # Range for binning
                the_range = [0, 1]

                for j, data in enumerate(data_list):
                    if len(data) == 0:
                        continue

                    ax = plot_hist(ax=ax,
                                   data=data.query(cut)[output_model_name],
                                   nbin=30,
                                   limit=the_range,
                                   norm=True,
                                   yerr=True,
                                   hist_kwargs=hist_kwargs_list[j],
                                   error_kw=error_kw_list[j])

                ax.set_xlim(the_range)
                ax.set_xlabel(output_model_name)
                ax.set_ylabel('Arbitrary units')
                ax.legend(loc='best', fontsize='x-small')
                ax.set_title(cut)
                ax.grid()
            fig.tight_layout()
            fig.savefig(path.join(outdir, 'image_distribution_variation.pdf'))

            # Image-level diagnostic - ROC curve variation on test sample
            plt.figure(figsize=(5, 5))
            ax = plt.gca()

            color = 1.
            step_color = 1. / (len(cut_list))
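            # matplotlib accepts a grayscale level passed as a string:
            # '1.0' is white and '0.0' black, so higher-energy cuts are
            # drawn progressively darker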
            for i, cut in enumerate(cut_list):
                c = color - (i + 1) * step_color

                data = data_test.query(cut)
                if len(data) == 0:
                    continue

                opt = dict(color=str(c),
                           lw=2,
                           label='{}'.format(cut.replace('reco_energy', 'E')))
                plot_roc_curve(ax, data[output_model_name], data['label'],
                               **opt)
            ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            ax.set_title(cam_id)
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.legend(loc="lower right", fontsize='x-small')
            plt.tight_layout()
            plt.savefig(os.path.join(outdir, 'image_roc_curve_variation.pdf'))
Example #6
0
def main():

    # Argument parser
    parser = make_argparser()

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print debugging information",
    )

    parser.add_argument(
        "--save_images",
        action="store_true",
        help="Save also all images",
    )

    parser.add_argument(
        "--estimate_energy",
        type=str2bool,
        default=False,
        help="Estimate the events' energy with a regressor from\
         protopipe.scripts.build_model",
    )
    parser.add_argument("--regressor_dir",
                        type=str,
                        default="./",
                        help="regressors directory")
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    try:  # If the user didn't specify a site and/or an array...
        site = cfg["General"]["site"]
        array = cfg["General"]["array"]
    except KeyError:  # ...raise an error and exit.
        print("\033[91m ERROR: make sure that both 'site' and 'array' are "
              "specified in the analysis configuration file! \033[0m")
        exit()

    if args.infile_list:
        filenamelist = []
        for f in args.infile_list:
            filenamelist += glob("{}/{}".format(args.indir, f))
        filenamelist.sort()
    else:
        raise ValueError("don't know which input to use...")

    if not filenamelist:
        print("no files found; check indir: {}".format(args.indir))
        exit(-1)
    else:
        print("found {} files".format(len(filenamelist)))

    # Get the IDs of the involved telescopes and associated cameras together
    # with the equivalent focal lengths from the first event
    allowed_tels, cams_and_foclens, subarray = prod3b_array(
        filenamelist[0], site, array)

    # keeping track of events and where they were rejected
    evt_cutflow = CutFlow("EventCutFlow")
    img_cutflow = CutFlow("ImageCutFlow")

    preper = EventPreparer(
        config=cfg,
        subarray=subarray,
        cams_and_foclens=cams_and_foclens,
        mode=args.mode,
        event_cutflow=evt_cutflow,
        image_cutflow=img_cutflow,
    )

    # catch ctrl-c signal to exit the current loop and still display results
    signal_handler = SignalHandler()
    signal.signal(signal.SIGINT, signal_handler)

    # Regressor method
    regressor_method = cfg["EnergyRegressor"]["method_name"]

    # wrapper for the scikit-learn regressor
    if args.estimate_energy is True:
        regressor_files = (args.regressor_dir +
                           "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz")
        reg_file = regressor_files.format(
            **{
                "mode": args.mode,
                "wave_args": "mixed",  # ToDo, control
                "regressor": regressor_method,
                "cam_id": "{cam_id}",
            })

        regressor = EnergyRegressor.load(reg_file,
                                         cam_id_list=cams_and_foclens.keys())

    # COLUMN DESCRIPTOR AS DICTIONARY
    # Column descriptor for the file containing output training data.
    DataTrainingOutput = dict(
        # ======================================================================
        # ARRAY
        obs_id=tb.Int16Col(dflt=1, pos=0),
        event_id=tb.Int32Col(dflt=1, pos=1),
        tel_id=tb.Int16Col(dflt=1, pos=2),
        N_LST=tb.Int16Col(dflt=1, pos=3),
        N_MST=tb.Int16Col(dflt=1, pos=4),
        N_SST=tb.Int16Col(dflt=1, pos=5),
        n_tel_reco=tb.FloatCol(dflt=1, pos=6),
        n_tel_discri=tb.FloatCol(dflt=1, pos=7),
        # ======================================================================
        # DL1
        hillas_intensity_reco=tb.Float32Col(dflt=1, pos=8),
        hillas_intensity=tb.Float32Col(dflt=1, pos=9),
        hillas_x_reco=tb.Float32Col(dflt=1, pos=10),
        hillas_y_reco=tb.Float32Col(dflt=1, pos=11),
        hillas_x=tb.Float32Col(dflt=1, pos=12),
        hillas_y=tb.Float32Col(dflt=1, pos=13),
        hillas_r_reco=tb.Float32Col(dflt=1, pos=14),
        hillas_r=tb.Float32Col(dflt=1, pos=15),
        hillas_phi_reco=tb.Float32Col(dflt=1, pos=16),
        hillas_phi=tb.Float32Col(dflt=1, pos=17),
        hillas_length_reco=tb.Float32Col(dflt=1, pos=18),
        hillas_length=tb.Float32Col(dflt=1, pos=19),
        hillas_width_reco=tb.Float32Col(dflt=1, pos=20),
        hillas_width=tb.Float32Col(dflt=1, pos=21),
        hillas_psi_reco=tb.Float32Col(dflt=1, pos=22),
        hillas_psi=tb.Float32Col(dflt=1, pos=23),
        hillas_skewness_reco=tb.Float32Col(dflt=1, pos=24),
        hillas_skewness=tb.Float32Col(dflt=1, pos=25),
        hillas_kurtosis=tb.Float32Col(dflt=1, pos=26),
        hillas_kurtosis_reco=tb.Float32Col(dflt=1, pos=27),
        leakage_intensity_width_1_reco=tb.Float32Col(dflt=np.nan, pos=28),
        leakage_intensity_width_2_reco=tb.Float32Col(dflt=np.nan, pos=29),
        leakage_intensity_width_1=tb.Float32Col(dflt=np.nan, pos=30),
        leakage_intensity_width_2=tb.Float32Col(dflt=np.nan, pos=31),
        # The following are missing from current ctapipe DL1 output
        # Not sure if it's worth to add them
        hillas_ellipticity_reco=tb.FloatCol(dflt=1, pos=32),
        hillas_ellipticity=tb.FloatCol(dflt=1, pos=33),
        max_signal_cam=tb.Float32Col(dflt=1, pos=34),
        pixels=tb.Int16Col(dflt=1, pos=35),
        clusters=tb.Int16Col(dflt=-1, pos=36),
        # ======================================================================
        # DL2 - DIRECTION RECONSTRUCTION
        impact_dist=tb.Float32Col(dflt=1, pos=37),
        h_max=tb.Float32Col(dflt=1, pos=38),
        alt=tb.Float32Col(dflt=np.nan, pos=39),
        az=tb.Float32Col(dflt=np.nan, pos=40),
        err_est_pos=tb.Float32Col(dflt=1, pos=41),
        err_est_dir=tb.Float32Col(dflt=1, pos=42),
        xi=tb.Float32Col(dflt=np.nan, pos=43),
        offset=tb.Float32Col(dflt=np.nan, pos=44),
        mc_core_x=tb.FloatCol(dflt=1, pos=45),
        mc_core_y=tb.FloatCol(dflt=1, pos=46),
        reco_core_x=tb.FloatCol(dflt=1, pos=47),
        reco_core_y=tb.FloatCol(dflt=1, pos=48),
        mc_h_first_int=tb.FloatCol(dflt=1, pos=49),
        mc_x_max=tb.Float32Col(dflt=np.nan, pos=50),
        is_valid=tb.BoolCol(dflt=False, pos=51),
        good_image=tb.Int16Col(dflt=1, pos=52),
        # ======================================================================
        # DL2 - ENERGY ESTIMATION
        true_energy=tb.FloatCol(dflt=1, pos=53),
        reco_energy=tb.FloatCol(dflt=np.nan, pos=54),
        reco_energy_tel=tb.Float32Col(dflt=np.nan, pos=55),
        # ======================================================================
        # DL1 IMAGES
        # this is optional data saved by the user
        # since these data declarations require to know how many pixels
        # each saved image will have,
        # we add them later on, right before creating the table
        # We list them here for reference
        # true_image=tb.Float32Col(shape=(1855), pos=56),
        # reco_image=tb.Float32Col(shape=(1855), pos=57),
        # cleaning_mask_reco=tb.BoolCol(shape=(1855), pos=58),  # not in ctapipe
        # cleaning_mask_clusters=tb.BoolCol(shape=(1855), pos=59),  # not in ctapipe
    )

    outfile = tb.open_file(args.outfile, mode="w")
    outTable = {}
    outData = {}

    for i, filename in enumerate(filenamelist):

        print("file: {} filename = {}".format(i, filename))

        source = event_source(input_url=filename,
                              allowed_tels=allowed_tels,
                              max_events=args.max_events)

        # loop that cleans and parametrises the images and performs the
        # reconstruction for each event
        for (
                event,
                reco_image,
                cleaning_mask_reco,
                cleaning_mask_clusters,
                true_image,
                n_pixel_dict,
                hillas_dict,
                hillas_dict_reco,
                leakage_dict,
                n_tels,
                max_signals,
                n_cluster_dict,
                reco_result,
                impact_dict,
                good_event,
                good_for_reco,
        ) in preper.prepare_event(source,
                                  save_images=args.save_images,
                                  debug=args.debug):

            # Angular quantities
            run_array_direction = event.mcheader.run_array_direction

            if good_event:

                xi = angular_separation(event.mc.az, event.mc.alt,
                                        reco_result.az, reco_result.alt)

                offset = angular_separation(
                    run_array_direction[0],  # az
                    run_array_direction[1],  # alt
                    reco_result.az,
                    reco_result.alt,
                )

                # Impact parameter
                reco_core_x = reco_result.core_x
                reco_core_y = reco_result.core_y

                # Height of shower maximum
                h_max = reco_result.h_max
                # ToDo: add conversion to number of radiation lengths,
                # needs an atmosphere profile

                is_valid = True

            else:  # something went wrong and the shower's reconstruction failed

                xi = np.nan * u.deg
                offset = np.nan * u.deg
                reco_core_x = np.nan * u.m
                reco_core_y = np.nan * u.m
                h_max = np.nan * u.m
                reco_result.alt = np.nan * u.deg
                reco_result.az = np.nan * u.deg
                is_valid = False

            reco_energy = np.nan
            reco_energy_tel = dict()

            # Not optimal at all: two loops over the telescopes!
            # For energy estimation
            # Estimate energy only if the shower was reconstructed
            if (args.estimate_energy is True) and is_valid:
                weight_tel = np.zeros(len(hillas_dict.keys()))
                energy_tel = np.zeros(len(hillas_dict.keys()))

                for idx, tel_id in enumerate(hillas_dict.keys()):

                    # use only images that survived cleaning and
                    # parametrization
                    if not good_for_reco[tel_id]:
                        # bad images will get an undetermined energy
                        # this is a per-telescope energy
                        # NOT the estimated energy for the shower
                        reco_energy_tel[tel_id] = np.nan
                        continue

                    cam_id = source.subarray.tel[tel_id].camera.camera_name
                    moments = hillas_dict[tel_id]
                    model = regressor.model_dict[cam_id]

                    features_img = np.array([
                        np.log10(moments.intensity),
                        np.log10(impact_dict[tel_id].value),
                        moments.width.value,
                        moments.length.value,
                        h_max.value,
                    ])

                    energy_tel[idx] = model.predict([features_img])
                    weight_tel[idx] = moments.intensity
                    reco_energy_tel[tel_id] = energy_tel[idx]

                reco_energy = np.sum(weight_tel * energy_tel) / sum(weight_tel)
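                # Images flagged as not good_for_reco entered both sums
                # with zero energy and zero weight, so this intensity-
                # weighted mean runs only over valid per-image estimates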
            else:
                for idx, tel_id in enumerate(hillas_dict.keys()):
                    reco_energy_tel[tel_id] = np.nan

            for idx, tel_id in enumerate(hillas_dict.keys()):
                cam_id = source.subarray.tel[tel_id].camera.camera_name

                if cam_id not in outData:

                    if args.save_images is True:
                        # we define and save images content here, to make it
                        # adaptive to different cameras

                        n_pixels = source.subarray.tel[
                            tel_id].camera.geometry.n_pixels
                        DataTrainingOutput["true_image"] = tb.Float32Col(
                            shape=(n_pixels), pos=56)
                        DataTrainingOutput["reco_image"] = tb.Float32Col(
                            shape=(n_pixels), pos=57)
                        DataTrainingOutput["cleaning_mask_reco"] = tb.BoolCol(
                            shape=(n_pixels), pos=58)  # not in ctapipe
                        DataTrainingOutput[
                            "cleaning_mask_clusters"] = tb.BoolCol(
                                shape=(n_pixels), pos=59)  # not in ctapipe

                    outTable[cam_id] = outfile.create_table(
                        "/",
                        cam_id,
                        DataTrainingOutput,
                    )
                    outData[cam_id] = outTable[cam_id].row

                moments = hillas_dict[tel_id]
                ellipticity = moments.width / moments.length

                # Write to file also the Hillas parameters that have been used
                # to calculate reco_results

                moments_reco = hillas_dict_reco[tel_id]
                ellipticity_reco = moments_reco.width / moments_reco.length

                outData[cam_id]["good_image"] = good_for_reco[tel_id]
                outData[cam_id]["is_valid"] = is_valid
                outData[cam_id]["impact_dist"] = impact_dict[tel_id].to(
                    "m").value
                outData[cam_id]["max_signal_cam"] = max_signals[tel_id]
                outData[cam_id]["hillas_intensity"] = moments.intensity
                outData[cam_id]["N_LST"] = n_tels["LST_LST_LSTCam"]
                outData[cam_id]["N_MST"] = (n_tels["MST_MST_NectarCam"] +
                                            n_tels["MST_MST_FlashCam"] +
                                            n_tels["MST_SCT_SCTCam"])
                outData[cam_id]["N_SST"] = (n_tels["SST_1M_DigiCam"] +
                                            n_tels["SST_ASTRI_ASTRICam"] +
                                            n_tels["SST_GCT_CHEC"])
                outData[cam_id]["hillas_width"] = moments.width.to("deg").value
                outData[cam_id]["hillas_length"] = moments.length.to(
                    "deg").value
                outData[cam_id]["hillas_psi"] = moments.psi.to("deg").value
                outData[cam_id]["hillas_skewness"] = moments.skewness
                outData[cam_id]["hillas_kurtosis"] = moments.kurtosis
                outData[cam_id]["h_max"] = h_max.to("m").value
                outData[cam_id]["err_est_pos"] = np.nan
                outData[cam_id]["err_est_dir"] = np.nan
                outData[cam_id]["true_energy"] = event.mc.energy.to(
                    "TeV").value
                outData[cam_id]["hillas_x"] = moments.x.to("deg").value
                outData[cam_id]["hillas_y"] = moments.y.to("deg").value
                outData[cam_id]["hillas_phi"] = moments.phi.to("deg").value
                outData[cam_id]["hillas_r"] = moments.r.to("deg").value

                outData[cam_id]["pixels"] = n_pixel_dict[tel_id]
                outData[cam_id]["obs_id"] = event.index.obs_id
                outData[cam_id]["event_id"] = event.index.event_id
                outData[cam_id]["tel_id"] = tel_id
                outData[cam_id]["xi"] = xi.to("deg").value
                outData[cam_id]["reco_energy"] = reco_energy
                outData[cam_id]["hillas_ellipticity"] = ellipticity.value
                outData[cam_id]["clusters"] = n_cluster_dict[tel_id]
                outData[cam_id]["n_tel_discri"] = n_tels["GOOD images"]
                outData[cam_id]["mc_core_x"] = event.mc.core_x.to("m").value
                outData[cam_id]["mc_core_y"] = event.mc.core_y.to("m").value
                outData[cam_id]["reco_core_x"] = reco_core_x.to("m").value
                outData[cam_id]["reco_core_y"] = reco_core_y.to("m").value
                outData[cam_id]["mc_h_first_int"] = event.mc.h_first_int.to(
                    "m").value
                outData[cam_id]["offset"] = offset.to("deg").value
                outData[cam_id]["mc_x_max"] = event.mc.x_max.value  # g / cm2
                outData[cam_id]["alt"] = reco_result.alt.to("deg").value
                outData[cam_id]["az"] = reco_result.az.to("deg").value
                outData[cam_id]["reco_energy_tel"] = reco_energy_tel[tel_id]
                # Variables from hillas_dist_reco
                outData[cam_id]["n_tel_reco"] = n_tels["GOOD images"]
                outData[cam_id]["hillas_x_reco"] = moments_reco.x.to(
                    "deg").value
                outData[cam_id]["hillas_y_reco"] = moments_reco.y.to(
                    "deg").value
                outData[cam_id]["hillas_phi_reco"] = moments_reco.phi.to(
                    "deg").value
                outData[cam_id][
                    "hillas_ellipticity_reco"] = ellipticity_reco.value
                outData[cam_id]["hillas_r_reco"] = moments_reco.r.to(
                    "deg").value
                outData[cam_id]["hillas_skewness_reco"] = moments_reco.skewness
                outData[cam_id]["hillas_kurtosis_reco"] = moments_reco.kurtosis
                outData[cam_id]["hillas_width_reco"] = moments_reco.width.to(
                    "deg").value
                outData[cam_id]["hillas_length_reco"] = moments_reco.length.to(
                    "deg").value
                outData[cam_id]["hillas_psi_reco"] = moments_reco.psi.to(
                    "deg").value
                outData[cam_id][
                    "hillas_intensity_reco"] = moments_reco.intensity
                outData[cam_id][
                    "leakage_intensity_width_1_reco"] = leakage_dict[tel_id][
                        "leak1_reco"]
                outData[cam_id][
                    "leakage_intensity_width_2_reco"] = leakage_dict[tel_id][
                        "leak2_reco"]
                outData[cam_id]["leakage_intensity_width_1"] = leakage_dict[
                    tel_id]["leak1"]
                outData[cam_id]["leakage_intensity_width_2"] = leakage_dict[
                    tel_id]["leak2"]

                # =======================
                # IMAGES INFORMATION
                # =======================

                if args.save_images is True:
                    # we define and save images content here, to make it
                    # adaptive to different cameras

                    outData[cam_id]["true_image"] = true_image[tel_id]
                    outData[cam_id]["reco_image"] = reco_image[tel_id]
                    outData[cam_id]["cleaning_mask_reco"] = cleaning_mask_reco[
                        tel_id]
                    outData[cam_id][
                        "cleaning_mask_clusters"] = cleaning_mask_clusters[
                            tel_id]
                # =======================

                outData[cam_id].append()

            if signal_handler.stop:
                break
        if signal_handler.stop:
            break
    # make sure that all the events are properly stored
    for table in outTable.values():
        table.flush()

    print(bcolors.BOLD +
          "\n\n==================================================\n" +
          "Statistical summary of processed events and images\n" +
          "==================================================\n"
          # + bcolors.ENDC
          )

    evt_cutflow()

    # Catch specific cases
    triggered_events = evt_cutflow.cuts["min2Tels trig"][1]
    reconstructed_events = evt_cutflow.cuts["min2Tels reco"][1]

    if triggered_events == 0:
        print("\033[93mWARNING: No events have been triggered"
              " by the selected telescopes! \033[0m")
    else:
        print("\n")
        img_cutflow()
        if reconstructed_events == 0:
            print("\033[93m WARNING: None of the triggered events have been "
                  "properly reconstructed by the selected telescopes!\n"
                  "DL1 file will be empty! \033[0m")
        print(bcolors.ENDC)
Example #7
0
def main():

    # Read arguments
    parser = argparse.ArgumentParser(description='Make performance files')
    parser.add_argument('--config_file', type=str, required=True, help='')

    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument('--wave',
                            dest="mode",
                            action='store_const',
                            const="wave",
                            default="tail",
                            help="if set, use wavelet cleaning")
    mode_group.add_argument('--tail',
                            dest="mode",
                            action='store_const',
                            const="tail",
                            help="if set, use tail cleaning (default)")

    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Create output directory if necessary
    outdir = os.path.join(
        cfg['general']['outdir'],
        'performance_protopipe_{}_CTA{}_{}_Zd{}_{}_Time{:.2f}{}'.format(
            cfg['general']['prod'], cfg['general']['site'],
            cfg['general']['array'], cfg['general']['zenith'],
            cfg['general']['azimuth'], cfg['analysis']['obs_time']['value'],
            cfg['analysis']['obs_time']['unit']),
    )

    indir = cfg['general']['indir']
    template_input_file = cfg['general']['template_input_file']

    T_OBS = cfg['analysis']['obs_time']['value'] * u.Unit(
        cfg['analysis']['obs_time']['unit'])

    # scaling between on and off region.
    # Make off region 5 times larger than on region for better
    # background statistics
    ALPHA = cfg['analysis']['alpha']
    # Radius to use for calculating bg rate
    MAX_BG_RADIUS = cfg['analysis']['max_bg_radius'] * u.deg

    particles = {
        "gamma": {
            "file":
            os.path.join(indir, template_input_file.format(args.mode,
                                                           "gamma")),
            "target_spectrum":
            CRAB_HEGRA,
            "run_header":
            cfg['particle_information']['gamma']
        },
        "proton": {
            "file":
            os.path.join(indir,
                         template_input_file.format(args.mode, "proton")),
            "target_spectrum":
            IRFDOC_PROTON_SPECTRUM,
            "run_header":
            cfg['particle_information']['proton']
        },
        "electron": {
            "file":
            os.path.join(indir,
                         template_input_file.format(args.mode, "electron")),
            "target_spectrum":
            IRFDOC_ELECTRON_SPECTRUM,
            "run_header":
            cfg['particle_information']['electron']
        },
    }

    logging.basicConfig(level=logging.INFO)
    logging.getLogger("pyirf").setLevel(logging.DEBUG)

    for particle_type, p in particles.items():
        log.info(f"Simulated {particle_type.title()} Events:")
        p["events"], p["simulation_info"] = read_DL2_pyirf(
            p["file"], p["run_header"])

        # Multiplicity cut
        p["events"] = p["events"][
            p["events"]["multiplicity"] >= cfg['analysis']
            ['cut_on_multiplicity']].copy()

        p["simulated_spectrum"] = PowerLaw.from_simulation(
            p["simulation_info"], T_OBS)
        # Weight events
        p["events"]["weight"] = calculate_event_weights(
            p["events"]["true_energy"], p["target_spectrum"],
            p["simulated_spectrum"])

        for prefix in ('true', 'reco'):
            k = f"{prefix}_source_fov_offset"
            p["events"][k] = calculate_source_fov_offset(p["events"],
                                                         prefix=prefix)

        # calculate theta / distance between reco and assumed source position
        # we handle only ON observations here, so the assumed source pos
        # is the pointing position
        p["events"]["theta"] = calculate_theta(
            p["events"],
            assumed_source_az=p["events"]["pointing_az"],
            assumed_source_alt=p["events"]["pointing_alt"],
        )
        log.info(p["simulation_info"])
        log.info("")

    gammas = particles["gamma"]["events"]
    # background table composed of both electrons and protons
    background = table.vstack(
        [particles["proton"]["events"], particles["electron"]["events"]])

    MAX_GH_CUT_EFFICIENCY = 0.8
    GH_CUT_EFFICIENCY_STEP = 0.01

    # gh cut used for the first calculation of the binned theta cuts
    INITIAL_GH_CUT_EFFICIENCY = 0.4

    INITIAL_GH_CUT = np.quantile(gammas['gh_score'],
                                 (1 - INITIAL_GH_CUT_EFFICIENCY))
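    # Cutting at the (1 - efficiency) quantile of the gamma gh_score
    # distribution keeps roughly that fraction of gammas: with an
    # efficiency of 0.4 the cut sits at the 60th percentile, so ~40 %
    # of gamma events have gh_score above INITIAL_GH_CUT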
    log.info(
        f"Using fixed G/H cut of {INITIAL_GH_CUT} to calculate theta cuts")

    # event display uses much finer bins for the theta cut than
    # for the sensitivity
    theta_bins = add_overflow_bins(
        create_bins_per_decade(
            10**(-1.9) * u.TeV,
            10**2.3005 * u.TeV,
            50,
        ))

    # theta cut is 68 percent containment of the gammas
    # for now with a fixed global, unoptimized score cut
    mask_theta_cuts = gammas["gh_score"] >= INITIAL_GH_CUT
    theta_cuts = calculate_percentile_cut(
        gammas["theta"][mask_theta_cuts],
        gammas["reco_energy"][mask_theta_cuts],
        bins=theta_bins,
        min_value=0.05 * u.deg,
        fill_value=0.32 * u.deg,
        max_value=0.32 * u.deg,
        percentile=68,
    )
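    # calculate_percentile_cut gives one theta cut per reco-energy bin:
    # the 68th percentile of theta, clipped to [min_value, max_value],
    # with fill_value used for bins that contain too few events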

    # same bins as event display uses
    sensitivity_bins = add_overflow_bins(
        create_bins_per_decade(10**-1.9 * u.TeV,
                               10**2.31 * u.TeV,
                               bins_per_decade=5))

    log.info("Optimizing G/H separation cut for best sensitivity")
    gh_cut_efficiencies = np.arange(
        GH_CUT_EFFICIENCY_STEP,
        MAX_GH_CUT_EFFICIENCY + GH_CUT_EFFICIENCY_STEP / 2,
        GH_CUT_EFFICIENCY_STEP)
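    # np.arange excludes its stop value, so adding half a step ensures
    # MAX_GH_CUT_EFFICIENCY itself is scanned despite float rounding:
    # the tested efficiencies are 0.01, 0.02, ..., 0.80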
    sensitivity_step_2, gh_cuts = optimize_gh_cut(
        gammas,
        background,
        reco_energy_bins=sensitivity_bins,
        gh_cut_efficiencies=gh_cut_efficiencies,
        op=operator.ge,
        theta_cuts=theta_cuts,
        alpha=ALPHA,
        background_radius=MAX_BG_RADIUS,
    )

    # now that we have the optimized gh cuts, we recalculate the theta
    # cut as 68 percent containment on the events surviving these cuts.
    log.info('Recalculating theta cut for optimized GH Cuts')
    for tab in (gammas, background):
        tab["selected_gh"] = evaluate_binned_cut(tab["gh_score"],
                                                 tab["reco_energy"], gh_cuts,
                                                 operator.ge)

    theta_cuts_opt = calculate_percentile_cut(
        gammas[gammas['selected_gh']]["theta"],
        gammas[gammas['selected_gh']]["reco_energy"],
        theta_bins,
        percentile=68,
        fill_value=0.32 * u.deg,
        max_value=0.32 * u.deg,
        min_value=0.05 * u.deg,
    )

    gammas["selected_theta"] = evaluate_binned_cut(gammas["theta"],
                                                   gammas["reco_energy"],
                                                   theta_cuts_opt, operator.le)
    gammas["selected"] = gammas["selected_theta"] & gammas["selected_gh"]

    # calculate sensitivity
    signal_hist = create_histogram_table(gammas[gammas["selected"]],
                                         bins=sensitivity_bins)
    background_hist = estimate_background(
        background[background["selected_gh"]],
        reco_energy_bins=sensitivity_bins,
        theta_cuts=theta_cuts_opt,
        alpha=ALPHA,
        background_radius=MAX_BG_RADIUS,
    )
    sensitivity = calculate_sensitivity(signal_hist,
                                        background_hist,
                                        alpha=ALPHA)

    # scale relative sensitivity by Crab flux to get the flux sensitivity
    spectrum = particles['gamma']['target_spectrum']
    for s in (sensitivity_step_2, sensitivity):
        s["flux_sensitivity"] = (s["relative_sensitivity"] *
                                 spectrum(s['reco_energy_center']))

    log.info('Calculating IRFs')
    hdus = [
        fits.PrimaryHDU(),
        fits.BinTableHDU(sensitivity, name="SENSITIVITY"),
        fits.BinTableHDU(sensitivity_step_2, name="SENSITIVITY_STEP_2"),
        fits.BinTableHDU(theta_cuts, name="THETA_CUTS"),
        fits.BinTableHDU(theta_cuts_opt, name="THETA_CUTS_OPT"),
        fits.BinTableHDU(gh_cuts, name="GH_CUTS"),
    ]

    masks = {
        "": gammas["selected"],
        "_NO_CUTS": slice(None),
        "_ONLY_GH": gammas["selected_gh"],
        "_ONLY_THETA": gammas["selected_theta"],
    }

    # binnings for the irfs
    true_energy_bins = add_overflow_bins(
        create_bins_per_decade(10**-1.9 * u.TeV, 10**2.31 * u.TeV, 10))
    reco_energy_bins = add_overflow_bins(
        create_bins_per_decade(10**-1.9 * u.TeV, 10**2.31 * u.TeV, 5))
    fov_offset_bins = [0, 0.5] * u.deg
    source_offset_bins = np.arange(0, 1 + 1e-4, 1e-3) * u.deg
    energy_migration_bins = np.geomspace(0.2, 5, 200)
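    # migration is the ratio reco_energy / true_energy; log-spaced bins
    # between 0.2 and 5 cover energy under- and over-estimation by up
    # to a factor of five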

    for label, mask in masks.items():
        effective_area = effective_area_per_energy(
            gammas[mask],
            particles["gamma"]["simulation_info"],
            true_energy_bins=true_energy_bins,
        )
        hdus.append(
            create_aeff2d_hdu(
                effective_area[..., np.newaxis],  # +1 dimension for FOV offset
                true_energy_bins,
                fov_offset_bins,
                extname="EFFECTIVE_AREA" + label,
            ))
        edisp = energy_dispersion(
            gammas[mask],
            true_energy_bins=true_energy_bins,
            fov_offset_bins=fov_offset_bins,
            migration_bins=energy_migration_bins,
        )
        hdus.append(
            create_energy_dispersion_hdu(
                edisp,
                true_energy_bins=true_energy_bins,
                migration_bins=energy_migration_bins,
                fov_offset_bins=fov_offset_bins,
                extname="ENERGY_DISPERSION" + label,
            ))

    # Here we use reconstructed energy instead of true energy for the
    # sake of comparisons between current pipelines
    bias_resolution = energy_bias_resolution(gammas[gammas["selected"]],
                                             reco_energy_bins,
                                             energy_type="reco")

    # Here we use reconstructed energy instead of true energy for the
    # sake of comparisons between current pipelines
    ang_res = angular_resolution(gammas[gammas["selected_gh"]],
                                 reco_energy_bins,
                                 energy_type="reco")

    psf = psf_table(
        gammas[gammas["selected_gh"]],
        true_energy_bins,
        fov_offset_bins=fov_offset_bins,
        source_offset_bins=source_offset_bins,
    )

    background_rate = background_2d(
        background[background['selected_gh']],
        reco_energy_bins,
        fov_offset_bins=np.arange(0, 11) * u.deg,
        t_obs=T_OBS,
    )

    hdus.append(
        create_background_2d_hdu(
            background_rate,
            reco_energy_bins,
            fov_offset_bins=np.arange(0, 11) * u.deg,
        ))
    hdus.append(
        create_psf_table_hdu(
            psf,
            true_energy_bins,
            source_offset_bins,
            fov_offset_bins,
        ))
    hdus.append(
        create_rad_max_hdu(theta_cuts_opt["cut"][:, np.newaxis], theta_bins,
                           fov_offset_bins))
    hdus.append(fits.BinTableHDU(ang_res, name="ANGULAR_RESOLUTION"))
    hdus.append(
        fits.BinTableHDU(bias_resolution, name="ENERGY_BIAS_RESOLUTION"))

    log.info('Writing outputfile')
    fits.HDUList(hdus).writeto(outdir + '.fits.gz', overwrite=True)
Example #8
0
def main():

    # Argument parser
    parser = make_argparser()

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print debugging information",
    )

    parser.add_argument("--regressor_dir",
                        default="./",
                        help="regressors directory")
    parser.add_argument("--classifier_dir",
                        default="./",
                        help="regressors directory")
    parser.add_argument(
        "--force_tailcut_for_extended_cleaning",
        type=str2bool,
        default=False,
        help="For tailcut cleaning for energy/score estimation",
    )
    parser.add_argument(
        "--save_images",
        action="store_true",
        help="Save images in images.h5 (one file testing)",
    )

    parser.add_argument(
        "--regressor_config",
        type=str,
        default=None,
        help="Configuration file used to produce regressor model")
    parser.add_argument(
        "--classifier_config",
        type=str,
        default=None,
        help="Configuration file used to produce classification model")

    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    try:  # If the user didn't specify a site and/or an array...
        site = cfg["General"]["site"]
        array = cfg["General"]["array"]
    except KeyError:  # ...raise an error and exit.
        print(bcolors.FAIL +
              "ERROR: make sure that both 'site' and 'array' are " +
              "specified in the analysis configuration file!" + bcolors.ENDC)
        exit()

    # Add force_tailcut_for_extended_cleaning in configuration
    cfg["General"][
        "force_tailcut_for_extended_cleaning"] = args.force_tailcut_for_extended_cleaning
    cfg["General"]["force_mode"] = "tail"
    force_mode = args.mode
    if cfg["General"]["force_tailcut_for_extended_cleaning"] is True:
        force_mode = "tail"
    print("force_mode={}".format(force_mode))
    print("mode={}".format(args.mode))

    if args.infile_list:
        filenamelist = []
        for f in args.infile_list:
            filenamelist += glob("{}/{}".format(args.indir, f))
        filenamelist.sort()

    if not filenamelist:
        print("no files found; check indir: {}".format(args.indir))
        exit(-1)

    # Get the IDs of the involved telescopes and associated cameras together
    # with the equivalent focal lengths from the first event
    allowed_tels, cams_and_foclens, subarray = prod3b_array(
        filenamelist[0], site, array)

    # keeping track of events and where they were rejected
    evt_cutflow = CutFlow("EventCutFlow")
    img_cutflow = CutFlow("ImageCutFlow")

    # Event preparer
    preper = EventPreparer(
        config=cfg,
        subarray=subarray,
        cams_and_foclens=cams_and_foclens,
        mode=args.mode,
        event_cutflow=evt_cutflow,
        image_cutflow=img_cutflow,
    )

    # Regressor and classifier methods
    regressor_method = cfg["EnergyRegressor"]["method_name"]
    classifier_method = cfg["GammaHadronClassifier"]["method_name"]
    use_proba_for_classifier = cfg["GammaHadronClassifier"]["use_proba"]

    if regressor_method in ["None", "none", None]:
        print(bcolors.OKBLUE +
              "The energy of the event will NOT be estimated." + bcolors.ENDC)
        use_regressor = False
    else:
        use_regressor = True

    if classifier_method in ["None", "none", None]:
        if args.debug:
            print(bcolors.OKBLUE +
                  "The particle type of the event will NOT be estimated." +
                  bcolors.ENDC)
        use_classifier = False
    else:
        use_classifier = True

    # Classifiers
    if use_classifier:

        # Read configuration file
        classifier_config = load_config(args.classifier_config)

        classifier_files = (args.classifier_dir +
                            "/classifier_{cam_id}_{classifier}.pkl.gz")
        clf_file = classifier_files.format(
            **{
                "mode": force_mode,
                "wave_args": "mixed",
                "classifier": classifier_method,
                "cam_id": "{cam_id}",
            })
        classifiers = load_models(clf_file,
                                  cam_id_list=cams_and_foclens.keys())
        if args.debug:
            print(bcolors.OKBLUE +
                  "The particle type of the event will be estimated" +
                  " using the models stored in" + f" {args.classifier_dir}\n" +
                  bcolors.ENDC)

    # Regressors
    if use_regressor:

        # Read configuration file
        regressor_config = load_config(args.regressor_config)

        regressor_files = (args.regressor_dir +
                           "/regressor_{cam_id}_{regressor}.pkl.gz")
        reg_file = regressor_files.format(
            **{
                "mode": force_mode,
                "wave_args": "mixed",
                "regressor": regressor_method,
                "cam_id": "{cam_id}",
            })
        regressors = load_models(reg_file, cam_id_list=cams_and_foclens.keys())
        if args.debug:
            print(bcolors.OKBLUE +
                  "The energy of the event will be estimated" +
                  " using the models stored in" + f" {args.regressor_dir}\n" +
                  bcolors.ENDC)

    # catch ctrl-c signal to exit the current loop and still display results
    signal_handler = SignalHandler()
    signal.signal(signal.SIGINT, signal_handler)

    # Declaration of the column descriptor for the (possible) images file
    StoredImages = dict(
        event_id=tb.Int32Col(dflt=1, pos=0),
        tel_id=tb.Int16Col(dflt=1, pos=1)
        # reco_image, true_image and cleaning_mask_reco
        # are defined later since they depend on the number of pixels
    )

    # this class defines the reconstruction parameters to keep track of
    class RecoEvent(tb.IsDescription):
        obs_id = tb.Int16Col(dflt=-1, pos=0)
        event_id = tb.Int32Col(dflt=-1, pos=1)
        NTels_trig = tb.Int16Col(dflt=0, pos=2)
        NTels_reco = tb.Int16Col(dflt=0, pos=3)
        NTels_reco_lst = tb.Int16Col(dflt=0, pos=4)
        NTels_reco_mst = tb.Int16Col(dflt=0, pos=5)
        NTels_reco_sst = tb.Int16Col(dflt=0, pos=6)
        pointing_az = tb.Float32Col(dflt=np.nan, pos=7)
        pointing_alt = tb.Float32Col(dflt=np.nan, pos=8)
        true_az = tb.Float32Col(dflt=np.nan, pos=9)
        true_alt = tb.Float32Col(dflt=np.nan, pos=10)
        true_energy = tb.Float32Col(dflt=np.nan, pos=11)
        reco_energy = tb.Float32Col(dflt=np.nan, pos=12)
        reco_alt = tb.Float32Col(dflt=np.nan, pos=13)
        reco_az = tb.Float32Col(dflt=np.nan, pos=14)
        offset = tb.Float32Col(dflt=np.nan, pos=15)
        xi = tb.Float32Col(dflt=np.nan, pos=16)
        ErrEstPos = tb.Float32Col(dflt=np.nan, pos=17)
        ErrEstDir = tb.Float32Col(dflt=np.nan, pos=18)
        gammaness = tb.Float32Col(dflt=np.nan, pos=19)
        success = tb.BoolCol(dflt=False, pos=20)
        score = tb.Float32Col(dflt=np.nan, pos=21)
        h_max = tb.Float32Col(dflt=np.nan, pos=22)
        reco_core_x = tb.Float32Col(dflt=np.nan, pos=23)
        reco_core_y = tb.Float32Col(dflt=np.nan, pos=24)
        true_core_x = tb.Float32Col(dflt=np.nan, pos=25)
        true_core_y = tb.Float32Col(dflt=np.nan, pos=26)
        is_valid = tb.BoolCol(dflt=False, pos=27)

    reco_outfile = tb.open_file(
        mode="w",
        # if no outfile name is given (i.e. we don't want to write the
        # event list to disk), we need to specify two "driver" arguments
        **({
            "filename": args.outfile
        } if args.outfile else {
            "filename": "no_outfile.h5",
            "driver": "H5FD_CORE",
            "driver_core_backing_store": False,
        }))
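    # The H5FD_CORE driver keeps the HDF5 file entirely in memory and,
    # with backing_store disabled, discards it on close, so nothing is
    # written to disk when no outfile was requested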

    reco_table = reco_outfile.create_table("/", "reco_events", RecoEvent)
    reco_event = reco_table.row

    # Create the images file only if the user want to store the images
    if args.save_images is True:
        images_outfile = tb.open_file("images.h5", mode="w")
        images_table = {}
        images_phe = {}

    for i, filename in enumerate(filenamelist):

        source = EventSource(input_url=filename,
                             allowed_tels=allowed_tels,
                             max_events=args.max_events)
        # loop that cleans and parametrises the images and performs the reconstruction
        for (
                event,
                reco_image,
                cleaning_mask_reco,
                cleaning_mask_clusters,
                true_image,
                n_pixel_dict,
                hillas_dict,
                hillas_dict_reco,
                leakage_dict,
                n_tels,
                max_signals,
                n_cluster_dict,
                reco_result,
                impact_dict,
                good_event,
                good_for_reco,
        ) in preper.prepare_event(source,
                                  save_images=args.save_images,
                                  debug=args.debug):

            # True direction
            true_az = event.simulation.shower.az
            true_alt = event.simulation.shower.alt

            # Array pointing in AltAz frame
            pointing_az = event.pointing.array_azimuth
            pointing_alt = event.pointing.array_altitude

            if good_event:  # aka it has been successfully reconstructed

                # Angular separation between
                # - true direction
                # - reconstruted direction
                xi = angular_separation(event.simulation.shower.az,
                                        event.simulation.shower.alt,
                                        reco_result.az, reco_result.alt)

                # Angular separation between
                # - center of the array's FoV
                # - reconstructed direction
                offset = angular_separation(
                    pointing_az,
                    pointing_alt,
                    reco_result.az,
                    reco_result.alt,
                )

                # Reconstructed height of shower maximum
                h_max = reco_result.h_max

                # Reconstructed position of the shower's core on the ground
                reco_core_x = reco_result.core_x
                reco_core_y = reco_result.core_y

                # Reconstructed direction of the shower in the sky
                alt, az = reco_result.alt, reco_result.az

                # Successfully reconstructed shower
                is_valid = True

            else:  # no successful reconstruction, assign dummy values

                xi = np.nan * u.deg
                offset = np.nan * u.deg
                reco_core_x = np.nan * u.m
                reco_core_y = np.nan * u.m
                h_max = np.nan * u.m
                alt = np.nan * u.deg
                az = np.nan * u.deg
                is_valid = False
                reco_energy = np.nan
                score = np.nan
                gammaness = np.nan
                reco_event["success"] = False

            # Estimate particle energy
            if use_regressor and is_valid:
                energy_tel = np.zeros(len(hillas_dict.keys()))
                energy_tel_classifier = {}
                weight_tel = np.zeros(len(hillas_dict.keys()))

                for idx, tel_id in enumerate(hillas_dict.keys()):

                    cam_id = source.subarray.tel[tel_id].camera.camera_name
                    moments = hillas_dict[tel_id]

                    model = regressors[cam_id]

                    ############################################################
                    #                  GET FEATURES
                    ############################################################

                    # Read feature list from the model configuration file
                    features_basic = regressor_config["FeatureList"]["Basic"]
                    features_derived = regressor_config["FeatureList"][
                        "Derived"]
                    features = features_basic + list(features_derived)

                    # Create a pandas Dataframe with basic quantities
                    # This is needed in order to connect the I/O system of the
                    # model inputs to the in-memory computation of this script
                    data = pd.DataFrame({
                        "hillas_intensity": [moments.intensity],
                        "hillas_width": [moments.width.to("deg").value],
                        "hillas_length": [moments.length.to("deg").value],
                        "hillas_x": [moments.x.to("deg").value],
                        "hillas_y": [moments.y.to("deg").value],
                        "hillas_phi": [moments.phi.to("deg").value],
                        "hillas_r": [moments.r.to("deg").value],
                        "leakage_intensity_width_1_reco":
                        [leakage_dict[tel_id]['leak1_reco']],
                        "leakage_intensity_width_2_reco":
                        [leakage_dict[tel_id]['leak2_reco']],
                        "leakage_intensity_width_1":
                        [leakage_dict[tel_id]['leak1']],
                        "leakage_intensity_width_2":
                        [leakage_dict[tel_id]['leak2']],
                        "az": [reco_result.az.to("deg").value],
                        "alt": [reco_result.alt.to("deg").value],
                        "h_max": [h_max.value],
                        "impact_dist": [impact_dict[tel_id].to("m").value],
                    })

                    # Compute derived features and add them to the dataframe
                    for key, expression in features_derived.items():
                        if key not in data:
                            data.eval(f'{key} = {expression}', inplace=True)

                    # Sort the features alphabetically to match the order
                    # used by model.fit in protopipe.mva
                    features = sorted(features)

                    # Select the values for the full set of features
                    features_values = data[features].to_numpy()

                    ############################################################

                    if good_for_reco[tel_id] == 1:
                        energy_tel[idx] = model.predict(features_values)
                    else:
                        energy_tel[idx] = np.nan

                    weight_tel[idx] = moments.intensity

                    # Record the value regardless of validity;
                    # it is used later as a feature for the classifier
                    energy_tel_classifier[tel_id] = energy_tel[idx]

                # Use only images with valid estimated energies to calculate
                # the average
                energy_tel_selected = energy_tel[~np.isnan(energy_tel)]
                weight_tel_selected = weight_tel[~np.isnan(energy_tel)]

                # Compute the intensity-weighted average energy of the shower;
                # if no image had a valid estimate, record it as NaN
                if len(energy_tel_selected) == 0:
                    reco_energy = np.nan
                    energy_estimated = False
                else:
                    reco_energy = np.sum(
                        weight_tel_selected *
                        energy_tel_selected) / sum(weight_tel_selected)
                    energy_estimated = True
            else:
                reco_energy = np.nan
                energy_estimated = False
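            # The combination above is an intensity-weighted mean over the
            # telescopes with a valid prediction; a minimal, equivalent
            # sketch (hypothetical helper, not part of this script):
            #
            #     def weighted_nanmean(values, weights):
            #         mask = ~np.isnan(values)
            #         if not mask.any():
            #             return np.nan
            #         return np.sum(weights[mask] * values[mask]) / \
            #             np.sum(weights[mask])
            #
            #     reco_energy = weighted_nanmean(energy_tel, weight_tel)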

            # Estimate particle score/gammaness
            if use_classifier and is_valid:
                score_tel = np.zeros(len(hillas_dict.keys()))
                gammaness_tel = np.zeros(len(hillas_dict.keys()))
                weight_tel = np.zeros(len(hillas_dict.keys()))

                for idx, tel_id in enumerate(hillas_dict.keys()):

                    cam_id = source.subarray.tel[tel_id].camera.camera_name
                    moments = hillas_dict[tel_id]

                    model = classifiers[cam_id]

                    ############################################################
                    #                  GET FEATURES
                    ############################################################

                    # Read feature list from the model configuration file
                    features_basic = classifier_config["FeatureList"]["Basic"]
                    features_derived = classifier_config["FeatureList"][
                        "Derived"]
                    features = features_basic + list(features_derived)

                    # Create a pandas Dataframe with basic quantities
                    # This is needed in order to connect the I/O system of the
                    # model inputs to the in-memory computation of this script
                    data = pd.DataFrame({
                        "hillas_intensity": [moments.intensity],
                        "hillas_width": [moments.width.to("deg").value],
                        "hillas_length": [moments.length.to("deg").value],
                        "hillas_x": [moments.x.to("deg").value],
                        "hillas_y": [moments.y.to("deg").value],
                        "hillas_phi": [moments.phi.to("deg").value],
                        "hillas_r": [moments.r.to("deg").value],
                        "leakage_intensity_width_1_reco":
                        [leakage_dict[tel_id]['leak1_reco']],
                        "leakage_intensity_width_2_reco":
                        [leakage_dict[tel_id]['leak2_reco']],
                        "leakage_intensity_width_1":
                        [leakage_dict[tel_id]['leak1']],
                        "leakage_intensity_width_2":
                        [leakage_dict[tel_id]['leak2']],
                        "az": [reco_result.az.to("deg").value],
                        "alt": [reco_result.alt.to("deg").value],
                        "h_max": [h_max.value],
                        "impact_dist": [impact_dict[tel_id].to("m").value],
                        "reco_energy":
                        reco_energy,
                        "reco_energy_tel":
                        energy_tel_classifier[tel_id],
                    })

                    # Compute derived features and add them to the dataframe
                    for key, expression in features_derived.items():
                        if key not in data:
                            data.eval(f'{key} = {expression}', inplace=True)

                    # Sort the features alphabetically to match the order
                    # used by model.fit in protopipe.mva
                    features = sorted(features)

                    # Select the values for the full set of features
                    features_values = data[features].to_numpy()

                    ############################################################

                    # A valid telescope-wise energy means the image is good.
                    # WARNING: estimating the particle type currently
                    # REQUIRES a valid energy estimate, because the
                    # telescope-wise energy is used as a model feature
                    if not np.isnan(energy_tel_classifier[tel_id]):
                        # Output of classifier according to type of classifier
                        if use_proba_for_classifier is False:
                            score_tel[idx] = model.decision_function(
                                features_values)
                        else:
                            gammaness_tel[idx] = model.predict_proba(
                                features_values)[:, 1]
                        weight_tel[idx] = np.sqrt(moments.intensity)
                    else:
                        # WARNING:
                        # this is true only because we use telescope-wise
                        # energies as a feature of the model!!!
                        score_tel[idx] = np.nan
                        gammaness_tel[idx] = np.nan

                # Use only images with a valid score or gammaness
                # to calculate the average
                if use_proba_for_classifier is False:
                    score_tel_selected = score_tel[~np.isnan(score_tel)]
                    weight_tel_selected = weight_tel[~np.isnan(score_tel)]
                else:
                    gammaness_tel_selected = gammaness_tel[
                        ~np.isnan(gammaness_tel)]
                    weight_tel_selected = weight_tel[~np.isnan(gammaness_tel)]

                # Compute the weighted average score or gammaness;
                # if no image had a valid estimate, record it as NaN
                if len(weight_tel_selected) > 0:

                    # Weight the final decision/proba
                    if use_proba_for_classifier is True:
                        gammaness = np.sum(
                            weight_tel_selected *
                            gammaness_tel_selected) / sum(weight_tel_selected)
                    else:
                        score = np.sum(
                            weight_tel_selected *
                            score_tel_selected) / sum(weight_tel_selected)

                    particle_type_estimated = True

                else:

                    score = np.nan
                    gammaness = np.nan
                    particle_type_estimated = False

            else:
                score = np.nan
                gammaness = np.nan
                particle_type_estimated = False
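            # In the block above the per-telescope output is either a
            # decision-function value (an unbounded score) or
            # predict_proba(...)[:, 1], i.e. the estimated probability of
            # the gamma class; both are averaged with sqrt(intensity)
            # weights.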

            if energy_estimated and particle_type_estimated:
                reco_event["success"] = True
            else:
                if args.debug:
                    print(
                        bcolors.WARNING +
                        f"energy_estimated = {energy_estimated}\n" +
                        f"particle_type_estimated = {particle_type_estimated}\n"
                        + bcolors.ENDC)
                reco_event["success"] = False

            # If the user wants to save the images of the run
            if args.save_images is True:
                for idx, tel_id in enumerate(hillas_dict.keys()):
                    cam_id = source.subarray.tel[tel_id].camera.camera_name
                    if cam_id not in images_phe:

                        n_pixels = source.subarray.tel[
                            tel_id].camera.geometry.n_pixels
                        StoredImages["true_image"] = tb.Float32Col(
                            shape=(n_pixels), pos=2)
                        StoredImages["reco_image"] = tb.Float32Col(
                            shape=(n_pixels), pos=3)
                        StoredImages["cleaning_mask_reco"] = tb.BoolCol(
                            shape=(n_pixels), pos=4)  # not in ctapipe
                        StoredImages["cleaning_mask_clusters"] = tb.BoolCol(
                            shape=(n_pixels), pos=5)  # not in ctapipe

                        images_table[cam_id] = images_outfile.create_table(
                            "/", "_".join(["images", cam_id]), StoredImages)
                    images_phe[cam_id] = images_table[cam_id].row

                    images_phe[cam_id]["event_id"] = event.index.event_id
                    images_phe[cam_id]["tel_id"] = tel_id
                    images_phe[cam_id]["reco_image"] = reco_image[tel_id]
                    images_phe[cam_id]["true_image"] = true_image[tel_id]
                    images_phe[cam_id][
                        "cleaning_mask_reco"] = cleaning_mask_reco[tel_id]
                    images_phe[cam_id][
                        "cleaning_mask_clusters"] = cleaning_mask_clusters[
                            tel_id]

                    images_phe[cam_id].append()

            # Now we start recording the data to file
            reco_event["event_id"] = event.index.event_id
            reco_event["obs_id"] = event.index.obs_id
            reco_event["NTels_trig"] = len(event.r1.tel.keys())
            reco_event["NTels_reco"] = len(hillas_dict)
            reco_event["NTels_reco_lst"] = n_tels["LST_LST_LSTCam"]
            reco_event["NTels_reco_mst"] = (n_tels["MST_MST_NectarCam"] +
                                            n_tels["MST_MST_FlashCam"] +
                                            n_tels["MST_SCT_SCTCam"])
            reco_event["NTels_reco_sst"] = (n_tels["SST_1M_DigiCam"] +
                                            n_tels["SST_ASTRI_ASTRICam"] +
                                            n_tels["SST_GCT_CHEC"])
            reco_event["pointing_az"] = pointing_az.to("deg").value
            reco_event["pointing_alt"] = pointing_alt.to("deg").value
            reco_event["reco_energy"] = reco_energy
            reco_event["reco_alt"] = alt.to("deg").value
            reco_event["reco_az"] = az.to("deg").value
            reco_event["offset"] = offset.to("deg").value
            reco_event["xi"] = xi.to("deg").value
            reco_event["h_max"] = h_max.to("m").value
            reco_event["reco_core_x"] = reco_core_x.to("m").value
            reco_event["reco_core_y"] = reco_core_y.to("m").value
            reco_event["is_valid"] = is_valid

            if use_proba_for_classifier is True:
                reco_event["gammaness"] = gammaness
            else:
                reco_event["score"] = score
            reco_event["ErrEstPos"] = np.nan
            reco_event["ErrEstDir"] = np.nan

            # Simulated information
            shower = event.simulation.shower
            mc_core_x = shower.core_x
            mc_core_y = shower.core_y
            reco_event["true_energy"] = shower.energy.to("TeV").value
            reco_event["true_az"] = true_az.to("deg").value
            reco_event["true_alt"] = true_alt.to("deg").value
            reco_event["true_core_x"] = mc_core_x.to("m").value
            reco_event["true_core_y"] = mc_core_y.to("m").value

            # Fill the table: append the row, then flush it to disk
            reco_event.append()
            reco_table.flush()

            if signal_handler.stop:
                break
        if signal_handler.stop:
            break

    # make sure everything gets written out nicely
    reco_table.flush()

    if args.save_images is True:
        for table in images_table.values():
            table.flush()

    try:
        print()
        evt_cutflow()
        print()
        img_cutflow()

    except ZeroDivisionError:
        pass

    print("Job done!")
Example #9
def main():

    # Argument parser
    parser = make_argparser()
    parser.add_argument(
        "--estimate_energy",
        type=str2bool,
        default=False,
        help="Make estimation of energy",
    )
    parser.add_argument("--regressor_dir",
                        type=str,
                        default="./",
                        help="regressors directory")
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Read site layout
    site = cfg["General"]["site"]
    array = cfg["General"]["array"]

    if args.infile_list:
        filenamelist = []
        for f in args.infile_list:
            filenamelist += glob("{}/{}".format(args.indir, f))
        filenamelist.sort()
    else:
        raise ValueError("no input file list given; use --infile_list")

    if not filenamelist:
        print("no files found; check indir: {}".format(args.indir))
        exit(-1)
    else:
        print("found {} files".format(len(filenamelist)))

    # keeping track of events and where they were rejected
    evt_cutflow = CutFlow("EventCutFlow")
    img_cutflow = CutFlow("ImageCutFlow")

    preper = EventPreparer(config=cfg,
                           mode=args.mode,
                           event_cutflow=evt_cutflow,
                           image_cutflow=img_cutflow)

    # catch ctr-c signal to exit current loop and still display results
    signal_handler = SignalHandler()
    signal.signal(signal.SIGINT, signal_handler)

    # Regressor method
    regressor_method = cfg["EnergyRegressor"]["method_name"]

    # wrapper for the scikit-learn regressor
    if args.estimate_energy is True:
        regressor_files = (args.regressor_dir +
                           "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz")
        reg_file = regressor_files.format(
            **{
                "mode": args.mode,
                "wave_args": "mixed",  # ToDo, control
                "regressor": regressor_method,
                "cam_id": "{cam_id}",
            })

        regressor = EnergyRegressor.load(reg_file, cam_id_list=args.cam_ids)

    class EventFeatures(tb.IsDescription):
        impact_dist = tb.Float32Col(dflt=1, pos=0)
        sum_signal_evt = tb.Float32Col(dflt=1, pos=1)
        max_signal_cam = tb.Float32Col(dflt=1, pos=2)
        sum_signal_cam = tb.Float32Col(dflt=1, pos=3)
        N_LST = tb.Int16Col(dflt=1, pos=4)
        N_MST = tb.Int16Col(dflt=1, pos=5)
        N_SST = tb.Int16Col(dflt=1, pos=6)
        width = tb.Float32Col(dflt=1, pos=7)
        length = tb.Float32Col(dflt=1, pos=8)
        skewness = tb.Float32Col(dflt=1, pos=9)
        kurtosis = tb.Float32Col(dflt=1, pos=10)
        h_max = tb.Float32Col(dflt=1, pos=11)
        err_est_pos = tb.Float32Col(dflt=1, pos=12)
        err_est_dir = tb.Float32Col(dflt=1, pos=13)
        mc_energy = tb.FloatCol(dflt=1, pos=14)
        local_distance = tb.Float32Col(dflt=1, pos=15)
        n_pixel = tb.Int16Col(dflt=1, pos=16)
        n_cluster = tb.Int16Col(dflt=-1, pos=17)
        obs_id = tb.Int16Col(dflt=1, pos=18)
        event_id = tb.Int32Col(dflt=1, pos=19)
        tel_id = tb.Int16Col(dflt=1, pos=20)
        xi = tb.Float32Col(dflt=np.nan, pos=21)
        reco_energy = tb.FloatCol(dflt=np.nan, pos=22)
        ellipticity = tb.FloatCol(dflt=1, pos=23)
        n_tel_reco = tb.FloatCol(dflt=1, pos=24)
        n_tel_discri = tb.FloatCol(dflt=1, pos=25)
        mc_core_x = tb.FloatCol(dflt=1, pos=26)
        mc_core_y = tb.FloatCol(dflt=1, pos=27)
        reco_core_x = tb.FloatCol(dflt=1, pos=28)
        reco_core_y = tb.FloatCol(dflt=1, pos=29)
        mc_h_first_int = tb.FloatCol(dflt=1, pos=30)
        offset = tb.Float32Col(dflt=np.nan, pos=31)
        mc_x_max = tb.Float32Col(dflt=np.nan, pos=32)
        alt = tb.Float32Col(dflt=np.nan, pos=33)
        az = tb.Float32Col(dflt=np.nan, pos=34)
        reco_energy_tel = tb.Float32Col(dflt=np.nan, pos=35)
        # from hillas_reco
        ellipticity_reco = tb.FloatCol(dflt=1, pos=36)
        local_distance_reco = tb.Float32Col(dflt=1, pos=37)
        skewness_reco = tb.Float32Col(dflt=1, pos=38)
        kurtosis_reco = tb.Float32Col(dflt=1, pos=39)
        width_reco = tb.Float32Col(dflt=1, pos=40)
        length_reco = tb.Float32Col(dflt=1, pos=41)
        psi = tb.Float32Col(dflt=1, pos=42)
        psi_reco = tb.Float32Col(dflt=1, pos=43)
        sum_signal_cam_reco = tb.Float32Col(dflt=1, pos=44)
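    # Each tb.*Col above maps to one column of the per-camera tables created
    # below; `pos` fixes the column order and `dflt` the default value.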

    feature_outfile = tb.open_file(args.outfile, mode="w")
    feature_table = {}
    feature_events = {}

    # Telescopes in analysis
    allowed_tels = set(prod3b_tel_ids(array, site=site))

    for i, filename in enumerate(filenamelist):

        print("file: {} filename = {}".format(i, filename))

        source = event_source(input_url=filename,
                              allowed_tels=allowed_tels,
                              max_events=args.max_events)

        # loop that cleans and parametrises the images and performs the reconstruction
        # for each event
        for (
                event,
                n_pixel_dict,
                hillas_dict,
                hillas_dict_reco,
                n_tels,
                tot_signal,
                max_signals,
                n_cluster_dict,
                reco_result,
                impact_dict,
        ) in preper.prepare_event(source):

            # Angular quantities
            run_array_direction = event.mcheader.run_array_direction

            xi = angular_separation(event.mc.az, event.mc.alt, reco_result.az,
                                    reco_result.alt)

            offset = angular_separation(
                run_array_direction[0],  # az
                run_array_direction[1],  # alt
                reco_result.az,
                reco_result.alt,
            )

            # Impact parameter
            reco_core_x = reco_result.core_x
            reco_core_y = reco_result.core_y

            # Height of shower maximum
            h_max = reco_result.h_max
            # TODO: add conversion to number of radiation lengths; needs an atmosphere profile

            reco_energy = np.nan
            reco_energy_tel = dict()

            # Not optimal: this requires two loops over the telescopes
            # For energy estimation
            if args.estimate_energy is True:
                weight_tel = np.zeros(len(hillas_dict.keys()))
                energy_tel = np.zeros(len(hillas_dict.keys()))

                for idx, tel_id in enumerate(hillas_dict.keys()):
                    cam_id = event.inst.subarray.tel[tel_id].camera.cam_id
                    moments = hillas_dict[tel_id]
                    model = regressor.model_dict[cam_id]

                    features_img = np.array([
                        np.log10(moments.intensity),
                        np.log10(impact_dict[tel_id].value),
                        moments.width.value,
                        moments.length.value,
                        h_max.value,
                    ])

                    energy_tel[idx] = model.predict([features_img])
                    weight_tel[idx] = moments.intensity
                    reco_energy_tel[tel_id] = energy_tel[idx]

                reco_energy = np.sum(weight_tel * energy_tel) / sum(weight_tel)
            else:
                for idx, tel_id in enumerate(hillas_dict.keys()):
                    reco_energy_tel[tel_id] = np.nan

            for idx, tel_id in enumerate(hillas_dict.keys()):
                cam_id = event.inst.subarray.tel[tel_id].camera.cam_id

                if cam_id not in feature_events:
                    feature_table[cam_id] = feature_outfile.create_table(
                        "/", "_".join(["feature_events", cam_id]),
                        EventFeatures)
                    feature_events[cam_id] = feature_table[cam_id].row

                moments = hillas_dict[tel_id]
                ellipticity = moments.width / moments.length

                # Write to file also the Hillas parameters that have been used
                # to calculate reco_results

                moments_reco = hillas_dict_reco[tel_id]
                ellipticity_reco = moments_reco.width / moments_reco.length

                feature_events[cam_id]["impact_dist"] = (
                    impact_dict[tel_id].to("m").value)
                feature_events[cam_id]["sum_signal_evt"] = tot_signal
                feature_events[cam_id]["max_signal_cam"] = max_signals[tel_id]
                feature_events[cam_id]["sum_signal_cam"] = moments.intensity
                feature_events[cam_id]["N_LST"] = n_tels["LST"]
                feature_events[cam_id]["N_MST"] = n_tels["MST"]
                feature_events[cam_id]["N_SST"] = n_tels["SST"]
                feature_events[cam_id]["width"] = moments.width.to("m").value
                feature_events[cam_id]["length"] = moments.length.to("m").value
                feature_events[cam_id]["psi"] = moments.psi.to("deg").value
                feature_events[cam_id]["skewness"] = moments.skewness
                feature_events[cam_id]["kurtosis"] = moments.kurtosis
                feature_events[cam_id]["h_max"] = h_max.to("m").value
                feature_events[cam_id]["err_est_pos"] = np.nan
                feature_events[cam_id]["err_est_dir"] = np.nan
                feature_events[cam_id]["mc_energy"] = event.mc.energy.to(
                    "TeV").value
                feature_events[cam_id]["local_distance"] = moments.r.to(
                    "m").value
                feature_events[cam_id]["n_pixel"] = n_pixel_dict[tel_id]
                feature_events[cam_id]["obs_id"] = event.r0.obs_id
                feature_events[cam_id]["event_id"] = event.r0.event_id
                feature_events[cam_id]["tel_id"] = tel_id
                feature_events[cam_id]["xi"] = xi.to("deg").value
                feature_events[cam_id]["reco_energy"] = reco_energy
                feature_events[cam_id]["ellipticity"] = ellipticity.value
                feature_events[cam_id]["n_cluster"] = n_cluster_dict[tel_id]
                feature_events[cam_id]["n_tel_reco"] = n_tels["reco"]
                feature_events[cam_id]["n_tel_discri"] = n_tels["discri"]
                feature_events[cam_id]["mc_core_x"] = event.mc.core_x.to(
                    "m").value
                feature_events[cam_id]["mc_core_y"] = event.mc.core_y.to(
                    "m").value
                feature_events[cam_id]["reco_core_x"] = reco_core_x.to(
                    "m").value
                feature_events[cam_id]["reco_core_y"] = reco_core_y.to(
                    "m").value
                feature_events[cam_id][
                    "mc_h_first_int"] = event.mc.h_first_int.to("m").value
                feature_events[cam_id]["offset"] = offset.to("deg").value
                feature_events[cam_id][
                    "mc_x_max"] = event.mc.x_max.value  # g / cm2
                feature_events[cam_id]["alt"] = reco_result.alt.to("deg").value
                feature_events[cam_id]["az"] = reco_result.az.to("deg").value
                feature_events[cam_id]["reco_energy_tel"] = reco_energy_tel[
                    tel_id]
                # Variables from hillas_dist_reco
                feature_events[cam_id][
                    "ellipticity_reco"] = ellipticity_reco.value
                feature_events[cam_id][
                    "local_distance_reco"] = moments_reco.r.to("m").value
                feature_events[cam_id]["skewness_reco"] = moments_reco.skewness
                feature_events[cam_id]["kurtosis_reco"] = moments_reco.kurtosis
                feature_events[cam_id]["width_reco"] = moments_reco.width.to(
                    "m").value
                feature_events[cam_id]["length_reco"] = moments_reco.length.to(
                    "m").value
                feature_events[cam_id]["psi_reco"] = moments_reco.psi.to(
                    "deg").value
                feature_events[cam_id][
                    "sum_signal_cam_reco"] = moments_reco.intensity

                feature_events[cam_id].append()

            if signal_handler.stop:
                break
        if signal_handler.stop:
            break
    # make sure that all the events are properly stored
    for table in feature_table.values():
        table.flush()

    img_cutflow()
    evt_cutflow()
Example #10
def main():

    # Read arguments
    parser = argparse.ArgumentParser(
        description="Build model for regression/classification")
    parser.add_argument("--config_file", type=str, required=True)
    parser.add_argument(
        "--max_events",
        type=int,
        default=-1,
        help="maximum number of events for training",
    )
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--wave",
        dest="mode",
        action="store_const",
        const="wave",
        default="tail",
        help="if set, use wavelet cleaning",
    )
    mode_group.add_argument(
        "--tail",
        dest="mode",
        action="store_const",
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets",
    )
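    # With the mutually exclusive group above, args.mode resolves to
    # "tail" when no flag is given (the default declared on --wave),
    # "wave" for --wave and "tail" for --tail.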
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Type of model (regression or classification)
    model_type = cfg["General"]["model_type"]

    # Import parameters
    data_dir = cfg["General"]["data_dir"]
    outdir = cfg["General"]["outdir"]
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    cam_ids = cfg["General"]["cam_id_list"]
    table_name_template = cfg["General"]["table_name_template"]
    table_name = [table_name_template + cam_id for cam_id in cam_ids]

    # List of features
    feature_list = cfg["FeatureList"]

    # Optimisation parameters
    method_name = cfg["Method"]["name"]
    tuned_parameters = [cfg["Method"]["tuned_parameters"]]
    scoring = "explained_variance"
    cv = cfg["Method"]["cv"]

    # Split fraction
    train_fraction = cfg["Split"]["train_fraction"]

    if model_type == "regressor":
        data_file = cfg["General"]["data_file"].format(args.mode)
        filename = path.join(data_dir, data_file)

        # List of cuts
        cuts = make_cut_list(cfg["SigFiducialCuts"])
        init_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None))

        # Name of target
        target_name = cfg["Method"]["target_name"]

    elif model_type == "classifier":
        data_sig_file = cfg["General"]["data_sig_file"].format(args.mode)
        data_bkg_file = cfg["General"]["data_bkg_file"].format(args.mode)
        filename_sig = path.join(data_dir, data_sig_file)
        filename_bkg = path.join(data_dir, data_bkg_file)

        # List of cuts
        sig_cuts = make_cut_list(cfg["SigFiducialCuts"])
        bkg_cuts = make_cut_list(cfg["BkgFiducialCuts"])

        # Model
        if method_name == "AdaBoostClassifier":
            init_model = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=4))
        elif method_name == "RandomForestClassifier":
            init_model = RandomForestClassifier(
                n_estimators=500,
                max_depth=None,
                min_samples_split=0.05,
                max_features="sqrt",
                bootstrap=True,
                random_state=None,
                criterion="gini",
                class_weight=
                "balanced_subsample",  # Reweight events for each tree
            )
        use_same_number_of_sig_and_bkg_for_training = cfg["Split"][
            "use_same_number_of_sig_and_bkg_for_training"]

    print("### Using {} for model construction".format(method_name))

    models = dict()
    for idx, cam_id in enumerate(cam_ids):

        print("### Building model for {}".format(cam_id))

        if model_type == "regressor":
            # Load data
            data = pd.read_hdf(filename, table_name[idx], mode="r")
            data = prepare_data(ds=data, cuts=cuts)[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name=target_name,
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(data_sig=data, train_fraction=train_fraction)
            print("Training sample: sig {}".format(len(factory.data_train)))
            print("Test sample: sig {}".format(len(factory.data_test)))
        elif model_type == "classifier":
            # Load data
            data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r")
            data_bkg = pd.read_hdf(filename_bkg, table_name[idx], mode="r")

            # Add label
            data_sig = prepare_data(ds=data_sig, label=1, cuts=sig_cuts)
            data_bkg = prepare_data(ds=data_bkg, label=0, cuts=bkg_cuts)

            data_sig = data_sig[0:args.max_events]
            data_bkg = data_bkg[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name="label",
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(
                data_sig=data_sig,
                data_bkg=data_bkg,
                train_fraction=train_fraction,
                force_same_nsig_nbkg=
                use_same_number_of_sig_and_bkg_for_training,
            )

            print("Training sample: sig {} and bkg {}".format(
                len(factory.data_train.query("label==1")),
                len(factory.data_train.query("label==0")),
            ))
            print("Test sample: sig {} and bkg {}".format(
                len(factory.data_test.query("label==1")),
                len(factory.data_test.query("label==0")),
            ))

        # Build model
        best_model = factory.get_optimal_model(init_model,
                                               tuned_parameters,
                                               scoring=scoring,
                                               cv=cv)

        if model_type == "classifier":
            # print classification report
            print(
                classification_report(
                    factory.data_scikit["y_test"],
                    best_model.predict(factory.data_scikit["X_test"]),
                ))

            # Calibrate model if necessary on test data
            if cfg["Method"]["calibrate_output"] is True:
                print("==> Calibrate classifier...")

                best_model = CalibratedClassifierCV(best_model,
                                                    method="sigmoid",
                                                    cv="prefit")

                best_model.fit(factory.data_scikit["X_test"],
                               factory.data_scikit["y_test"])
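                # With cv="prefit", CalibratedClassifierCV assumes the
                # wrapped model is already fitted and only fits the sigmoid
                # (Platt) calibration on the data passed to fit(); here the
                # test split is reused for that purpose.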

        # save model
        models[cam_id] = best_model
        outname = "{}_{}_{}_{}.pkl.gz".format(model_type, args.mode, cam_id,
                                              method_name)
        joblib.dump(best_model, path.join(outdir, outname))

        # save data
        save_obj(
            factory.data_scikit,
            path.join(
                outdir,
                "data_scikit_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id),
            ),
        )
        factory.data_train.to_pickle(
            path.join(
                outdir,
                "data_train_{}_{}_{}_{}.pkl.gz".format(model_type, method_name,
                                                       args.mode, cam_id),
            ))
        factory.data_test.to_pickle(
            path.join(
                outdir,
                "data_test_{}_{}_{}_{}.pkl.gz".format(model_type, method_name,
                                                      args.mode, cam_id),
            ))
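    # The persisted models can be read back with joblib, which handles the
    # gzip compression implied by the ".pkl.gz" suffix.  A minimal sketch
    # (the filename assumes model_type="regressor", mode="tail",
    # cam_id="LSTCam", method_name="AdaBoostRegressor"):
    #
    #     import joblib
    #
    #     model = joblib.load("regressor_tail_LSTCam_AdaBoostRegressor.pkl.gz")
    #     y_pred = model.predict(X)  # X: same features, same sorted order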
Example #11
def main():

    # Read arguments
    parser = argparse.ArgumentParser(
        description='Build model for regression/classification')
    parser.add_argument('--config_file', type=str, required=True)
    parser.add_argument('--max_events',
                        type=int,
                        default=-1,
                        help="maximum number of events for training")
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument('--wave',
                            dest="mode",
                            action='store_const',
                            const="wave",
                            default="tail",
                            help="if set, use wavelet cleaning")
    mode_group.add_argument(
        '--tail',
        dest="mode",
        action='store_const',
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets")
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Type of model (regression or classification)
    model_type = cfg['General']['model_type']

    # Import parameters
    data_dir = cfg['General']['data_dir']
    outdir = cfg['General']['outdir']
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    cam_ids = cfg['General']['cam_id_list']
    table_name_template = cfg['General']['table_name_template']
    table_name = [table_name_template + cam_id for cam_id in cam_ids]

    # List of features
    feature_list = cfg['FeatureList']

    # Optimisation parameters
    method_name = cfg['Method']['name']
    tuned_parameters = [cfg['Method']['tuned_parameters']]
    scoring = 'explained_variance'
    cv = cfg['Method']['cv']

    # Split fraction
    train_fraction = cfg['Split']['train_fraction']

    if model_type == 'regressor':
        data_file = cfg['General']['data_file'].format(args.mode)
        filename = path.join(data_dir, data_file)

        # List of cuts
        cuts = make_cut_list(cfg['SigFiducialCuts'])
        init_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None))

        # Name of target
        target_name = cfg['Method']['target_name']

    elif model_type == 'classifier':
        data_sig_file = cfg['General']['data_sig_file'].format(args.mode)
        data_bkg_file = cfg['General']['data_bkg_file'].format(args.mode)
        filename_sig = path.join(data_dir, data_sig_file)
        filename_bkg = path.join(data_dir, data_bkg_file)

        # List of cuts
        sig_cuts = make_cut_list(cfg['SigFiducialCuts'])
        bkg_cuts = make_cut_list(cfg['BkgFiducialCuts'])

        # Model
        if method_name == 'AdaBoostClassifier':
            init_model = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=4))
        elif method_name == 'RandomForestClassifier':
            init_model = RandomForestClassifier(
                n_estimators=500,
                max_depth=None,
                min_samples_split=0.05,
                max_features='sqrt',
                bootstrap=True,
                random_state=None,
                criterion='gini',
                class_weight=
                'balanced_subsample'  # Reweight events for each tree
            )
        use_same_number_of_sig_and_bkg_for_training = cfg['Split'][
            'use_same_number_of_sig_and_bkg_for_training']

    print('### Using {} for model construction'.format(method_name))

    models = dict()
    for idx, cam_id in enumerate(cam_ids):

        print('### Building model for {}'.format(cam_id))

        if model_type == 'regressor':
            # Load data
            data = pd.read_hdf(filename, table_name[idx], mode='r')
            data = prepare_data(ds=data, cuts=cuts)[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name=target_name,
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(data_sig=data, train_fraction=train_fraction)
            print('Training sample: sig {}'.format(len(factory.data_train)))
            print('Test sample: sig {}'.format(len(factory.data_test)))
        elif model_type == 'classifier':
            # Load data
            data_sig = pd.read_hdf(filename_sig, table_name[idx], mode='r')
            data_bkg = pd.read_hdf(filename_bkg, table_name[idx], mode='r')

            # Add label
            data_sig = prepare_data(ds=data_sig, label=1, cuts=sig_cuts)
            data_bkg = prepare_data(ds=data_bkg, label=0, cuts=bkg_cuts)

            data_sig = data_sig[0:args.max_events]
            data_bkg = data_bkg[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name='label',
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(
                data_sig=data_sig,
                data_bkg=data_bkg,
                train_fraction=train_fraction,
                force_same_nsig_nbkg=use_same_number_of_sig_and_bkg_for_training
            )

            print('Training sample: sig {} and bkg {}'.format(
                len(factory.data_train.query('label==1')),
                len(factory.data_train.query('label==0'))))
            print('Test sample: sig {} and bkg {}'.format(
                len(factory.data_test.query('label==1')),
                len(factory.data_test.query('label==0'))))

        # Build model
        best_model = factory.get_optimal_model(init_model,
                                               tuned_parameters,
                                               scoring=scoring,
                                               cv=cv)

        if model_type == 'classifier':
            # print classification report
            print(
                classification_report(
                    factory.data_scikit['y_test'],
                    best_model.predict(factory.data_scikit['X_test'])))

            # Calibrate model if necessary on test data
            if cfg['Method']['calibrate_output'] is True:
                print('==> Calibrate classifier...')

                best_model = CalibratedClassifierCV(best_model,
                                                    method='sigmoid',
                                                    cv='prefit')

                best_model.fit(factory.data_scikit['X_test'],
                               factory.data_scikit['y_test'])

        # save model
        models[cam_id] = best_model
        outname = '{}_{}_{}_{}.pkl.gz'.format(model_type, args.mode, cam_id,
                                              method_name)
        joblib.dump(best_model, path.join(outdir, outname))

        # save data
        save_obj(
            factory.data_scikit,
            path.join(
                outdir, 'data_scikit_{}_{}_{}_{}.pkl.gz'.format(
                    model_type, method_name, args.mode, cam_id)))
        factory.data_train.to_pickle(
            path.join(
                outdir,
                'data_train_{}_{}_{}_{}.pkl.gz'.format(model_type, method_name,
                                                       args.mode, cam_id)))
        factory.data_test.to_pickle(
            path.join(
                outdir,
                'data_test_{}_{}_{}_{}.pkl.gz'.format(model_type, method_name,
                                                      args.mode, cam_id)))
Example #12
def main():
    # Read arguments
    parser = argparse.ArgumentParser(description="Make performance files")
    parser.add_argument("--config_file", type=str, required=True, help="")
    parser.add_argument(
        "--obs_time",
        type=str,
        required=True,
        help="Observation time, should be given as a string, value and astropy unit separated by an empty space",
    )
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--wave",
        dest="mode",
        action="store_const",
        const="wave",
        default="tail",
        help="if set, use wavelet cleaning",
    )
    mode_group.add_argument(
        "--tail",
        dest="mode",
        action="store_const",
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets",
    )
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Add obs. time in configuration file
    str_obs_time = args.obs_time.split()
    cfg["analysis"]["obs_time"] = {
        "value": float(str_obs_time[0]),
        "unit": str(str_obs_time[-1]),
    }
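    # For example, --obs_time "50 h" yields {"value": 50.0, "unit": "h"},
    # which can be rebuilt as an astropy quantity with u.Quantity(50.0, "h").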

    # Create output directory if necessary
    outdir = os.path.join(
        cfg["general"]["outdir"],
        "irf_{}_ThSq_{}_Time{:.2f}{}".format(
            args.mode,
            cfg["analysis"]["thsq_opt"]["type"],
            cfg["analysis"]["obs_time"]["value"],
            cfg["analysis"]["obs_time"]["unit"],
        ),
    )
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    indir = cfg["general"]["indir"]
    template_input_file = cfg["general"]["template_input_file"]

    # Load data
    particles = ["gamma", "electron", "proton"]
    evt_dict = dict()  # Contain DL2 file for each type of particle
    for particle in particles:
        # template looks like dl2_{}_{}_merged.h5
        infile = os.path.join(indir, template_input_file.format(args.mode, particle))
        evt_dict[particle] = pd.read_hdf(infile, key="reco_events")

    # Apply offset cut to proton and electron
    for particle in ["electron", "proton"]:
        # print('Initial stat: {} {}'.format(len(evt_dict[particle]), particle))
        evt_dict[particle] = evt_dict[particle].query(
            "offset <= {}".format(cfg["analysis"]["max_bg_radius"])
        )

    # Add required data in configuration file for future computation
    for particle in particles:
        cfg["particle_information"][particle]["n_files"] = \
            len(np.unique(evt_dict[particle]["obs_id"]))
        cfg["particle_information"][particle]["n_simulated"] = (
            cfg["particle_information"][particle]["n_files"]
            * cfg["particle_information"][particle]["num_showers"]
            * cfg["particle_information"][particle]["num_use"]
        )

    # Define model for the particles
    model_dict = {
        "gamma": CrabSpectrum("hegra").model,
        "proton": cosmic_ray_flux,
        "electron": cosmic_ray_flux,
    }

    # Reco energy binning
    cfg_binning = cfg["analysis"]["ereco_binning"]
    ereco = (
        np.logspace(
            np.log10(cfg_binning["emin"]),
            np.log10(cfg_binning["emax"]),
            cfg_binning["nbin"] + 1,
        )
        * u.TeV
    )

    # Handle theta square cut optimisation
    # (compute 68 % containment radius PSF if necessary)
    thsq_opt_type = cfg["analysis"]["thsq_opt"]["type"]
    if thsq_opt_type == "fixed":
        thsq_values = np.array([cfg["analysis"]["thsq_opt"]["value"]]) * u.deg
        print("Using fixed theta cut: {}".format(thsq_values))
    elif thsq_opt_type == "opti":
        thsq_values = np.arange(0.05, 0.40, 0.01) * u.deg
        print("Optimising theta cut for: {}".format(thsq_values))
    elif thsq_opt_type == "r68":
        print("Using R68% theta cut")
        print("Computing...")
        cfg_binning = cfg["analysis"]["ereco_binning"]
        ereco = (
            np.logspace(
                np.log10(cfg_binning["emin"]),
                np.log10(cfg_binning["emax"]),
                cfg_binning["nbin"] + 1,
            )
            * u.TeV
        )
        radius = 68
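        # i.e. the 68% containment radius of the gamma-ray PSF,
        # taken as the 68th percentile of the offset distribution below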

        thsq_values = list()
        for ibin in range(len(ereco) - 1):
            emin = ereco[ibin]
            emax = ereco[ibin + 1]

            energy_query = "reco_energy > {} and reco_energy <= {}".format(
                emin.value, emax.value
            )
            data = evt_dict["gamma"].query(energy_query).copy()

            min_stat = 0
            if len(data) <= min_stat:
                print("  ==> Not enough statistics; falling back to 0.3 deg")
                thsq_values.append(0.3)
                continue

            psf = np.percentile(data["offset"], radius)
            psf_err = psf / np.sqrt(len(data))  # currently unused

            thsq_values.append(psf)
        thsq_values = np.array(thsq_values) * u.deg
        # Enforce a lower bound of 0.05 deg
        idx = np.where(thsq_values.value < 0.05)
        thsq_values[idx] = 0.05 * u.deg
        print("Using theta cut: {}".format(thsq_values))

    # Cuts optimisation
    print("### Finding best cuts...")
    cut_optimiser = CutsOptimisation(config=cfg, evt_dict=evt_dict, verbose_level=0)

    # Weight events
    print("- Weighting events...")
    cut_optimiser.weight_events(
        model_dict=model_dict, colname_mc_energy=cfg["column_definition"]["mc_energy"]
    )

    # Find best cutoff to reach best sensitivity
    print("- Estimating cutoffs...")
    cut_optimiser.find_best_cutoff(energy_values=ereco, angular_values=thsq_values)

    # Save results and auxiliary data for diagnostic
    print("- Saving results to disk...")
    cut_optimiser.write_results(
        outdir, "{}.fits".format(cfg["general"]["output_table_name"]), format="fits"
    )

    # Cuts diagnostic
    print("### Building cut diagnostics...")
    cut_diagnostic = CutsDiagnostic(config=cfg, indir=outdir)
    cut_diagnostic.plot_optimisation_summary()
    cut_diagnostic.plot_diagnostics()

    # Apply cuts and save data
    print("### Applying cuts to data...")
    cut_applicator = CutsApplicator(config=cfg, evt_dict=evt_dict, outdir=outdir)
    cut_applicator.apply_cuts()

    # Irf Maker
    print("### Building IRF...")
    irf_maker = IrfMaker(config=cfg, evt_dict=evt_dict, outdir=outdir)
    irf_maker.build_irf()

    # Sensitivity maker
    print("### Estimating sensitivity...")
    sensitivity_maker = SensitivityMaker(config=cfg, outdir=outdir)
    sensitivity_maker.load_irf()
    sensitivity_maker.estimate_sensitivity()
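    # Example invocation (the script filename is an assumption):
    #
    #     python make_performance.py --config_file performance.yaml \
    #         --obs_time "50 h" --tail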