Ejemplo n.º 1
0
def evaluateModels(**parsed_args):
    """Train reweighting models and compare reweighted simulation to pseudo truth.

    The function loads (pseudo) data and simulation samples, trains the
    requested model ``nrun`` times, reweights the simulation with each trained
    model, and for every observable logs chi2 / triangular-discriminator / KS
    figures of merit and produces comparison plots in the output directory.

    Keys read from ``parsed_args``:
        observable_config: path passed to ``read_dict_from_json``.
        observables: iterable of observable names (keys of the config).
        weight: name of the event-weight variable given to ``DataHandler``.
        data: list of (pseudo) data file names.
        signal: list of simulation file names.
        reweight_data: ``rw_type`` forwarded to the data weights and to
            ``get_training_inputs``.
        model_name: model identifier; ``'pfn'`` switches to a hard-coded
            list of truth-level input variables.
        outputdir: directory under which models and figures are written.
        nrun: number of independent trainings (must be at least 1).
        batch_size, load_model: forwarded to ``train_model``.
        binning_config: forwarded to ``get_bins``.

    Raises:
        ValueError: if ``nrun`` is smaller than 1, since at least one set of
            reweighted weights is needed for the comparisons below.
    """
    logger = logging.getLogger('EvalModel')

    # log arguments
    for argkey, argvalue in sorted(parsed_args.items()):
        if argvalue is None:
            continue
        logger.info('Argument {}: {}'.format(argkey, argvalue))

    # Fail early with a clear message instead of an IndexError on
    # weights_rw[0] after all the data have been loaded.
    nrun = parsed_args['nrun']
    if nrun < 1:
        raise ValueError(
            "'nrun' must be at least 1, got {}".format(nrun))

    #################
    # Variables
    #################
    observable_dict = read_dict_from_json(parsed_args['observable_config'])
    observables = parsed_args['observables']

    logger.info("Features used in training: {}".format(
        ', '.join(observables)))
    # detector level
    vars_det = [observable_dict[key]['branch_det'] for key in observables]
    # truth level
    vars_mc = [observable_dict[key]['branch_mc'] for key in observables]

    # event weights
    wname = parsed_args['weight']

    #################
    # Load data
    #################
    logger.info("Loading data")

    fnames_d = parsed_args['data']
    logger.info("(Pseudo) data files: {}".format(' '.join(fnames_d)))
    dataHandle = DataHandler(fnames_d,
                             wname,
                             variable_names=vars_det + vars_mc)
    logger.info("Total number of pseudo data events: {}".format(
        dataHandle.get_nevents()))

    fnames_s = parsed_args['signal']
    logger.info("Simulation files: {}".format(' '.join(fnames_s)))
    simHandle = DataHandler(fnames_s, wname, variable_names=vars_det + vars_mc)
    logger.info("Total number of simulation events: {}".format(
        simHandle.get_nevents()))

    #################
    # Event weights
    #################
    # pseudo data weights
    w_d = dataHandle.get_weights(rw_type=parsed_args['reweight_data'],
                                 vars_dict=observable_dict)

    # prior simulation weights
    w_s = simHandle.get_weights()

    # normalize simulation weights to pseudo data
    # NOTE(review): this scales w_s in place; whether that also affects
    # arrays held by simHandle depends on get_weights() -- confirm.
    ndata = w_d.sum()
    nsim = w_s.sum()
    w_s *= ndata / nsim

    #################
    # Input datasets
    #################
    # Training arrays
    # Truth level

    # FIXME hard code input variables for pfn for now
    if parsed_args['model_name'] == 'pfn':
        vars_mc = [['th_pt_MC', 'th_y_MC', 'th_phi_MC', 'th_e_MC'],
                   ['tl_pt_MC', 'tl_y_MC', 'tl_phi_MC', 'tl_e_MC']]

    X, Y, w = get_training_inputs(vars_mc,
                                  dataHandle,
                                  simHandle,
                                  rw_type=parsed_args['reweight_data'],
                                  vars_dict=observable_dict)

    # Split into training, validation, and test sets: 75%, 15%, 10%
    # (25% held out first, of which 40% becomes the test set).
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        X, Y, w, test_size=0.25)
    X_val, X_test, Y_val, Y_test, w_val, w_test = train_test_split(
        X_test, Y_test, w_test, test_size=0.4)

    #################
    # Train model and reweight simulation
    #################
    # Events whose label argmax is 0 are the ones that get reweighted; the
    # selection does not depend on the run, so compute it once.
    X_prior = X[np.argmax(Y, axis=1) == 0]

    weights_rw = []
    for i in range(nrun):
        logger.info("RUN {}".format(i))

        model_dir = os.path.join(parsed_args['outputdir'],
                                 'Models_{}'.format(i))

        model = train_model((X_train, Y_train, w_train), (X_val, Y_val, w_val),
                            (X_test, Y_test, w_test),
                            model_name=parsed_args['model_name'],
                            model_dir=model_dir,
                            batch_size=parsed_args['batch_size'],
                            load_model=parsed_args['load_model'])

        # Reweight simulation to the truth in pseudo data
        # reweighting factors
        lr = reweight(model, X_prior)

        logger.info("Plot distribution of reweighitng factors")
        fname_hlr = os.path.join(model_dir, 'rhist')
        plotting.plot_LR_distr(fname_hlr, [lr])

        # New weights for simulation
        weights_rw.append(w_s * lr)

    #################
    # Compare reweighted simulation prior to pseudo truth
    #################
    # The first run is used for the per-observable summary numbers and plots.
    w_s_rw = weights_rw[0]

    for varname in observables:
        logger.info(varname)
        bins = get_bins(varname, parsed_args['binning_config'])
        vname_mc = observable_dict[varname]['branch_mc']

        # pseudo truth
        hist_truth, hist_truth_err = dataHandle.get_histogram(
            vname_mc, w_d, bins)

        # simulation prior
        hist_prior, hist_prior_err = simHandle.get_histogram(
            vname_mc, w_s, bins)

        # reweighted simulation distributions (one per run)
        hists_rw, hists_rw_err = simHandle.get_histogram(
            vname_mc, weights_rw, bins)

        # plot the first reweighted distribution
        assert len(hists_rw) > 0, "no reweighted histograms were produced"
        hist_rw = hists_rw[0]
        hist_rw_err = hists_rw_err[0]

        # plot histograms and their ratio
        figname = os.path.join(parsed_args['outputdir'],
                               'Reweight_{}'.format(varname))
        logger.info("Plot reweighted distribution: {}".format(figname))

        # Compute chi2s. Each compared histogram is paired with its own
        # uncertainty: the prior uses hist_prior_err, not the truth error.
        text_chi2 = write_chi2(hist_truth,
                               hist_truth_err, [hist_rw, hist_prior],
                               [hist_rw_err, hist_prior_err],
                               labels=['Reweighted', 'Prior'])
        logger.info("  " + "    ".join(text_chi2))

        # Compute triangular discriminator
        text_tria = write_triangular_discriminators(
            hist_truth, [hist_rw, hist_prior], labels=['Reweighted', 'Prior'])
        logger.info("  " + "    ".join(text_tria))

        # Compute KS test statistic
        arr_truth = dataHandle.get_variable_arr(vname_mc)
        arr_sim = simHandle.get_variable_arr(vname_mc)
        text_ks = write_ks(arr_truth,
                           w_d, [arr_sim, arr_sim], [w_s_rw, w_s],
                           labels=['Reweighted', 'Prior'])

        logger.info("  " + "    ".join(text_ks))

        plotting.plot_results(bins, (hist_prior, hist_prior_err),
                              (hist_rw, hist_rw_err),
                              histogram_truth=(hist_truth, hist_truth_err),
                              figname=figname,
                              texts=text_ks,
                              **observable_dict[varname])

        # plot all trials
        if len(hists_rw) > 1:
            figname_all = os.path.join(parsed_args['outputdir'],
                                       'Reweight_{}_allruns'.format(varname))
            plotting.plot_hists_resamples(figname_all, bins, hists_rw,
                                          hist_prior, hist_truth,
                                          **observable_dict[varname])

        # plot the distribution of KS test statistic over all runs;
        # element 0 of the ks_2samp_weighted result is taken as the statistic
        ks_list = [
            ks_2samp_weighted(arr_truth, arr_sim, w_d, rw_s)[0]
            for rw_s in weights_rw
        ]
        hist_ks, bins_ks = np.histogram(ks_list)
        fname_ks = os.path.join(parsed_args['outputdir'],
                                'KSDistr_{}'.format(varname))
        plotting.plot_histograms1d(fname_ks, bins_ks, [hist_ks], xlabel="KS")