def evaluateModels(**parsed_args):
    """Train reweighting model(s) and validate them against pseudo data.

    Loads (pseudo) data and simulation samples, trains ``nrun`` independent
    classifiers to reweight the simulation truth distribution to the pseudo
    data truth, then, for every observable, compares the prior and the
    reweighted simulation histograms to the pseudo truth (chi2, triangular
    discriminator, weighted KS test) and writes plots to ``outputdir``.

    Expected keys in ``parsed_args`` (others are only logged):
    observable_config, observables, weight, data, signal, reweight_data,
    model_name, nrun, outputdir, batch_size, load_model, binning_config.
    """
    logger = logging.getLogger('EvalModel')

    # Log all non-None arguments for reproducibility of the run
    for argkey, argvalue in sorted(parsed_args.items()):
        if argvalue is None:
            continue
        logger.info('Argument {}: {}'.format(argkey, argvalue))

    #################
    # Variables
    #################
    observable_dict = read_dict_from_json(parsed_args['observable_config'])

    logger.info("Features used in training: {}".format(
        ', '.join(parsed_args['observables'])))
    # detector-level branch names
    vars_det = [observable_dict[key]['branch_det']
                for key in parsed_args['observables']]
    # truth-level branch names
    vars_mc = [observable_dict[key]['branch_mc']
               for key in parsed_args['observables']]
    # name of the event-weight branch
    wname = parsed_args['weight']

    #################
    # Load data
    #################
    logger.info("Loading data")
    fnames_d = parsed_args['data']
    logger.info("(Pseudo) data files: {}".format(' '.join(fnames_d)))
    dataHandle = DataHandler(fnames_d, wname, variable_names=vars_det + vars_mc)
    logger.info("Total number of pseudo data events: {}".format(
        dataHandle.get_nevents()))

    fnames_s = parsed_args['signal']
    logger.info("Simulation files: {}".format(' '.join(fnames_s)))
    simHandle = DataHandler(fnames_s, wname, variable_names=vars_det + vars_mc)
    logger.info("Total number of simulation events: {}".format(
        simHandle.get_nevents()))

    ####
    #dataHandle = DataToy(1000000, 1, 1.5)
    #simHandle = DataToy(1000000, 0, 1)
    #vars_mc = ['x_truth']
    ####

    #################
    # Event weights
    #################
    # pseudo data weights (optionally reweighted to emulate a "true"
    # distribution different from the simulation prior)
    w_d = dataHandle.get_weights(rw_type=parsed_args['reweight_data'],
                                 vars_dict=observable_dict)
    # prior simulation weights
    w_s = simHandle.get_weights()
    # normalize the simulation yield to the pseudo-data yield
    ndata = w_d.sum()
    nsim = w_s.sum()
    w_s *= ndata / nsim

    #################
    # Input datasets
    #################
    # Training arrays at truth level
    # FIXME hard code input variables for pfn for now
    if parsed_args['model_name'] == 'pfn':
        vars_mc = [['th_pt_MC', 'th_y_MC', 'th_phi_MC', 'th_e_MC'],
                   ['tl_pt_MC', 'tl_y_MC', 'tl_phi_MC', 'tl_e_MC']]

    X, Y, w = get_training_inputs(vars_mc, dataHandle, simHandle,
                                  rw_type=parsed_args['reweight_data'],
                                  vars_dict=observable_dict)

    # Split into training, validation, and test sets: 75%, 15%, 10%
    # (0.25 held out, of which 60% -> validation and 40% -> test)
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        X, Y, w, test_size=0.25)
    X_val, X_test, Y_val, Y_test, w_val, w_test = train_test_split(
        X_test, Y_test, w_test, test_size=0.4)

    #################
    # Train model and reweight simulation
    #################
    weights_rw = []
    for i in range(parsed_args['nrun']):
        logger.info("RUN {}".format(i))
        model_dir = os.path.join(parsed_args['outputdir'],
                                 'Models_{}'.format(i))
        model = train_model((X_train, Y_train, w_train),
                            (X_val, Y_val, w_val),
                            (X_test, Y_test, w_test),
                            model_name=parsed_args['model_name'],
                            model_dir=model_dir,
                            batch_size=parsed_args['batch_size'],
                            load_model=parsed_args['load_model'])

        # Reweight simulation to the truth in pseudo data:
        # label 0 marks the simulation (prior) events; evaluate the
        # classifier-based likelihood-ratio factors on them.
        X_prior = X[np.argmax(Y, axis=1) == 0]
        lr = reweight(model, X_prior)
        logger.info("Plot distribution of reweighting factors")
        fname_hlr = os.path.join(model_dir, 'rhist')
        plotting.plot_LR_distr(fname_hlr, [lr])

        # New weights for simulation
        weights_rw.append(w_s * lr)

    #################
    # Compare reweighted simulation prior to pseudo truth
    #################
    # use the first run for the headline comparison
    w_s_rw = weights_rw[0]

    for varname in parsed_args['observables']:
        logger.info(varname)
        bins = get_bins(varname, parsed_args['binning_config'])
        vname_mc = observable_dict[varname]['branch_mc']

        # pseudo truth
        hist_truth, hist_truth_err = dataHandle.get_histogram(
            vname_mc, w_d, bins)

        # simulation prior
        hist_prior, hist_prior_err = simHandle.get_histogram(
            vname_mc, w_s, bins)

        # reweighted simulation distributions (one per run)
        hists_rw, hists_rw_err = simHandle.get_histogram(
            vname_mc, weights_rw, bins)

        # plot the first reweighted distribution
        assert (len(hists_rw) > 0)
        hist_rw = hists_rw[0]
        hist_rw_err = hists_rw_err[0]
        #hist_rw = np.mean(np.asarray(hists_rw), axis=0)
        #hist_rw_err = np.std(np.asarray(hists_rw), axis=0, ddof=1)

        # plot histograms and their ratio
        figname = os.path.join(parsed_args['outputdir'],
                               'Reweight_{}'.format(varname))
        logger.info("Plot reweighted distribution: {}".format(figname))

        # Compute chi2s
        # BUGFIX: the error list must parallel the histogram list
        # [hist_rw, hist_prior]; the 'Prior' entry previously passed
        # hist_truth_err instead of hist_prior_err.
        text_chi2 = write_chi2(hist_truth, hist_truth_err,
                               [hist_rw, hist_prior],
                               [hist_rw_err, hist_prior_err],
                               labels=['Reweighted', 'Prior'])
        logger.info(" " + " ".join(text_chi2))

        # Compute triangular discriminator
        text_tria = write_triangular_discriminators(
            hist_truth, [hist_rw, hist_prior],
            labels=['Reweighted', 'Prior'])
        logger.info(" " + " ".join(text_tria))

        # Compute KS test statistic (weighted, on the unbinned arrays)
        arr_truth = dataHandle.get_variable_arr(vname_mc)
        arr_sim = simHandle.get_variable_arr(vname_mc)
        text_ks = write_ks(arr_truth, w_d, [arr_sim, arr_sim],
                           [w_s_rw, w_s], labels=['Reweighted', 'Prior'])
        logger.info(" " + " ".join(text_ks))

        plotting.plot_results(bins, (hist_prior, hist_prior_err),
                              (hist_rw, hist_rw_err),
                              histogram_truth=(hist_truth, hist_truth_err),
                              figname=figname, texts=text_ks,
                              **observable_dict[varname])

        ####
        # plot all trials
        if len(hists_rw) > 1:
            figname_all = os.path.join(parsed_args['outputdir'],
                                       'Reweight_{}_allruns'.format(varname))
            plotting.plot_hists_resamples(figname_all, bins, hists_rw,
                                          hist_prior, hist_truth,
                                          **observable_dict[varname])

            # plot the distribution of the KS test statistic across runs
            # NOTE(review): original indentation was lost; this section is
            # placed inside the multi-run branch since a KS distribution
            # needs more than one run — confirm against upstream history.
            ks_list = []
            for rw_s in weights_rw:
                ks = ks_2samp_weighted(arr_truth, arr_sim, w_d, rw_s)[0]
                ks_list.append(ks)
            hist_ks, bins_ks = np.histogram(ks_list)
            fname_ks = os.path.join(parsed_args['outputdir'],
                                    'KSDistr_{}'.format(varname))
            plotting.plot_histograms1d(fname_ks, bins_ks, [hist_ks],
                                       xlabel="KS")