def test_training():
    x = 'tests/data/dilepton/QSFUP/X_train_10.npy'
    y = 'tests/data/dilepton/QSFUP/y_train_10.npy'
    x0 = 'tests/data/dilepton/QSFUP/X0_train_10.npy'
    x1 = 'tests/data/dilepton/QSFUP/X1_train_10.npy'
    print("Loaded existing datasets")
    estimator = RatioEstimator(n_hidden=(10, 10), activation="relu")
    estimator.train(
        method='carl',
        batch_size=1024,
        n_epochs=1,
        x=x,
        y=y,
        x0=x0,
        x1=x1,
        scale_inputs=True,
    )
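# Hedged follow-up sketch for the test above: sanity-check the trained
# estimator's outputs. This assumes evaluate() can be called on the in-memory
# estimator with the same .npy path used for training, as the evaluation
# scripts below do on a loaded model; it is not part of the original test.
import numpy as np

def check_estimator_outputs(estimator, x0_path):
    r_hat, s_hat = estimator.evaluate(x=x0_path)
    r_hat = np.asarray(r_hat)
    s_hat = np.asarray(s_hat)
    # the likelihood ratio must be finite and strictly positive
    # before per-event weights w = 1/r_hat are formed
    assert np.all(np.isfinite(r_hat)) and np.all(r_hat > 0)
    # s_hat is a classifier score, so it should live in [0, 1]
    assert np.all((0.0 <= s_hat) & (s_hat <= 1.0))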
if X0.shape[0] != Y0.shape[0]:
    print('problem when loading, #labels does not match #events, exit')
    exit(1)
if X0_eventnum is not None:
    if X0_eventnum.shape[0] != X0.shape[0]:
        print('problem when loading, #eventnumbers does not match #events, exit')
        exit(1)
else:
    # Some samples won't have an event number, but ATLAS ones should,
    # since we use it to propagate the weight to reco-level events.
    if DEBUG:
        print("No eventnumber found in dataset.")

# load model and evaluate weights:
carl = RatioEstimator()
if DEBUG:
    print('Loading model from:', model_out_path)
carl.load(model_out_path + '/carl/')
r_hat, s_hat = carl.evaluate(X0)
# prevent negative weights (should be rounding only):
r_hat = ensure_positive_weight(r_hat)
# prevent division by zero:
r_hat = force_nonzero(r_hat, zero_w_bound)
weights = 1. / r_hat
# ensure <weights> = 1 after cropping:
weights = weights * len(weights) / weights.sum()
maxweight = -1
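# Hedged sketch of the two helpers used above. Their real definitions are not
# part of this excerpt, so these are plausible stand-ins, not the repo's code.
import numpy as np

def ensure_positive_weight(r_hat):
    # clip tiny negative excursions (rounding artifacts) back up to zero
    return np.clip(np.asarray(r_hat), 0.0, None)

def force_nonzero(r_hat, bound):
    # lift ratios below `bound` up to `bound` so that 1/r_hat stays finite
    return np.maximum(np.asarray(r_hat), bound)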
p = opts.datapath
logger = logging.getLogger(__name__)
if os.path.exists('data/' + sample + '/' + var + '/X_train_' + str(n) + '.npy'):
    logger.info(
        " Doing evaluation of model trained with datasets: %s, generator variation: %s with %s events.",
        sample, var, n)
else:
    logger.info(
        " No datasets available for evaluation of model trained with datasets: %s, generator variation: %s with %s events.",
        sample, var, n)
    logger.info("ABORTING")
    sys.exit()
loading = Loader()
carl = RatioEstimator()
carl.load('models/' + sample + '/' + var + '_carl_' + str(n))
evaluate = ['train', 'val']
for i in evaluate:
    r_hat, _ = carl.evaluate(x='data/' + sample + '/' + var + '/X0_' + i + '_' + str(n) + '.npy')
    w = 1. / r_hat
    loading.load_result(
        x0='data/' + sample + '/' + var + '/X0_' + i + '_' + str(n) + '.npy',
        x1='data/' + sample + '/' + var + '/X1_' + i + '_' + str(n) + '.npy',
        weights=w,
        label=i,
        do=sample,
        var=var,
        plot=True,
        n=n,
    weightFeature=weightFeature,
    TreeName=treename,
    randomize=False,
    save=True,
    correlation=True,
    preprocessing=True,
    nentries=n,
    pathA=p + nominal + ".root",
    pathB=p + variation + ".root",
)
logger.info(" Loaded new datasets ")

#######################################
#######################################
# Estimate the likelihood ratio
estimator = RatioEstimator(n_hidden=(10, 10, 10), activation="relu")
estimator.train(
    method='carl',
    batch_size=1024,
    n_epochs=100,
    x=x,
    y=y,
    x0=x0,
    x1=x1,
    scale_inputs=True,
)
estimator.save('models/' + global_name + '_carl_' + str(n), x, metaData, export_model=True)
########################################
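# Hedged round-trip sketch: the evaluation and calibration excerpts below
# reload the model saved above from the same path prefix. Assuming save() and
# load() are symmetric, a reload-and-evaluate step looks like this
# (`subset` is a hypothetical parameter for choosing the train/val split):
def reload_and_evaluate(global_name, n, subset='val'):
    carl = RatioEstimator()
    carl.load('models/' + global_name + '_carl_' + str(n))
    r_hat, s_hat = carl.evaluate(x='data/' + global_name + '/X0_' + subset + '_' + str(n) + '.npy')
    # per-event CARL weights, formed the same way throughout these scripts
    return 1. / r_hat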
logger = logging.getLogger(__name__)
if os.path.exists('data/' + global_name + '/X_train_' + str(n) + '.npy') and os.path.exists('data/' + global_name + '/metaData_' + str(n) + '.pkl'):
    logger.info(
        " Doing calibration of model trained with datasets: [{},{}], with {} events."
        .format(nominal, variation, n))
else:
    logger.info(
        " No datasets available for calibration of model trained with datasets: [{},{}] with {} events."
        .format(nominal, variation, n))
    logger.info("ABORTING")
    sys.exit()
carl = RatioEstimator()
carl.load('models/' + global_name + '_carl_' + str(n))  # load trained model
evaluate = ['train']
X = 'data/' + global_name + '/X_train_' + str(n) + '.npy'
y = 'data/' + global_name + '/y_train_' + str(n) + '.npy'
w = 'data/' + global_name + '/w_train_' + str(n) + '.npy'
r_hat, s_hat = carl.evaluate(X)
calib = CalibratedClassifier(carl, global_name=global_name)
calib.fit(X=X, y=y, w=w)
p0, p1, r_cal = calib.predict(X=X)
w_cal = 1 / r_cal
loading.load_calibration(
    y_true=y,
    p1_raw=s_hat,
    p1_cal=p1,
###carl-torch inference###
# Get the weight from carl-torch (weightCT), evaluated with the same model
# used for carlAthena, on the ROOT file produced by carlAthena.
eventVarsCT = ['Njets', 'MET']
eventVarsCA = ['Njets', 'MET', 'weight']
jetVars = ['Jet_Pt', 'Jet_Mass']
lepVars = ['Lepton_Pt']
xCT, _ = load(f=p + '/test.root',
              events=eventVarsCT,
              jets=jetVars,
              leps=lepVars,
              n=int(n),
              t='Tree',
              do=sample)
xCT = xCT[sorted(xCT.columns)]
carl = RatioEstimator()
carl.load('models/' + sample + '/' + var + '_carl_2000001')
r_hat, s_hat = carl.evaluate(x=xCT.to_numpy())
weightCT = 1. / r_hat

###carlAthena inference###
# Load the sample with the weight inferred by carlAthena.
xCA, _ = load(f=p + '/test.root',
              events=eventVarsCA,
              jets=jetVars,
              leps=lepVars,
              n=int(n),
              t='Tree')
weightCA = xCA.weight

###compare weights###
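# Hedged sketch of the comparison step: the excerpt ends at the "compare
# weights" header, so this is one plausible check, not the repo's actual code.
import numpy as np

def compare_weights(weightCT, weightCA, rtol=1e-3):
    wct = np.asarray(weightCT).ravel()
    wca = np.asarray(weightCA).ravel()
    # per-event relative difference between the two inference paths
    rel_diff = np.abs(wct - wca) / np.abs(wca)
    print("max relative difference:", rel_diff.max())
    return np.allclose(wct, wca, rtol=rtol)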
logger = logging.getLogger(__name__)
if os.path.exists('data/' + global_name + '/X_train_' + str(n) + '.npy') and os.path.exists('data/' + global_name + '/metaData_' + str(n) + '.pkl'):
    logger.info(
        " Doing evaluation of model trained with datasets: [{},{}], with {} events."
        .format(nominal, variation, n))
else:
    logger.info(
        " No datasets available for evaluation of model trained with datasets: [{},{}] with {} events."
        .format(nominal, variation, n))
    logger.info("ABORTING")
    sys.exit()
loading = Loader()
carl = RatioEstimator()
if model:
    carl.load(model)
else:
    carl.load('models/' + global_name + '_carl_' + str(n))
evaluate = ['train', 'val']
for i in evaluate:
    print("<evaluate.py::__init__>:: Running evaluation for {}".format(i))
    r_hat, s_hat = carl.evaluate(x='data/' + global_name + '/X0_' + i + '_' + str(n) + '.npy')
    print("s_hat = {}".format(s_hat))
    print("r_hat = {}".format(r_hat))
    w = 1. / r_hat  # I thought r_hat = p_{1}(x) / p_{0}(x) ???
    print("w = {}".format(w))
    print("<evaluate.py::__init__>:: Loading Result for {}".format(i))
    loading.load_result(
    random_seed=random_seed)

# load samples into carl-torch format
loading = Loader_edb()
x, y, x0, x1 = loading.loading(x0=data_x0,
                               x1=data_x1,
                               save=True,
                               folder=data_out_path,
                               randomize=False,
                               random_seed=random_seed,
                               val_frac=0.25,
                               filter_outliers=True)
print("Loaded new datasets")

# now the carl-torch part
estimator = RatioEstimator(n_hidden=n_hidden, activation="relu")
# event number was popped earlier, as it should not be used for training
train_loss, val_loss = estimator.train(
    method='carl',
    batch_size=4096,
    n_epochs=n_epochs,
    x=x,
    y=y,
    x0=x0,
    x1=x1,
    scale_inputs=True,
    #early_stopping=True,
    #early_stopping_patience=10
)
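# Hedged sketch: train() above returns per-epoch loss curves, so a quick
# overfitting check can be plotted. matplotlib is an assumption here; the
# repo may use its own plotting helpers instead.
import matplotlib.pyplot as plt

def plot_loss_curves(train_loss, val_loss, out_path="loss_curves.png"):
    epochs = range(1, len(train_loss) + 1)
    plt.plot(epochs, train_loss, label="train")
    plt.plot(epochs, val_loss, label="validation")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig(out_path)
    plt.close()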
p = opts.datapath
loading = Loader()
logger = logging.getLogger(__name__)
if os.path.exists('data/' + sample + '/' + var + '/X_train_' + str(n) + '.npy'):
    logger.info(
        " Doing calibration of model trained with datasets: %s, generator variation: %s with %s events.",
        sample, var, n)
else:
    logger.info(
        " No datasets available for calibration of model trained with datasets: %s, generator variation: %s with %s events.",
        sample, var, n)
    logger.info("ABORTING")
    sys.exit()
carl = RatioEstimator()
carl.load('models/' + sample + '/' + var + '_carl_' + str(n))  # load trained model
evaluate = ['train']
X = 'data/' + sample + '/' + var + '/X_train_' + str(n) + '.npy'
y = 'data/' + sample + '/' + var + '/y_train_' + str(n) + '.npy'
r_hat, s_hat = carl.evaluate(X)
calib = CalibratedClassifier(carl)
calib.fit(X=X, y=y)
p0, p1, r_cal = calib.predict(X=X)
w_cal = 1 / r_cal
loading.load_calibration(
    y_true=y,
    p1_raw=s_hat,
    p1_cal=p1,
    label='calibrated',
    nentries=n,
    pathA=p + nominal + ".root",
    pathB=p + variation + ".root",
    noTar=True,
    normalise=False,
    debug=False,
)
logger.info(" Loaded new datasets ")

#######################################
#######################################
# Estimate the likelihood ratio using a NN model.
# Rudimentary architecture guess: five hidden layers, each three times
# as wide as the number of input variables.
structure = (len(features) * 3,) * 5
estimator = RatioEstimator(n_hidden=structure, activation="relu")
estimator.scaling_method = scale_method

# per-epoch plotting
intermediate_train_plot = None
intermediate_save = None
if per_epoch_plot:
    # arguments for the training and validation sets for loading.load_result
    train_args = {
        "x0": x0,
        "x1": x1,
        "w0": w0,
        "w1": w1,
        "metaData": metaData,
        "features": features,
        "label": "train",
# prevent division by zero:
# set this very low, since large weights are filtered separately
zero_w_bound = np.finfo(float).eps
# crop outlier weights more than N sigma from the average
crop_weight_sigma = 5.
# alternatively: crop the X% largest weights
crop_weight_perc = -1.

#-----------------------------------------------------------------------------
if not os.path.exists(out_csv_dir):
    os.makedirs(out_csv_dir)

carl = RatioEstimator()
carl.load(model_out_path + '/carl/')
evaluate = ['train', 'val']
for i in evaluate:
    x0 = data_out_path + '/X0_' + i + '.npy'
    r_hat, s_hat = carl.evaluate(x=x0)
    ## print('what is Carl returning?')
    ## r = r_hat[0]
    ## s = s_hat[0]
    ## print('r=p0/p1, s=p0/(p0+p1)')
    ## print(r, s, r/(1+r))
    ## print('r=p1/p0, s=p0/(p0+p1)')  # this
    ## print(r, s, 1/(1+r))
    ## print('r=p0/p1, s=p1/(p0+p1)')  # this
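# Hedged sketch of how the cropping knobs above could be applied; the actual
# cropping implementation is not part of this excerpt. The renormalisation
# matches the `weights * len(weights) / weights.sum()` step used earlier.
import numpy as np

def crop_weights(weights, n_sigma=5., perc=-1.):
    w = np.asarray(weights, dtype=float)
    if n_sigma > 0:
        # cap weights more than n_sigma standard deviations above the mean
        cap = w.mean() + n_sigma * w.std()
        w = np.minimum(w, cap)
    elif perc > 0:
        # alternatively, cap the largest perc% of weights at the (100-perc)th percentile
        cap = np.percentile(w, 100. - perc)
        w = np.minimum(w, cap)
    # renormalise so the average weight is 1 after cropping
    return w * len(w) / w.sum()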