import numpy as np
import matplotlib.pyplot as plt
from hep_ml.reweight import GBReweighter

# Split into signal and background, drop the label column and NaN rows
signal_reweight_data_nan_s_dropped = signal_reweight_data_s_dropped.dropna(axis=0)
background_reweight_data = reweight_data_small.where(reweight_data['Signal'] == 0)
background_reweight_data_s_dropped = background_reweight_data.drop(['Signal'], axis=1)
background_reweight_data_nan_s_dropped = background_reweight_data_s_dropped.dropna(axis=0)

# Signal-to-background ratio, used below to normalise the learned weights
ratio = len(signal_reweight_data_nan_s_dropped) / len(background_reweight_data_nan_s_dropped)

reweighter = GBReweighter(n_estimators=40)
reweighter.fit(background_reweight_data_nan_s_dropped,  # original
               signal_reweight_data_nan_s_dropped)      # target
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)
total_weights = ratio * weights / np.mean(weights)
# reweighted_background = background_reweight_data.multiply(weights, axis=0)

fig_weight, ax_weight = plt.subplots(3, 2, figsize=(15, 15))
ax_weight[0, 0].hist(signal_reweight_data_nan_s_dropped.p_et_calo.ravel(),
                     bins=50, range=(0, 100000), color='r', alpha=0.5, label="Signal")
ax_weight[0, 0].hist(background_reweight_data_nan_s_dropped.p_et_calo.ravel(),
                     bins=50, range=(0, 100000), color='b', alpha=0.5,
                     label="Background")  # call was truncated here; arguments assumed to mirror the signal histogram
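The normalisation step above is worth sanity-checking: dividing by the mean makes the weights average to one, and multiplying by `ratio` scales the weighted background yield to the signal yield. A minimal check, assuming `total_weights` and the two dataframes from the snippet above:

import numpy as np

# After normalisation the weighted background yield equals the signal
# yield (up to floating-point error), since the weights average to `ratio`.
assert np.isclose(np.sum(total_weights), len(signal_reweight_data_nan_s_dropped))

# When histogramming the background, the weights must be passed explicitly, e.g.:
# ax.hist(background_reweight_data_nan_s_dropped.p_et_calo, weights=total_weights, ...)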
# Read the decay times from the LHCb simulation - I've serialised it here
import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from hep_ml.reweight import GBReweighter

print("reading pickle")
with open("mc_times.pickle", "rb") as f:
    mc_times = pickle.load(f)

# Generate some random numbers from an exponential distribution with the right decay constant
d_lifetime_ps = 0.49
N = len(mc_times)
print("gen times")
exp_times = np.random.exponential(d_lifetime_ps, N)

mc_train, mc_test, model_train, model_test = train_test_split(mc_times, exp_times)

bdt = GBReweighter()
print("Training bdt")
bdt.fit(original=model_train, target=mc_train)
weights = bdt.predict_weights(model_test)

kw = {"bins": np.linspace(0.0, 9.0, 100), "alpha": 0.3, "density": True}
plt.figure(figsize=(12.0, 9.0))
plt.hist(model_test, label="Original", **kw)
plt.hist(mc_test, label="Target", **kw)
plt.hist(model_test, label="Original Weighted", weights=weights, **kw)
plt.legend()
plt.xlabel("Time /ps")
plt.ylabel("Density")
plt.savefig("mwe.png")
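Beyond eyeballing the saved plot, the agreement can be quantified. A minimal sketch, assuming `mc_test`, `model_test` and `weights` from the script above; the bin edges and the simple squared-difference metric are my choices, not part of the original:

import numpy as np

bins = np.linspace(0.0, 9.0, 100)

target_hist, _ = np.histogram(mc_test, bins=bins, density=True)
before_hist, _ = np.histogram(model_test, bins=bins, density=True)
after_hist, _ = np.histogram(model_test, bins=bins, weights=weights, density=True)

# Mean squared difference of the binned densities; it should shrink
# once the GBReweighter weights are applied.
print("before:", np.mean((before_hist - target_hist) ** 2))
print("after: ", np.mean((after_hist - target_hist) ** 2))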
# (snippet begins mid-statement; the assignment is assumed to be a roc_curve call before reweighting)
fpr_pre, tpr_pre = roc_curve(subtest[['isE']], test_proba, sample_weight=subtest.weight)[:2]
auc_pre = roc_auc_score(subtest[['isE']], test_proba, sample_weight=subtest.weight)

# run reweighting -- not working on MC for some reason
reweighter = GBReweighter(n_estimators=1 if debug else 30, max_depth=4, learning_rate=0.1)
reweighter.fit(subtrain[subtrain.isE == 1][reweight_feats],
               subtrain[subtrain.isE == 0][reweight_feats])  # make electrons look like tracks

# run weights FOR EVERYTHING!
for df in [data, subtrain, subtest]:
    weights = reweighter.predict_weights(df[df.isE == 1][reweight_feats])  # 1/w to be used
    df.loc[df.isE == 1, 'weight'] = weights

# save reweighter
joblib.dump(reweighter, reweight_model_file, compress=True)

# Check that separation vanishes
post_separation = GradientBoostingClassifier(n_estimators=1 if debug else 50,
                                             max_depth=4, random_state=42, verbose=1)
post_separation.fit(subtrain[reweight_feats], subtrain[['isE']],
                    sample_weight=subtrain.weight)
test_proba = post_separation.predict_proba(subtest[reweight_feats])[:, 1]
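The natural figure of merit to close the loop is the post-reweighting AUC, computed exactly as `auc_pre` above; a value near 0.5 means the classifier can no longer tell the reweighted electrons from tracks. A sketch under the same variable names:

from sklearn.metrics import roc_auc_score

auc_post = roc_auc_score(subtest[['isE']], test_proba, sample_weight=subtest.weight)
print("AUC before reweighting: {:.3f}".format(auc_pre))
print("AUC after reweighting:  {:.3f}".format(auc_post))
# auc_post ~ 0.5 indicates the kinematic separation has been removed.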
pickle.dump(pickle_file, handle)
print("GBReweighter saved to {}".format(options.Save))

# Plotting if applicable
if options.Plots:
    # Prediction of the training data for comparison
    print("Predicting with the just trained model")
    original_weight_distribution_test = original_test[original_weights] if original_weights is not None else np.ones(len(original_test))
    original_weight_distribution_train = original_train[original_weights] if original_weights is not None else np.ones(len(original_train))
    target_weight_distribution = target[target_weights] if target_weights is not None else np.ones(len(target))
    calculated_weights_test = gb.predict_weights(original_test[reweighting_branches], original_weight_distribution_test)
    calculated_weights_train = gb.predict_weights(original_train[reweighting_branches], original_weight_distribution_train)

    # Some output about the calculated weights:
    # normalise to the number of original entries
    n_entries_test = len(original_test)
    calculated_weights_scaled_test = calculated_weights_test * n_entries_test / np.sum(calculated_weights_test)
    print("Entries in original (test-)dataset: {}".format(n_entries_test))
    print("Sum of calculated weights: {}".format(np.sum(calculated_weights_scaled_test)))
    max_weight = np.max(calculated_weights_scaled_test)
    print("Maximum weight: {} ({:%} of entries)".format(max_weight, max_weight / n_entries_test))

    # Normalisation for training as well
    calculated_weights_scaled_train = calculated_weights_train * len(original_train) / np.sum(calculated_weights_train)
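The plotting this branch presumably leads into would compare each reweighting branch before and after applying the scaled weights. A minimal sketch, assuming the variables defined above; the binning and output file names are illustrative, not from the original:

import matplotlib.pyplot as plt

for branch in reweighting_branches:
    plt.figure()
    kw = {"bins": 50, "alpha": 0.4, "density": True}
    plt.hist(target[branch], weights=target_weight_distribution, label="target", **kw)
    plt.hist(original_test[branch], weights=original_weight_distribution_test, label="original", **kw)
    plt.hist(original_test[branch], weights=calculated_weights_scaled_test, label="reweighted", **kw)
    plt.legend()
    plt.xlabel(branch)
    plt.savefig("reweight_{}.png".format(branch))  # illustrative file name
    plt.close()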