def trainBDT(self, targetSample):
    """Train a GBReweighter from this sample to ``targetSample`` and pickle it.

    The training frames of both samples are reduced to the observables of
    this sample, the reweighter is fitted with the ``preweight`` column of
    each frame as per-event weight, and the fitted object is written to the
    path returned by ``self.gbrwPath()``.

    Parameters
    ----------
    targetSample
        Object exposing ``getTrainDataFrame()`` and ``observables`` in the
        same way as ``self``.

    Raises
    ------
    SystemExit
        With exit code -1 when the two samples do not declare the same set
        of observables.
    """
    originDF = self.getTrainDataFrame()
    targetDF = targetSample.getTrainDataFrame()

    # Materialise the keys once: a list prints identically on Python 2 and 3
    # and is a valid column selector for DataFrame indexing.
    originObservables = list(self.observables.keys())
    targetObservables = list(targetSample.observables.keys())

    # Compare as sets: two samples with the same observables declared in a
    # different order are compatible and must not abort the training.
    if set(originObservables) != set(targetObservables):
        print('Error observables for target and origin data sets do not match. Exiting...')
        print('Origin: {}'.format(originObservables))
        print('Target: {}'.format(targetObservables))
        raise SystemExit(-1)

    originPreWeights = originDF["preweight"]
    targetPreWeights = targetDF["preweight"]

    # Select the SAME column order for both frames. The reweighter consumes
    # the frames as plain arrays, so column i must be the same observable in
    # the origin and in the target.
    originDF = originDF[originObservables]
    targetDF = targetDF[originObservables]

    reweighter = GBReweighter(n_estimators=200,
                              learning_rate=.1,
                              max_depth=3,
                              min_samples_leaf=1000,
                              loss_regularization=1.0)
    reweighter.fit(original=originDF,
                   target=targetDF,
                   original_weight=originPreWeights,
                   target_weight=targetPreWeights)

    with open(self.gbrwPath(), "wb") as f:
        pickle.dump(reweighter, f)
def train_reweighter():
    """Fit a GBReweighter from the generated model to the selected data.

    The features are the phase-space variables plus the decay-time variable
    of the current mode, with every variable whose name contains ``phi``
    replaced by ``cosphi`` and ``sinphi`` (computed from ``phi1``). Each
    feature is mapped linearly so that its binning limits land on [2, 3]
    before training. The fitted reweighter is handed to
    ``bdt_utils.dump_reweighter``.
    """
    decay_time_vars = [gcm().ltime_var]
    variables = gcm().phsp_vars + decay_time_vars
    features = [var.var for var in variables if 'phi' not in var.var] + [
        'cosphi', 'sinphi'
    ]

    # Data of the current mode, extended with the phase-space variables.
    sample = gcm().get_data([var.var for var in decay_time_vars])
    add_variables.append_phsp(sample)
    sample['cosphi'] = np.cos(sample.phi1)
    sample['sinphi'] = np.sin(sample.phi1)

    passes = final_selection.get_final_selection()
    passes &= selection.delta_mass_signal_region()

    model = get_model()
    model['cosphi'] = np.cos(model.phi1)
    model['sinphi'] = np.sin(model.phi1)

    # Lower/upper limit per feature; the trigonometric ones span [-1, 1].
    ranges = {var.var: var.binning[1:] for var in variables}
    for trig in ('cosphi', 'sinphi'):
        ranges[trig] = (-1., 1)

    # Same affine map for data and model: limits -> [2, 3].
    for feature in features:
        low, high = ranges[feature]
        sample[feature] = (sample[feature] - low) / (high - low) + 2.
        model[feature] = (model[feature] - low) / (high - low) + 2.

    log.info('Training BDT reweighter for {}'.format(', '.join(features)))

    reweighter = GBReweighter(n_estimators=300,
                              max_depth=5,
                              learning_rate=0.2)
    model_events = model[features].sample(n=250000)
    data_events = sample[features][passes].sample(n=250000)
    reweighter.fit(original=model_events, target=data_events)

    bdt_utils.dump_reweighter(reweighter)
def gbreweight(weighttree, originaltree, name, variables, n=None):
    '''Use Hep_ml GBReweighter to calculate weights for weighttree to match
    originaltree in the given variables. Adds a friend with branch named
    'name' of length 2: the first element is the calculated weight, the second
    is the product of that weight with any existing weight used for the
    weighttree (from the selection).

    weighttree   -- tree that receives the weights (the distribution being
                    reweighted).
    originaltree -- tree whose distribution the weighted weighttree should
                    reproduce.
    name         -- name of the friend tree and of its branch.
    variables    -- variables handed to get_weights_and_vals and to
                    weighttree.get_functor_list.
    n            -- forwarded unchanged to get_weights_and_vals.
    '''

    from hep_ml.reweight import GBReweighter

    originalweights, originalvals = get_weights_and_vals(
        originaltree, variables, n)
    weightedweights, weightedvals = get_weights_and_vals(
        weighttree, variables, n)

    weighter = GBReweighter()
    print('Fit GBReweighter', name)
    # In hep_ml terms 'original' is the sample that gets reweighted and
    # 'target' is the sample it should look like. The weights are evaluated
    # on weighttree entries below, so weighttree must be the 'original' and
    # originaltree the 'target' (the previous call had them the other way
    # round, which yields weights for the wrong direction).
    weighter.fit(original=weightedvals,
                 original_weight=weightedweights,
                 target=originalvals,
                 target_weight=originalweights)

    weight = weighttree.selection_functor()
    vals = weighttree.get_functor_list(variables)

    def get_weight():
        # Reweighter multiplier for the current entry, and that multiplier
        # combined with the entry's existing selection weight.
        _w = weighter.predict_weights([vals()])[0]
        return [_w, _w * weight()]

    print('Add weights for GBReweighter', name)
    weighttree.add_friend_tree(name,
                               {name: dict(function=get_weight, length=2)})
# Reweighters that differ only in the number of boosting stages.
# NOTE: learning_rate, min_samples_leaf and loss_regularization are not
# passed (the params[...] arguments were disabled), so the library defaults
# apply.
reweighterEst20 = GBReweighter(n_estimators=20, max_depth=5)
reweighterEst40 = GBReweighter(n_estimators=40, max_depth=5)

log.info("Fitting weights...")

# Every estimator is trained on exactly the same inputs, so build the masks
# and the (n_events, n_features) matrices once instead of once per estimator.
_reweight_vars = ['pho_eta', 'pho_et', 'correctedScaledActualMu']
_original_mask = trainMask & (data_train["label"] < 0.5)
_target_mask = trainMask & (data_train["label"] >= 0.5)
_reweight_original = np.array(
    [data_train[_var][_original_mask] for _var in _reweight_vars]).T
_reweight_target = np.array(
    [data_train[_var][_target_mask] for _var in _reweight_vars]).T

# label < 0.5 events are reweighted towards the label >= 0.5 events.
for _estimator in (reweighterEst10, reweighterEst20, reweighterEst40):
    _estimator.fit(original=_reweight_original, target=_reweight_target)

log.info(f"Fitting of weights is done (time: {timedelta(seconds=time() - t)})")
# Split the sample into signal (Signal == 1) and background (Signal == 0).
# DataFrame.where() keeps the frame's shape and sets non-matching rows to NaN;
# the dropna(axis=0) calls below then remove those rows -- together with any
# row that already contained a NaN.
# NOTE(review): the masks are built from reweight_data, not from
# reweight_data_small; presumably both share the same index so that .where()
# aligns them -- confirm.
signal_reweight_data = reweight_data_small.where(reweight_data['Signal'] == 1)
signal_reweight_data_s_dropped = signal_reweight_data.drop(['Signal'], axis=1)
signal_reweight_data_nan_s_dropped = signal_reweight_data_s_dropped.dropna(
    axis=0)
background_reweight_data = reweight_data_small.where(
    reweight_data['Signal'] == 0)
background_reweight_data_s_dropped = background_reweight_data.drop(['Signal'],
                                                                   axis=1)
background_reweight_data_nan_s_dropped = background_reweight_data_s_dropped.dropna(
    axis=0)
# Signal-to-background event-count ratio after cleaning.
ratio = len(signal_reweight_data_nan_s_dropped) / len(
    background_reweight_data_nan_s_dropped)
# Positional arguments of fit() are (original, target): the background is
# reweighted so that it resembles the signal.
reweighter = GBReweighter(n_estimators=40)
reweighter.fit(background_reweight_data_nan_s_dropped,
               signal_reweight_data_nan_s_dropped)
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)
# Rescale the weights so that their mean equals `ratio`.
total_weights = ratio * weights / np.mean(weights)
#reweighted_background = background_reweight_data.multiply(weights, axis=0)
# 3x2 grid of comparison plots; the first panel starts with the signal
# p_et_calo distribution.
fig_weight, ax_weight = plt.subplots(3, 2, figsize=(15, 15))
ax_weight[0, 0].hist(signal_reweight_data_nan_s_dropped.p_et_calo.ravel(),
                     bins=50,
                     range=(0, 100000),
                     color='r',
                     alpha=0.5,
                     label="Signal")
# Train the pre-reweighting classifier on the reweighting features and score
# it on the held-out sample (weighted ROC curve and AUC).
pre_separation.fit(subtrain[reweight_feats],
                   subtrain[['isE']],
                   sample_weight=subtrain.weight)
test_proba = pre_separation.predict_proba(subtest[reweight_feats])[:, 1]
# [:2] keeps the first two arrays returned by roc_curve (fpr, tpr) and drops
# the thresholds.
roc_pre = roc_curve(subtest[['isE']], test_proba,
                    sample_weight=subtest.weight)[:2]
auc_pre = roc_auc_score(subtest[['isE']],
                        test_proba,
                        sample_weight=subtest.weight)
#run reweighting -- not working on MC for some reason
reweighter = GBReweighter(n_estimators=1 if debug else 30,
                          max_depth=4,
                          learning_rate=0.1)
# fit(original, target): isE == 1 rows are the original sample, isE == 0 rows
# the target.
# NOTE(review): no original_weight/target_weight is passed here, although the
# classifier above uses subtrain.weight -- confirm this is intended.
reweighter.fit(subtrain[subtrain.isE == 1][reweight_feats], subtrain[
    subtrain.isE == 0][reweight_feats])  #make electrons look like tracks
#run weights FOR EVERYTHING!
for df in [data, subtrain, subtest]:
    weights = reweighter.predict_weights(
        df[df.isE == 1][reweight_feats])  #1/w to be used
    # Overwrites the existing 'weight' values of the isE == 1 rows in place.
    df.loc[df.isE == 1, 'weight'] = weights
#save reweighter
joblib.dump(reweighter, reweight_model_file, compress=True)
# Check that separation vanishes
# (the constructor call below continues past the end of this excerpt)
post_separation = GradientBoostingClassifier(
    n_estimators=1 if debug else 50,
    max_depth=4,
    random_state=42,
# Directory two levels above this file; the MC ntuple is looked up below it.
str_to_parent_folder=str(Path(__file__).resolve().parent.parent)
tree = uproot.open(str_to_parent_folder+"/forward_MC/user.lehrke.mc16_13TeV.361106.Zee.EGAM8.e3601_e5984_s3126_r10724_r10726_p3648.ePID18_NTUP3_v01_myOutput.root/user.lehrke.17118381._000003.myOutput.root")[b'tree;1']
# Load the three reweighting variables plus the truth label into a DataFrame.
reweight_data = tree.pandas.df(["averageInteractionsPerCrossing","p_et_calo","p_eta","p_TruthType"])
# frac=1 without replacement: a reproducible shuffle of all rows.
reweight_data_small = reweight_data.sample(frac=1,replace=False, random_state=42)
# Rows with p_TruthType == 2 form the reference sample, all others the
# background. DataFrame.where() sets non-matching rows to NaN and the
# dropna(axis=0) calls remove them (and any row that already had a NaN).
p_TruthType_reweight_data = reweight_data_small.where(reweight_data_small['p_TruthType'] == 2)
p_TruthType_reweight_data_s_dropped = p_TruthType_reweight_data.drop(['p_TruthType'], axis=1)
p_TruthType_reweight_data_nan_s_dropped = p_TruthType_reweight_data_s_dropped.dropna(axis=0)
# NOTE(review): this mask comes from reweight_data while the one above comes
# from reweight_data_small; presumably equivalent because .where() aligns on
# the shared index -- confirm.
background_reweight_data = reweight_data_small.where(reweight_data['p_TruthType'] !=2)
background_reweight_data_s_dropped = background_reweight_data.drop(['p_TruthType'], axis=1)
background_reweight_data_nan_s_dropped =background_reweight_data_s_dropped.dropna(axis=0)
# Event-count ratio of the p_TruthType == 2 sample to the background sample.
ratio=len(p_TruthType_reweight_data_nan_s_dropped)/len(background_reweight_data_nan_s_dropped)
# Positional arguments of fit() are (original, target): the background is
# reweighted so that it resembles the p_TruthType == 2 sample.
reweighter = GBReweighter(n_estimators=40)
reweighter.fit(background_reweight_data_nan_s_dropped, p_TruthType_reweight_data_nan_s_dropped)
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)
# Rescale so that the mean weight equals `ratio`, then store one weight per
# background row; the output path is relative to the working directory.
total_weights=ratio*weights /np.mean(weights)
np.savetxt('../weights/weights_MC_03.csv', total_weights, delimiter=',')
#reweighted_background = background_reweight_data.multiply(weights, axis=0)
# The triple-quoted string below holds disabled plotting code; it is not
# closed within this excerpt.
""" fig_weight, ax_weight = plt.subplots(3,2, figsize=(15,15)) ax_weight[0,0].hist(p_TruthType_reweight_data_nan_s_dropped.p_et_calo.ravel(),bins=50,range=(0,100000), color = 'r', alpha = 0.5, label = "p_TruthType") ax_weight[0,0].hist(background_reweight_data_nan_s_dropped.p_et_calo.ravel(), bins=50,range=(0,100000), color = 'blue', alpha = 0.5, label = "Background") ax_weight[0,0].legend(loc="upper right") ax_weight[0,0].set_title('P_et_calo (before 
weight)')
n_estimators=17, #learning_rate=params['learning_rate'], max_depth=5, #min_samples_leaf=params['min_samples_leaf'], #loss_regularization=params['loss_regularization'] ) # Create weight estimators and fit them to the data log.info(f"Fitting weights...") reweighterEst10.fit( original=np.array([ data_train['muo_eta'][trainMask & (data_train["label"] < 0.5)], data_train['muo_pt'][trainMask & (data_train["label"] < 0.5)], data_train['correctedScaledAverageMu'][trainMask & (data_train["label"] < 0.5)] ]).T, target=np.array([ data_train['muo_eta'][trainMask & (data_train["label"] >= 0.5)], data_train['muo_pt'][trainMask & (data_train["label"] >= 0.5)], data_train['correctedScaledAverageMu'][trainMask & (data_train["label"] >= 0.5)] ]).T) reweighterEst20.fit( original=np.array([ data_train['muo_eta'][trainMask & (data_train["label"] < 0.5)], data_train['muo_pt'][trainMask & (data_train["label"] < 0.5)], data_train['correctedScaledAverageMu'][trainMask & (data_train["label"] < 0.5)] ]).T, target=np.array([ data_train['muo_eta'][trainMask & (data_train["label"] >= 0.5)],
# Read the decay times from the LHCb simulation - I've serialised it here
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
print("reading pickle")
with open("mc_times.pickle", "rb") as f:
    mc_times = pickle.load(f)

# Generate some random numbers from an exponential distribution with the right
# decay constant
d_lifetime_ps = 0.49
N = len(mc_times)
print("gen times")
exp_times = np.random.exponential(d_lifetime_ps, N)

# Split both samples the same way into a training and a testing part.
mc_train, mc_test, model_train, model_test = train_test_split(
    mc_times, exp_times)

# The exponential model is the `original` sample that gets reweighted; the
# simulation is the `target` it should reproduce.
bdt = GBReweighter()
print("Training bdt")
bdt.fit(original=model_train, target=mc_train)
weights = bdt.predict_weights(model_test)

kw = {"bins": np.linspace(0.0, 9.0, 100), "alpha": 0.3, "density": True}
plt.figure(figsize=(12.0, 9.0))
# Legend labels follow the roles used in bdt.fit() above: the simulation is
# the target, the exponential model is the original (they were previously
# labelled the other way round).
plt.hist(mc_test, label="Target", **kw)
plt.hist(model_test, label="Original", **kw)
plt.hist(model_test, label="Original Weighted", weights=weights, **kw)
plt.legend()
plt.xlabel("Time /ps")
plt.ylabel("Counts")
plt.savefig("mwe.png")
print "Performing train-test split" original_train, original_test = train_test_split(original) else: original_train = original original_test = original original_weight_distribution_train = original_train[original_weights] if original_weights != None else None target_weight_distribution = target[target_weights] if target_weights != None else None #Start the training gb = GBReweighter(**gb_settings) print( "Doing training of GBReweighter..." ) print( "Re-weighting variables: {}".format(reweighting_branches) ) begin = time.time() gb_output = gb.fit(original_train[reweighting_branches], target[reweighting_branches], original_weight = original_weight_distribution_train, target_weight = target_weight_distribution) print( "Settings: {}".format(gb_output) ) print( "Training of GBReweighter took {:.2f} seconds".format(time.time()-begin) ) #Save the classifier as pickle if applicable if options.Save: ensure_dir(options.Save) with open(options.Save, 'wb') as handle: pickle_file = {"GBReweighter" : gb, "Branches" : reweighting_branches} pickle.dump(pickle_file, handle) print( "GBReweighter saved to {}".format(options.Save) )
axis=1)  # (tail of a call that is opened above this excerpt)

# CREATING SAMPLE WEIGHTS
# (https://arogozhnikov.github.io/hep_ml/reweight.html)
# Columns used for reweighting: every column of tm_revenue including its
# index level(s).
res_cols = list(tm_revenue.reset_index().columns)
resampling_b = datab[['Segment Knicks'] + res_cols]
resampling_h = datah[['Segment Rangers'] + res_cols]
# "sample*": rows that have a segment value; "full*": rows whose segment is
# missing. The segment column itself is dropped and 'email' becomes the index.
sampleb = resampling_b.dropna(subset=['Segment Knicks']).drop(
    ['Segment Knicks'], axis=1).set_index('email')
fullb = resampling_b[pd.isnull(resampling_b['Segment Knicks'])].drop(
    ['Segment Knicks'], axis=1).set_index('email')
sampleh = resampling_h.dropna(subset=['Segment Rangers']).drop(
    ['Segment Rangers'], axis=1).set_index('email')
fullh = resampling_h[pd.isnull(resampling_h['Segment Rangers'])].drop(
    ['Segment Rangers'], axis=1).set_index('email')
# One reweighter object is fitted twice (basketball, then hockey). Each
# right-hand side is evaluated before the 'weight' column is assigned, so the
# new column is not among the features of its own fit.
reweighter = GBReweighter()
sampleb['weight'] = reweighter.fit(
    original=sampleb, target=fullb).predict_weights(sampleb).round(3)
sampleh['weight'] = reweighter.fit(
    original=sampleh, target=fullh).predict_weights(sampleh).round(3)

# LOGIT MODELING #
# Attach the weights to the per-sport rows via 'email' (default inner merge,
# so rows without a weight are dropped), remove the bookkeeping columns and
# the other sport's segment column, and index by 'uid'.
modeling_bball = pd.merge(data[data['vspt'] == 'basketball'],
                          sampleb['weight'].reset_index(),
                          on='email').drop(
                              ['vspt', 'Sample', 'email', 'Segment Rangers'],
                              axis=1).set_index('uid')
modeling_hockey = pd.merge(data[data['vspt'] == 'hockey'],
                           sampleh['weight'].reset_index(),
                           on='email').drop(
                               ['vspt', 'Sample', 'email', 'Segment Knicks'],
                               axis=1).set_index('uid')
# (tail of a constructor call that is opened above this excerpt; all
# hyperparameters are read from `params`)
n_estimators=params['n_estimators'],
learning_rate=params['learning_rate'],
max_depth=params['max_depth'],
min_samples_leaf=params['min_samples_leaf'],
loss_regularization=params['loss_regularization'])

# Create weight estimators and fit them to the data
log.info(f"Fitting weights...")
# Four features (eta, pt, invM, correctedScaledActualMu). Rows selected by
# trainMask with label < 0.5 are the `original` sample, rows with
# label >= 0.5 the `target`; .T turns the stacked per-feature arrays into one
# row per event.
reweighter.fit(
    original=np.array([
        data_train['eta'][trainMask & (data_train["label"] < 0.5)],
        data_train['pt'][trainMask & (data_train["label"] < 0.5)],
        data_train['invM'][trainMask & (data_train["label"] < 0.5)],
        data_train['correctedScaledActualMu'][
            trainMask & (data_train["label"] < 0.5)]
    ]).T,
    target=np.array([
        data_train['eta'][trainMask & (data_train["label"] >= 0.5)],
        data_train['pt'][trainMask & (data_train["label"] >= 0.5)],
        data_train['invM'][trainMask & (data_train["label"] >= 0.5)],
        data_train['correctedScaledActualMu'][
            trainMask & (data_train["label"] >= 0.5)]
    ]).T)
# `t` is a start timestamp set above this excerpt; the elapsed time is logged.
log.info(
    f"Fitting of weights is done (time: {timedelta(seconds=time() - t)})"
)

# Get weights
log.info(f"Get weights for training, validation and test set")
# Only the training rows selected by trainMask are passed on.
weight_train = getRegularWeights("train", reweighter, data_train[trainMask])