Beispiel #1
0
    def trainBDT(self, targetSample):
        """Train a GBReweighter from this sample to ``targetSample`` and pickle it.

        Both samples must declare the same set of observables.  The training
        frames are reduced to those observables, the ``"preweight"`` column of
        each frame is used as the per-event weight, and the fitted reweighter
        is written with ``pickle`` to ``self.gbrwPath()``.

        Args:
            targetSample: sample providing ``getTrainDataFrame()`` and an
                ``observables`` mapping, used as the reweighting target.

        Raises:
            SystemExit: with code -1 when the observable names differ.
        """
        import sys

        originDF = self.getTrainDataFrame()
        targetDF = targetSample.getTrainDataFrame()

        columns = list(self.observables.keys())
        targetColumns = list(targetSample.observables.keys())

        # Compare the names as sets: only the membership has to agree, the
        # ordering is fixed explicitly below.
        if set(columns) != set(targetColumns):
            print('Error observables for target and origin data sets do not match. Exiting...')
            print('Origin: {}'.format(columns))
            print('Target: {}'.format(targetColumns))
            sys.exit(-1)

        originPreWeights = originDF["preweight"]
        targetPreWeights = targetDF["preweight"]

        # Select with ONE column order for both frames: the reweighter works on
        # the values positionally, so the i-th column of both inputs must be
        # the same observable.
        originDF = originDF[columns]
        targetDF = targetDF[columns]

        reweighter = GBReweighter(n_estimators=200,
                                  learning_rate=.1,
                                  max_depth=3,
                                  min_samples_leaf=1000,
                                  loss_regularization=1.0)

        reweighter.fit(original=originDF,
                       target=targetDF,
                       original_weight=originPreWeights,
                       target_weight=targetPreWeights)

        with open(self.gbrwPath(), "wb") as f:
            pickle.dump(reweighter, f)
Beispiel #2
0
def train_reweighter():
    """Train a GBReweighter that maps the generated model onto selected data.

    The training columns are the ``phsp_vars`` of the current mode plus its
    ``ltime_var``; every variable whose name contains 'phi' is left out and
    replaced by the pair ``cosphi``/``sinphi`` computed from ``phi1``.  Each
    training column is rescaled with its limits to
    ``(x - min) / (max - min) + 2`` before the fit.  250000 rows are sampled
    from the model and from the selected data, and the fitted reweighter is
    passed to ``bdt_utils.dump_reweighter``.
    """
    def _add_phi_components(frame):
        # Store cosine and sine of phi1 as two separate columns.
        frame['cosphi'] = np.cos(frame.phi1)
        frame['sinphi'] = np.sin(frame.phi1)

    extra_vars = [gcm().ltime_var]
    all_vars = gcm().phsp_vars + extra_vars
    columns = [v.var for v in all_vars if 'phi' not in v.var] + ['cosphi', 'sinphi']

    # Data of the current mode and the row selection used for the target.
    data = gcm().get_data([v.var for v in extra_vars])
    add_variables.append_phsp(data)
    _add_phi_components(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    # Generated model, used as the "original" distribution.
    gen = get_model()
    _add_phi_components(gen)

    # (min, max) per column: the trailing part of each variable's binning,
    # and [-1, 1] for the two trigonometric columns.
    limits = {v.var: v.binning[1:] for v in all_vars}
    limits.update(cosphi=(-1., 1), sinphi=(-1., 1))
    for name in columns:
        lower, upper = limits[name]
        width = upper - lower
        data[name] = (data[name] - lower) / width + 2.
        gen[name] = (gen[name] - lower) / width + 2.

    log.info('Training BDT reweighter for {}'.format(', '.join(columns)))
    reweighter = GBReweighter(n_estimators=300, max_depth=5, learning_rate=0.2)

    model_sample = gen[columns].sample(n=250000)
    data_sample = data[columns][df_sel].sample(n=250000)
    reweighter.fit(original=model_sample, target=data_sample)
    bdt_utils.dump_reweighter(reweighter)
Beispiel #3
0
def gbreweight(weighttree, originaltree, name, variables, n=None):
    '''Use Hep_ml GBReweighter to calculate weights for weighttree to match originaltree in the given variables.
    Adds a friend with branch named 'name' of length 2: the first element is the calculated weight, the second is
    the product of that weight with any existing weight used for the weighttree (from the selection).

    weighttree   -- tree whose entries receive the weights (the friend tree is added to it).
    originaltree -- tree whose distribution the weighted weighttree should reproduce.
    name         -- name of the friend tree and of its branch.
    variables    -- variables in which the two trees are compared.
    n            -- forwarded unchanged to get_weights_and_vals for both trees.
    '''
    from hep_ml.reweight import GBReweighter

    referenceweights, referencevals = get_weights_and_vals(
        originaltree, variables, n)
    weightedweights, weightedvals = get_weights_and_vals(
        weighttree, variables, n)

    weighter = GBReweighter()
    print('Fit GBReweighter', name)
    # In hep_ml, `original` is the sample that gets reweighted and `target` is
    # the distribution it should reproduce; predict_weights() then returns
    # weights for `original`-like entries. The weights are evaluated on
    # weighttree below, so weighttree has to be `original` and originaltree
    # `target` (the roles were previously passed the other way round).
    weighter.fit(original=weightedvals,
                 original_weight=weightedweights,
                 target=referencevals,
                 target_weight=referenceweights)
    weight = weighttree.selection_functor()
    vals = weighttree.get_functor_list(variables)

    def get_weight():
        # [GBReweighter weight, GBReweighter weight * selection weight]
        _w = weighter.predict_weights([vals()])[0]
        return [_w, _w * weight()]

    print('Add weights for GBReweighter', name)
    weighttree.add_friend_tree(name,
                               {name: dict(function=get_weight, length=2)})
Beispiel #4
0
# Only n_estimators and max_depth are set here; learning_rate,
# min_samples_leaf and loss_regularization are left at the library defaults.
reweighterEst20 = GBReweighter(n_estimators=20, max_depth=5)
reweighterEst40 = GBReweighter(n_estimators=40, max_depth=5)

log.info("Fitting weights...")

# Every estimator is trained on exactly the same two samples, so the row
# selections and feature matrices are built once instead of once per fit.
# Rows with label < 0.5 form the `original` sample, rows with label >= 0.5 the
# `target` sample; both are restricted to trainMask.
_reweightColumns = ('pho_eta', 'pho_et', 'correctedScaledActualMu')
_originalSelection = trainMask & (data_train["label"] < 0.5)
_targetSelection = trainMask & (data_train["label"] >= 0.5)
_originalSample = np.array(
    [data_train[column][_originalSelection] for column in _reweightColumns]).T
_targetSample = np.array(
    [data_train[column][_targetSelection] for column in _reweightColumns]).T

for _estimator in (reweighterEst10, reweighterEst20, reweighterEst40):
    _estimator.fit(original=_originalSample, target=_targetSample)

log.info(f"Fitting of weights is done (time: {timedelta(seconds=time() - t)})")
# Signal sample: keep rows flagged Signal == 1 (all other rows are blanked by
# .where()), drop the flag column, then remove every row containing a NaN.
signal_reweight_data = reweight_data_small.where(reweight_data['Signal'] == 1)
signal_reweight_data_s_dropped = signal_reweight_data.drop(columns=['Signal'])
signal_reweight_data_nan_s_dropped = signal_reweight_data_s_dropped.dropna()

# Background sample: same procedure for rows flagged Signal == 0.
background_reweight_data = reweight_data_small.where(reweight_data['Signal'] == 0)
background_reweight_data_s_dropped = background_reweight_data.drop(columns=['Signal'])
background_reweight_data_nan_s_dropped = background_reweight_data_s_dropped.dropna()

# Signal-to-background row-count ratio, used to scale the weights below.
ratio = (len(signal_reweight_data_nan_s_dropped)
         / len(background_reweight_data_nan_s_dropped))

# Reweight the background sample towards the signal sample.
reweighter = GBReweighter(n_estimators=40)
reweighter.fit(original=background_reweight_data_nan_s_dropped,
               target=signal_reweight_data_nan_s_dropped)
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)

# Weights rescaled so that their mean equals `ratio`.
total_weights = ratio * weights / np.mean(weights)

#reweighted_background = background_reweight_data.multiply(weights, axis=0)

fig_weight, ax_weight = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

ax_weight[0, 0].hist(
    signal_reweight_data_nan_s_dropped.p_et_calo.ravel(),
    bins=50, range=(0, 100000), color='r', alpha=0.5, label="Signal")
Beispiel #6
0
    # Fit the pre-reweighting classifier on the reweighting features to
    # separate the isE classes, using the `weight` column as sample weight.
    pre_separation.fit(subtrain[reweight_feats],
                       subtrain[['isE']],
                       sample_weight=subtrain.weight)
    # Column 1 of predict_proba is kept as the test-set score.
    test_proba = pre_separation.predict_proba(subtest[reweight_feats])[:, 1]
    # Only the first two arrays returned by roc_curve are kept.
    roc_pre = roc_curve(subtest[['isE']],
                        test_proba,
                        sample_weight=subtest.weight)[:2]
    auc_pre = roc_auc_score(subtest[['isE']],
                            test_proba,
                            sample_weight=subtest.weight)

    # Run reweighting -- not working on MC for some reason
    reweighter = GBReweighter(n_estimators=1 if debug else 30,
                              max_depth=4,
                              learning_rate=0.1)
    reweighter.fit(subtrain[subtrain.isE == 1][reweight_feats], subtrain[
        subtrain.isE == 0][reweight_feats])  # make electrons (isE == 1) look like tracks (isE == 0)

    # Predict weights for the isE == 1 rows of every frame and store them in
    # the 'weight' column of exactly those rows.
    for df in [data, subtrain, subtest]:
        weights = reweighter.predict_weights(
            df[df.isE == 1][reweight_feats])  # original note: "1/w to be used" -- TODO confirm whether consumers invert these
        df.loc[df.isE == 1, 'weight'] = weights

    # Save reweighter
    joblib.dump(reweighter, reweight_model_file, compress=True)

    # Check that separation vanishes
    post_separation = GradientBoostingClassifier(
        n_estimators=1 if debug else 50,
        max_depth=4,
        random_state=42,
# Directory two levels above this file; the input ROOT file lives below it.
str_to_parent_folder = str(Path(__file__).resolve().parent.parent)
tree = uproot.open(
    str_to_parent_folder + "/forward_MC/user.lehrke.mc16_13TeV.361106.Zee.EGAM8.e3601_e5984_s3126_r10724_r10726_p3648.ePID18_NTUP3_v01_myOutput.root/user.lehrke.17118381._000003.myOutput.root"
)[b'tree;1']
reweight_data = tree.pandas.df(
    ["averageInteractionsPerCrossing", "p_et_calo", "p_eta", "p_TruthType"])
# frac=1 keeps every row; the rows are only shuffled (fixed seed).
reweight_data_small = reweight_data.sample(frac=1, replace=False, random_state=42)

# Rows with p_TruthType == 2: blank the others with .where(), drop the
# truth-type column, then remove every row containing a NaN.
p_TruthType_reweight_data = reweight_data_small.where(
    reweight_data_small['p_TruthType'] == 2)
p_TruthType_reweight_data_s_dropped = p_TruthType_reweight_data.drop(
    columns=['p_TruthType'])
p_TruthType_reweight_data_nan_s_dropped = p_TruthType_reweight_data_s_dropped.dropna()

# All remaining rows (p_TruthType != 2), treated the same way.
background_reweight_data = reweight_data_small.where(
    reweight_data['p_TruthType'] != 2)
background_reweight_data_s_dropped = background_reweight_data.drop(
    columns=['p_TruthType'])
background_reweight_data_nan_s_dropped = background_reweight_data_s_dropped.dropna()

# Row-count ratio of the two samples, used to scale the weights below.
ratio = (len(p_TruthType_reweight_data_nan_s_dropped)
         / len(background_reweight_data_nan_s_dropped))

# Reweight the background sample towards the p_TruthType == 2 sample.
reweighter = GBReweighter(n_estimators=40)
reweighter.fit(original=background_reweight_data_nan_s_dropped,
               target=p_TruthType_reweight_data_nan_s_dropped)
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)

# Weights rescaled so that their mean equals `ratio`, then written to CSV.
total_weights = ratio * weights / np.mean(weights)
np.savetxt('../weights/weights_MC_03.csv', total_weights, delimiter=',')
#reweighted_background = background_reweight_data.multiply(weights, axis=0)

""" fig_weight, ax_weight = plt.subplots(3,2, figsize=(15,15))


ax_weight[0,0].hist(p_TruthType_reweight_data_nan_s_dropped.p_et_calo.ravel(),bins=50,range=(0,100000), color = 'r', alpha = 0.5, label = "p_TruthType")
ax_weight[0,0].hist(background_reweight_data_nan_s_dropped.p_et_calo.ravel(), bins=50,range=(0,100000), color = 'blue', alpha = 0.5, label = "Background")
ax_weight[0,0].legend(loc="upper right")
ax_weight[0,0].set_title('P_et_calo (before weight)')
Beispiel #8
0
    n_estimators=17,
    #learning_rate=params['learning_rate'],
    max_depth=5,
    #min_samples_leaf=params['min_samples_leaf'],
    #loss_regularization=params['loss_regularization']
)

# Fit the first weight estimator: rows inside trainMask with label < 0.5 are
# the `original` sample, rows with label >= 0.5 the `target` sample. Each
# sample is a matrix with one column per reweighting variable.
log.info(f"Fitting weights...")
reweighterEst10.fit(
    original=np.array([
        data_train[column][trainMask & (data_train["label"] < 0.5)]
        for column in ('muo_eta', 'muo_pt', 'correctedScaledAverageMu')
    ]).T,
    target=np.array([
        data_train[column][trainMask & (data_train["label"] >= 0.5)]
        for column in ('muo_eta', 'muo_pt', 'correctedScaledAverageMu')
    ]).T)
reweighterEst20.fit(
    original=np.array([
        data_train['muo_eta'][trainMask & (data_train["label"] < 0.5)],
        data_train['muo_pt'][trainMask & (data_train["label"] < 0.5)],
        data_train['correctedScaledAverageMu'][trainMask
                                               & (data_train["label"] < 0.5)]
    ]).T,
    target=np.array([
        data_train['muo_eta'][trainMask & (data_train["label"] >= 0.5)],
Beispiel #9
0
# Load the decay times from the LHCb simulation, serialised as a pickle.
# NOTE(review): pickle.load runs arbitrary code from the file -- only load
# files from a trusted source.
print("reading pickle")
with open("mc_times.pickle", mode="rb") as f:
    mc_times = pickle.load(f)

# Draw as many exponentially distributed times as there are simulated ones,
# using the decay constant below as the scale.
d_lifetime_ps = 0.49
N = len(mc_times)
print("gen times")
exp_times = np.random.exponential(scale=d_lifetime_ps, size=N)

# Split both samples with the same (random) partition.
mc_train, mc_test, model_train, model_test = train_test_split(mc_times, exp_times)

# Learn weights that make the exponential model look like the simulation and
# evaluate them on the held-out model sample.
bdt = GBReweighter()
print("Training bdt")
bdt.fit(original=model_train, target=mc_train)
weights = bdt.predict_weights(model_test)

# Common histogram style for the three overlaid distributions.
kw = dict(bins=np.linspace(0.0, 9.0, num=100), alpha=0.3, density=True)
plt.figure(figsize=(12.0, 9.0))

# NOTE(review): the fit above uses the model as `original` and the simulation
# as `target`, while the labels below name them the other way round -- confirm
# which naming is intended.
plt.hist(mc_test, label="Original", **kw)
plt.hist(model_test, label="Target", **kw)
plt.hist(model_test, weights=weights, label="Target Weighted", **kw)
plt.legend()

plt.xlabel("Time /ps")
plt.ylabel("Counts")
plt.savefig("mwe.png")
            print "Performing train-test split"
            original_train, original_test = train_test_split(original)
        else:
            original_train = original
            original_test = original

        # The weight column names are optional; None means the sample is
        # passed to the reweighter without per-event weights. Identity
        # comparison is used because `!= None` can be overridden by the
        # operand's type.
        original_weight_distribution_train = (
            original_train[original_weights] if original_weights is not None else None)
        target_weight_distribution = (
            target[target_weights] if target_weights is not None else None)

        #Start the training
        gb = GBReweighter(**gb_settings)
        print( "Doing training of GBReweighter..." )
        print( "Re-weighting variables: {}".format(reweighting_branches) )
        begin = time.time()
        # Only the training part of `original` is used for the fit; `target`
        # is used in full.
        gb_output = gb.fit(original_train[reweighting_branches], target[reweighting_branches],
                original_weight = original_weight_distribution_train, target_weight = target_weight_distribution)
        print( "Settings: {}".format(gb_output) )
        print( "Training of GBReweighter took {:.2f} seconds".format(time.time()-begin) )

        #Save the classifier as pickle if applicable
        if options.Save:
            ensure_dir(options.Save)
            with open(options.Save, 'wb') as handle:
                # The branch list is stored next to the reweighter so that it
                # travels with the pickled model.
                pickle_file = {"GBReweighter" : gb, "Branches" : reweighting_branches}
                pickle.dump(pickle_file, handle)
            print( "GBReweighter saved to {}".format(options.Save) )
Beispiel #11
0
                                   axis=1)

# CREATING SAMPLE WEIGHTS # (https://arogozhnikov.github.io/hep_ml/reweight.html)
# For each table, rows WITH a segment value form the "sample" and rows WITHOUT
# one form the "full" reference; the segment column is dropped from both and
# 'email' becomes the index.
res_cols = list(tm_revenue.reset_index().columns)
resampling_b = datab[['Segment Knicks', *res_cols]]
resampling_h = datah[['Segment Rangers', *res_cols]]

sampleb = (resampling_b.dropna(subset=['Segment Knicks'])
           .drop(columns=['Segment Knicks'])
           .set_index('email'))
fullb = (resampling_b[resampling_b['Segment Knicks'].isnull()]
         .drop(columns=['Segment Knicks'])
         .set_index('email'))
sampleh = (resampling_h.dropna(subset=['Segment Rangers'])
           .drop(columns=['Segment Rangers'])
           .set_index('email'))
fullh = (resampling_h[resampling_h['Segment Rangers'].isnull()]
         .drop(columns=['Segment Rangers'])
         .set_index('email'))

# The same reweighter object is refitted for each table; the weights of the
# sample rows (rounded to 3 decimals) are stored in a new 'weight' column.
reweighter = GBReweighter()
sampleb['weight'] = (
    reweighter.fit(original=sampleb, target=fullb)
    .predict_weights(sampleb)
    .round(3))
sampleh['weight'] = (
    reweighter.fit(original=sampleh, target=fullh)
    .predict_weights(sampleh)
    .round(3))

# LOGIT MODELING #
# Attach the sample weights to the rows of each sport by 'email', drop the
# columns not used for modeling, and index the result by 'uid'.
modeling_bball = (
    data[data['vspt'] == 'basketball']
    .merge(sampleb['weight'].reset_index(), on='email')
    .drop(columns=['vspt', 'Sample', 'email', 'Segment Rangers'])
    .set_index('uid')
)
modeling_hockey = (
    data[data['vspt'] == 'hockey']
    .merge(sampleh['weight'].reset_index(), on='email')
    .drop(columns=['vspt', 'Sample', 'email', 'Segment Knicks'])
    .set_index('uid')
)
Beispiel #12
0
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            min_samples_leaf=params['min_samples_leaf'],
            loss_regularization=params['loss_regularization'])

        # Fit the weight estimator: rows inside trainMask with label < 0.5
        # are the `original` sample, rows with label >= 0.5 the `target`
        # sample. Each sample is a matrix with one column per variable.
        log.info(f"Fitting weights...")
        reweighter.fit(
            original=np.array([
                data_train[column][trainMask & (data_train["label"] < 0.5)]
                for column in ('eta', 'pt', 'invM', 'correctedScaledActualMu')
            ]).T,
            target=np.array([
                data_train[column][trainMask & (data_train["label"] >= 0.5)]
                for column in ('eta', 'pt', 'invM', 'correctedScaledActualMu')
            ]).T)
        log.info(f"Fitting of weights is done (time: {timedelta(seconds=time() - t)})")

        # Get weights
        log.info(f"Get weights for training, validation and test set")
        weight_train = getRegularWeights("train", reweighter,
                                         data_train[trainMask])