コード例 #1
0
def gbreweight(weighttree, originaltree, name, variables, n=None):
    '''Use Hep_ml GBReweighter to calculate weights for weighttree to match originaltree in the given variables.
    Adds a friend with branch named 'name' of length 2: the first element is the calculated weight, the second is
    the product of that weight with any existing weight used for the weighttree (from the selection).

    weighttree -- tree whose entries receive the weights; the friend tree is attached to it.
    originaltree -- tree providing the distribution that weighttree is reweighted to.
    name -- used both as the friend tree name and as the name of its single branch.
    variables -- variables the reweighting is done in; forwarded to get_weights_and_vals and to
        weighttree.get_functor_list.
    n -- forwarded unchanged to get_weights_and_vals (presumably a cap on the entries read -- confirm there).
    '''
    from hep_ml.reweight import GBReweighter

    referenceweights, referencevals = get_weights_and_vals(
        originaltree, variables, n)
    sampleweights, samplevals = get_weights_and_vals(
        weighttree, variables, n)

    weighter = GBReweighter()
    print('Fit GBReweighter', name)
    # GBReweighter learns weights for its 'original' sample so that it matches its 'target' sample, and
    # predict_weights is evaluated on 'original'-like entries. The weights below are predicted for
    # weighttree entries, so weighttree has to be the 'original' and originaltree the 'target'; passing
    # them the other way round would yield the inverse density ratio.
    weighter.fit(original=samplevals,
                 original_weight=sampleweights,
                 target=referencevals,
                 target_weight=referenceweights)
    weight = weighttree.selection_functor()
    vals = weighttree.get_functor_list(variables)

    def get_weight():
        # predict_weights works on a batch, so wrap the current entry's values in a one-row list.
        _w = weighter.predict_weights([vals()])[0]
        return [_w, _w * weight()]

    print('Add weights for GBReweighter', name)
    weighttree.add_friend_tree(name,
                               {name: dict(function=get_weight, length=2)})
コード例 #2
0
ファイル: MagicRWSample.py プロジェクト: cvilelahep/MagicRW
    def trainBDT(self, targetSample):
        """Fit a GBReweighter from this sample to ``targetSample`` and pickle it.

        Both samples provide their training data through ``getTrainDataFrame()``.
        The ``"preweight"`` column of each frame is used as the per-event weight
        and the columns named by ``observables`` are the reweighting variables.
        The fitted reweighter is written to ``self.gbrwPath()``.

        Raises SystemExit(-1) after printing both key lists when the two samples
        do not declare the same observables in the same order.
        """
        originDF = self.getTrainDataFrame()
        targetDF = targetSample.getTrainDataFrame()

        # Materialise the keys once. Plain lists compare order-sensitively on
        # every Python version, and the same list then selects the columns, so
        # origin and target columns are guaranteed to line up positionally.
        originObservables = list(self.observables.keys())
        targetObservables = list(targetSample.observables.keys())

        if originObservables != targetObservables:
            print('Error observables for target and origin data sets do not match. Exiting...')
            print('Origin: {}'.format(originObservables))
            print('Target: {}'.format(targetObservables))
            # Same exit status as before, without relying on the site-provided exit().
            raise SystemExit(-1)

        # Pull the weights out before the frames are narrowed to the observables.
        originPreWeights = originDF["preweight"]
        targetPreWeights = targetDF["preweight"]

        originDF = originDF[originObservables]
        targetDF = targetDF[targetObservables]

        reweighter = GBReweighter(n_estimators=200,
                                  learning_rate=.1,
                                  max_depth=3,
                                  min_samples_leaf=1000,
                                  loss_regularization=1.0)

        reweighter.fit(original=originDF,
                       target=targetDF,
                       original_weight=originPreWeights,
                       target_weight=targetPreWeights)

        with open(self.gbrwPath(), "wb") as f:
            pickle.dump(reweighter, f)
コード例 #3
0
ファイル: efficiencies.py プロジェクト: jcob95/D0ToK3pi
def train_reweighter():
    """Train a GBReweighter taking the generated model sample to selected data, then dump it.

    The training columns are the phase-space variables plus the lifetime
    variable of the current mode, with every ``phi`` variable replaced by
    ``cosphi`` and ``sinphi`` (both computed from ``phi1``). Each column is
    rescaled with its configured limits before fitting, and the fit uses
    250000 sampled rows from each side.
    """
    lifetime_vars = [gcm().ltime_var]
    training_vars = gcm().phsp_vars + lifetime_vars

    # Variables with 'phi' in their name are left out; the angle enters via its cosine and sine.
    columns = []
    for variable in training_vars:
        if 'phi' in variable.var:
            continue
        columns.append(variable.var)
    columns.extend(['cosphi', 'sinphi'])

    # Current mode stuff
    data = gcm().get_data([variable.var for variable in lifetime_vars])
    add_variables.append_phsp(data)
    data['cosphi'] = np.cos(data.phi1)
    data['sinphi'] = np.sin(data.phi1)
    df_sel = final_selection.get_final_selection()
    # Deliberately an in-place AND, so the object returned above is updated as before.
    df_sel &= selection.delta_mass_signal_region()

    gen = get_model()
    gen['cosphi'] = np.cos(gen.phi1)
    gen['sinphi'] = np.sin(gen.phi1)

    limits = dict(
        (variable.var, variable.binning[1:]) for variable in training_vars)
    limits['cosphi'] = (-1., 1)
    limits['sinphi'] = (-1., 1)

    # Map each column's [low, high] range linearly onto [2, 3], for data and model alike.
    for column in columns:
        low, high = limits[column]
        width = high - low
        for frame in (data, gen):
            frame[column] = (frame[column] - low) / width + 2.

    log.info('Training BDT reweighter for {}'.format(', '.join(columns)))
    reweighter = GBReweighter(n_estimators=300, max_depth=5, learning_rate=0.2)

    # Draw the model sample first, then the data sample, as the fit call originally did.
    model_sample = gen[columns].sample(n=250000)
    data_sample = data[columns][df_sel].sample(n=250000)
    reweighter.fit(original=model_sample, target=data_sample)
    bdt_utils.dump_reweighter(reweighter)
コード例 #4
0
ファイル: test_reweight.py プロジェクト: stjordanis/hep_ml
def test_folding_gb_reweighter():
    """Run the shared reweighter check on a 3-fold FoldingReweighter around a small GBReweighter."""
    base_reweighter = GBReweighter(n_estimators=20,
                                   max_depth=2,
                                   learning_rate=0.1)
    folded_reweighter = FoldingReweighter(base_reweighter, n_folds=3)
    check_reweighter(n_dimensions=2,
                     n_samples=200000,
                     reweighter=folded_reweighter,
                     folding=True)
コード例 #5
0
log.info(f"                Background:      {trainNBkg} ({( (trainNBkg) / (trainNSig+trainNBkg) )*100:.2f}%)")
log.info(f"        Shape of validation set: {np.shape(data_train[validMask])}")
log.info(f"                Signal:          {validNSig} ({( (validNSig) / (validNSig+validNBkg) )*100:.2f}%)")
log.info(f"                Background:      {validNBkg} ({( (validNBkg) / (validNSig+validNBkg) )*100:.2f}%)")


#============================================================================
# Reweigh
#============================================================================
log.info(f"Reweigh background data using GBReweighter on training set")
t = time()

# Three weight estimators that differ only in the number of boosting rounds
# (10, 20, 40), all with max_depth=5. learning_rate, min_samples_leaf and
# loss_regularization are not passed, so GBReweighter's own defaults apply.
reweighterEst10 = GBReweighter(n_estimators=10, max_depth=5)
reweighterEst20 = GBReweighter(n_estimators=20, max_depth=5)
reweighterEst40 = GBReweighter(n_estimators=40, max_depth=5)
log.info(f"Fitting weights...")
コード例 #6
0
def test_gb_reweighter_2d_new():
    """Check a 2-D GBReweighter whose underlying booster is given subsample=0.3 through gb_args."""
    settings = {
        'max_depth': 3,
        'n_estimators': 30,
        'learning_rate': 0.3,
        'gb_args': {'subsample': 0.3},
    }
    check_reweighter(n_dimensions=2,
                     n_samples=200000,
                     reweighter=GBReweighter(**settings))
コード例 #7
0
def test_gb_reweighter_1d():
    """Check a 1-D GBReweighter built from 100 estimators of depth 2."""
    settings = {'n_estimators': 100, 'max_depth': 2}
    check_reweighter(n_dimensions=1,
                     n_samples=100000,
                     reweighter=GBReweighter(**settings))
コード例 #8
0
# Signal side: .where() blanks every row whose 'Signal' flag is not 1, the
# flag column is removed, and rows containing any NaN are then dropped.
signal_reweight_data = reweight_data_small.where(reweight_data['Signal'] == 1)
signal_reweight_data_s_dropped = signal_reweight_data.drop(columns=['Signal'])
signal_reweight_data_nan_s_dropped = signal_reweight_data_s_dropped.dropna()

# Background side: same steps for rows whose 'Signal' flag is 0.
background_reweight_data = reweight_data_small.where(
    reweight_data['Signal'] == 0)
background_reweight_data_s_dropped = background_reweight_data.drop(
    columns=['Signal'])
background_reweight_data_nan_s_dropped = (
    background_reweight_data_s_dropped.dropna())

# Number of surviving signal rows per surviving background row.
ratio = (signal_reweight_data_nan_s_dropped.shape[0] /
         background_reweight_data_nan_s_dropped.shape[0])

# Learn weights that make the background sample resemble the signal sample.
reweighter = GBReweighter(n_estimators=40)
reweighter.fit(original=background_reweight_data_nan_s_dropped,
               target=signal_reweight_data_nan_s_dropped)
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)

# Rescale the weights so that their mean equals the signal/background ratio.
total_weights = ratio * weights / np.mean(weights)

#reweighted_background = background_reweight_data.multiply(weights, axis=0)

fig_weight, ax_weight = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

ax_weight[0, 0].hist(signal_reweight_data_nan_s_dropped.p_et_calo.ravel(),
                     bins=50,
                     range=(0, 100000),
                     color='r',
コード例 #9
0
        verbose=1)
    #set_trace()
    # Train the pre-reweighting classifier on the reweighting features to
    # separate isE classes, using the current per-row weights.
    pre_separation.fit(subtrain[reweight_feats],
                       subtrain[['isE']],
                       sample_weight=subtrain.weight)
    # Keep only column 1 of the predicted probabilities
    # (presumably the isE == 1 class -- confirm the classifier's class order).
    test_proba = pre_separation.predict_proba(subtest[reweight_feats])[:, 1]
    # Only the first two arrays returned by roc_curve are kept
    # (presumably fpr and tpr, if this is sklearn.metrics.roc_curve).
    roc_pre = roc_curve(subtest[['isE']],
                        test_proba,
                        sample_weight=subtest.weight)[:2]
    auc_pre = roc_auc_score(subtest[['isE']],
                            test_proba,
                            sample_weight=subtest.weight)

    #run reweighting -- not working on MC for some reason
    # A single estimator is used when debug is set, 30 otherwise.
    reweighter = GBReweighter(n_estimators=1 if debug else 30,
                              max_depth=4,
                              learning_rate=0.1)
    # First positional argument: isE == 1 rows; second: isE == 0 rows.
    # NOTE(review): no sample weights are passed here, although the classifier
    # above was trained with subtrain.weight -- confirm this is intended.
    reweighter.fit(subtrain[subtrain.isE == 1][reweight_feats], subtrain[
        subtrain.isE == 0][reweight_feats])  #make electrons look like tracks

    #run weights FOR EVERYTHING!
    # Overwrites the 'weight' column of the isE == 1 rows in each frame with the
    # predicted weights; rows with isE != 1 keep their existing weight.
    # NOTE(review): the trailing remark below says "1/w to be used", but the
    # predicted weights are stored as-is -- confirm which is wanted.
    for df in [data, subtrain, subtest]:
        weights = reweighter.predict_weights(
            df[df.isE == 1][reweight_feats])  #1/w to be used
        df.loc[df.isE == 1, 'weight'] = weights

    #save reweighter
    joblib.dump(reweighter, reweight_model_file, compress=True)

    # Check that separation vanishes
    post_separation = GradientBoostingClassifier(
コード例 #10
0
# Read the decay times from the LHCb simulation - I've serialised it here
# NOTE: unpickling runs code stored in the file; only load a pickle you produced yourself.
print("reading pickle")
with open("mc_times.pickle", "rb") as f:
    mc_times = pickle.load(f)

# Generate one exponentially distributed time per simulated time, using the
# lifetime as the scale (mean) of the exponential.
d_lifetime_ps = 0.49
N = len(mc_times)
print("gen times")
exp_times = np.random.exponential(d_lifetime_ps, N)

# Hold out part of both samples so the weights are checked on unseen entries.
mc_train, mc_test, model_train, model_test = train_test_split(
    mc_times, exp_times)

# In the reweighter's terms the exponential model is the "original" sample and
# the simulation is the "target": weights are learnt for the model so that it
# matches the simulation.
bdt = GBReweighter()
print("Training bdt")
bdt.fit(original=model_train, target=mc_train)
weights = bdt.predict_weights(model_test)

kw = {"bins": np.linspace(0.0, 9.0, 100), "alpha": 0.3, "density": True}
plt.figure(figsize=(12.0, 9.0))

# Labels follow the roles used in fit() above: simulation = target,
# exponential model = original, and the weights belong to the model sample.
plt.hist(mc_test, label="Target", **kw)
plt.hist(model_test, label="Original", **kw)
plt.hist(model_test, label="Original Weighted", weights=weights, **kw)
plt.legend()

plt.xlabel("Time /ps")
# density=True normalises every histogram, so the axis does not show raw counts.
plt.ylabel("Normalised counts")
plt.savefig("mwe.png")
コード例 #11
0
        target = read_root(options.Rootfiles[1], columns=all_branches_target)

        # Split original distribution if -t flag is given
        if options.TrainTest:
            # Parenthesised so this line parses on Python 3 as well, like the other prints here.
            print("Performing train-test split")
            original_train, original_test = train_test_split(original)
        else:
            # No split requested: the full original sample serves as both train and test set.
            original_train = original
            original_test = original

        # Weight columns are optional. Identity comparison with None avoids an
        # element-wise comparison should the branch selector ever be array-like.
        original_weight_distribution_train = original_train[original_weights] if original_weights is not None else None
        target_weight_distribution = target[target_weights] if target_weights is not None else None


        # Start the training
        gb = GBReweighter(**gb_settings)
        print( "Doing training of GBReweighter..." )
        print( "Re-weighting variables: {}".format(reweighting_branches) )
        begin = time.time()
        # Whatever fit() returns is printed below as the "Settings".
        gb_output = gb.fit(original_train[reweighting_branches], target[reweighting_branches],
                original_weight = original_weight_distribution_train, target_weight = target_weight_distribution)
        print( "Settings: {}".format(gb_output) )
        print( "Training of GBReweighter took {:.2f} seconds".format(time.time()-begin) )






        #Save the classifier as pickle if applicable
        if options.Save:
コード例 #12
0
                 how='right').drop(['vspt', 'uid', 'Sample', 'Segment Knicks'],
                                   axis=1)

# CREATING SAMPLE WEIGHTS # (https://arogozhnikov.github.io/hep_ml/reweight.html)
res_cols = list(tm_revenue.reset_index().columns)
resampling_b = datab[['Segment Knicks'] + res_cols]
resampling_h = datah[['Segment Rangers'] + res_cols]

# "sample" frames hold the rows that have a segment value, "full" frames the
# rows that do not. The segment column is removed and rows are keyed by email.
sampleb = resampling_b.dropna(subset=['Segment Knicks'])
sampleb = sampleb.drop(columns=['Segment Knicks']).set_index('email')
fullb = resampling_b[resampling_b['Segment Knicks'].isnull()]
fullb = fullb.drop(columns=['Segment Knicks']).set_index('email')
sampleh = resampling_h.dropna(subset=['Segment Rangers'])
sampleh = sampleh.drop(columns=['Segment Rangers']).set_index('email')
fullh = resampling_h[resampling_h['Segment Rangers'].isnull()]
fullh = fullh.drop(columns=['Segment Rangers']).set_index('email')

# One reweighter object is refitted for each sport. Each sample gets weights
# (rounded to 3 decimals) that make it resemble its unsegmented counterpart.
reweighter = GBReweighter()
reweighter.fit(original=sampleb, target=fullb)
sampleb['weight'] = reweighter.predict_weights(sampleb).round(3)
reweighter.fit(original=sampleh, target=fullh)
sampleh['weight'] = reweighter.predict_weights(sampleh).round(3)

# LOGIT MODELING #
# Attach the basketball weights by email, drop the bookkeeping columns and key by uid.
bball_rows = data[data['vspt'] == 'basketball']
modeling_bball = bball_rows.merge(sampleb['weight'].reset_index(), on='email')
modeling_bball = modeling_bball.drop(
    columns=['vspt', 'Sample', 'email', 'Segment Rangers']).set_index('uid')
modeling_hockey = pd.merge(data[data['vspt'] == 'hockey'],
                           sampleh['weight'].reset_index(),
                           on='email').drop(
                               ['vspt', 'Sample', 'email', 'Segment Knicks'],
コード例 #13
0
        'loss_regularization': 5.0
    }]

    log.info(f"Regular reweights")
    for iWeight, weightName in enumerate(reweightNames):
        t = time()
        # Print parameters
        log.info(f"Parameters for GBReweighter:")
        params = reweightParams[iWeight]
        for param in params:
            log.info(f"        {param} : {params[param]}")

        # Setup reweighter: https://arogozhnikov.github.io/hep_ml/reweight.html#
        reweighter = GBReweighter(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            min_samples_leaf=params['min_samples_leaf'],
            loss_regularization=params['loss_regularization'])

        # Create weight estimators and fit them to the data
        log.info(f"Fitting weights...")
        reweighter.fit(
            original=np.array([
                data_train['eta'][trainMask & (data_train["label"] < 0.5)],
                data_train['pt'][trainMask & (data_train["label"] < 0.5)],
                data_train['invM'][trainMask & (data_train["label"] < 0.5)],
                data_train['correctedScaledActualMu'][
                    trainMask & (data_train["label"] < 0.5)]
            ]).T,
            target=np.array([
                data_train['eta'][trainMask & (data_train["label"] >= 0.5)],