Beispiel #1
0
def test_gradient_boosting(n_samples=1000):
    """
    Smoke-test uGradientBoostingClassifier with several loss functions.

    Two independent samples (test first, then train) are generated with
    ``generate_sample``; for every loss a classifier is fitted on the train
    sample and its ``score`` on the test sample must be at least 0.7.
    """
    distance = 0.6
    n_estimators = 20
    # The sample is generated correlated with the first variable ...
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # ... and the flatness-aware losses are asked to be uniform along it.
    uniform_variables = ['column0']

    losses_under_test = [
        SimpleKnnLossFunction(uniform_variables),
        BinomialDevianceLossFunction(),
        BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5),
        BinFlatnessLossFunction(uniform_variables,
                                ada_coefficient=0.5,
                                uniform_label=[0, 1]),
        KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5),
        KnnFlatnessLossFunction(uniform_variables,
                                ada_coefficient=0.5,
                                uniform_label=[0, 1]),
    ]

    for loss in losses_under_test:
        classifier = uGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=.2,
            subsample=0.7,
            n_estimators=n_estimators,
            train_variables=None)
        fitted = classifier.fit(trainX[:n_samples], trainY[:n_samples])
        result = fitted.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result
Beispiel #2
0
def test_gradient_boosting(n_samples=1000):
    """
    Check that UGradientBoostingClassifier works with each available loss.

    A classifier is trained once per loss function on a generated sample and
    must reach a ``score`` of at least 0.7 on a second, independently
    generated sample.
    """
    distance = 0.6
    # Samples are generated correlated with the first variable (test first,
    # then train, so the generation order stays the same).
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # The uniformity-aware losses are asked to be flat along this feature.
    uniform_features = ['column0']

    losses_under_test = [
        LogLossFunction(),
        AdaLossFunction(),
        CompositeLossFunction(),
        KnnAdaLossFunction(uniform_features=uniform_features,
                           uniform_label=1),
        KnnAdaLossFunction(uniform_features=uniform_features,
                           uniform_label=[0, 1]),
        BinFlatnessLossFunction(uniform_features,
                                fl_coefficient=2.,
                                uniform_label=0),
        BinFlatnessLossFunction(uniform_features,
                                fl_coefficient=2.,
                                uniform_label=[0, 1]),
        KnnFlatnessLossFunction(uniform_features,
                                fl_coefficient=2.,
                                uniform_label=1),
        KnnFlatnessLossFunction(uniform_features,
                                fl_coefficient=2.,
                                uniform_label=[0, 1]),
    ]

    for loss in losses_under_test:
        booster = UGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=0.2,
            subsample=0.7,
            n_estimators=25,
            train_features=None)
        clf = booster.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        message = "The quality is too poor: {} with loss: {}".format(
            result, loss)
        assert result >= 0.7, message
Beispiel #3
0
def flatnessloss(X, y, test):
    """
    Average the test-set predictions of a flatness-loss booster over 7 folds.

    A ``UGradientBoostingClassifier`` with a ``BinFlatnessLossFunction`` on
    the ``'mass'`` column is refitted on the training part of each of 7
    stratified folds; every refit predicts ``test`` and the per-fold
    probabilities of class 1 are averaged.

    Parameters:
        X: DataFrame of training rows; must contain a ``'mass'`` column.
           Every other column is used as a training feature.
        y: labels for the rows of ``X``, in the same row order.
        test: rows to predict, passed unchanged to ``predict_proba``.

    Returns:
        1-D numpy array with one averaged class-1 probability per row of
        ``test``.
    """
    features = list(X.columns)
    features.remove('mass')
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss,
                                      n_estimators=300,
                                      subsample=0.7,
                                      max_depth=9,
                                      min_samples_leaf=8,
                                      learning_rate=0.1,
                                      train_features=features,
                                      random_state=11)

    # Shuffle rows and labels with the same *positional* permutation.
    # The former ``.ix`` lookups were label-based on an integer index, so
    # after the shuffle the fold rows could be paired with the labels of
    # different rows; ``.iloc`` plus a plain array keeps them aligned.
    order = np.random.permutation(X.shape[0])
    X = X.iloc[order]
    y = np.asarray(y)[order]

    skf = cross_validation.StratifiedKFold(y, n_folds=7)
    # One column of test predictions per fold.
    blend_test_j = np.zeros((test.shape[0], len(skf)))

    # Only the training part of each fold is needed: the out-of-fold
    # predictions were never returned, so they are no longer computed.
    for i, (train_index, _) in enumerate(skf):
        print("Fold: %s" % i)
        clf.fit(X.iloc[train_index], y[train_index])
        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]

    return blend_test_j.mean(1)
Beispiel #4
0
for i in range(100):
    print 'shuffling'
    train = shuffle(train)
    print("Train a Random Forest model")

    rf1 = RandomForestClassifier(n_estimators=500,
                                 n_jobs=-1,
                                 criterion="entropy",
                                 max_depth=10,
                                 max_features=6,
                                 min_samples_leaf=2)

    rf1.fit(train[features], train["signal"])
    print("Train a UGradientBoostingClassifier")
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)

    rf = UGradientBoostingClassifier(loss=loss,
                                     n_estimators=200,
                                     max_depth=6,
                                     learning_rate=0.15,
                                     train_features=features,
                                     subsample=0.7,
                                     random_state=369)
    rf.fit(train[features + ['mass']], train['signal'])

    print("Train a XGBoost model")
    params = {
        "objective": "binary:logistic",
        "learning_rate": 0.2,
        "max_depth": 6,
Beispiel #5
0
]

# Every column that is not explicitly excluded becomes a training feature.
features = [column for column in df.columns if column not in features_out]

print("Split train/test")
train, test = train_test_split(df, test_size=0.33)

# Feature/label views of both parts of the split.
X_train, y_train = train[features], train['signal']
X_val, y_val = test[features], test['signal']

# Flatness loss along 'mass'; the booster itself trains on `features` only,
# but 'mass' must be present in the frame handed to fit().
loss = BinFlatnessLossFunction(
    ['mass'],
    n_bins=15,
    uniform_label=0,
    fl_coefficient=15,
    power=2,
)
ugbc = UGradientBoostingClassifier(
    loss=loss,
    n_estimators=550,
    max_depth=6,
    learning_rate=0.15,
    train_features=features,
    subsample=0.7,
    random_state=123,
)
ugbc.fit(train[features + ['mass']], train['signal'])

# Predictions for the held-out part, wrapped in a one-column frame.
pred_raw = ugbc.predict(test[features])
pred = pd.DataFrame(data={'signal': pred_raw})
# NOTE: evaluating `pred` against `y_val` (accuracy_fn) is currently disabled.
def Model1():
    """
    Train the "Model 1" ensemble and return its blended test predictions.

    Fifteen first-level classifiers are handed to ``utils.StackModels``.
    Its ``train_probs`` / ``test_probs`` outputs are appended to the original
    frames as meta-feature columns for a uniform gradient boosting model,
    while its ``train_blend`` / ``test_blend`` outputs are the inputs of an
    XGBoost wrapper and a random forest.  The three predictions are combined
    as ``0.3 * xgb**0.65 * rf**0.35 + 0.7 * ugb``.

    Returns:
        The combined predictions for the test set returned by
        ``utils.LoadData``.
    """
    model = 1  # model number, selects the feature engineering in LoadData
    n_folds = 3  # number of folds used to generate the meta-features
    n_stack = 15  # number of first-level models used for stacking

    # Data plus the list of feature names used for estimation.
    train, test, features = utils.LoadData(model)

    # ---- first-level models -------------------------------------------
    # Seven k-NN classifiers that differ only in the neighbourhood size.
    knn_models = [
        KNeighborsClassifier(n_neighbors=k,
                             weights='uniform',
                             algorithm='auto',
                             leaf_size=30,
                             p=2,
                             metric='minkowski',
                             metric_params=None)
        for k in (5, 10, 20, 40, 80, 160, 320)
    ]

    logistic = LogisticRegression(penalty='l2',
                                  dual=False,
                                  tol=0.0001,
                                  C=5.0,
                                  fit_intercept=True,
                                  intercept_scaling=1,
                                  class_weight=None,
                                  random_state=101,
                                  solver='lbfgs',
                                  max_iter=200,
                                  multi_class='ovr',
                                  verbose=0)

    naive_bayes = GaussianNB()

    svc = SVC(C=5.0,
              kernel='rbf',
              degree=3,
              gamma=0.0,
              coef0=0.008,
              shrinking=True,
              probability=True,
              tol=0.001,
              cache_size=200,
              class_weight=None,
              verbose=False,
              max_iter=-1,
              random_state=101)

    # The random forest and the extra-trees model share one configuration.
    tree_ensemble_params = dict(n_estimators=250,
                                criterion='gini',
                                max_depth=6,
                                min_samples_split=2,
                                min_samples_leaf=5,
                                min_weight_fraction_leaf=0.0,
                                max_features=0.7,
                                max_leaf_nodes=None,
                                bootstrap=False,
                                oob_score=False,
                                n_jobs=2,
                                random_state=101,
                                verbose=0,
                                warm_start=False,
                                class_weight=None)
    random_forest = RandomForestClassifier(**tree_ensemble_params)
    extra_trees = ExtraTreesClassifier(**tree_ensemble_params)

    gradient_boosting = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.2,
        n_estimators=450,
        subsample=0.7,
        min_samples_split=2,
        min_samples_leaf=5,
        min_weight_fraction_leaf=0.0,
        max_depth=6,
        init=None,
        random_state=101,
        max_features=None,
        verbose=0,
        max_leaf_nodes=None,
        warm_start=False)

    sgd = SGDClassifier(loss='log',
                        penalty='l2',
                        alpha=0.0001,
                        l1_ratio=0.15,
                        fit_intercept=True,
                        n_iter=10,
                        shuffle=True,
                        verbose=0,
                        epsilon=0.1,
                        n_jobs=2,
                        random_state=101,
                        learning_rate='optimal',
                        eta0=0.0,
                        power_t=0.5,
                        class_weight=None,
                        warm_start=False,
                        average=False)

    xgb_base = models.XGBoostClassifier(nthread=2,
                                        eta=0.2,
                                        gamma=0,
                                        max_depth=6,
                                        min_child_weight=3,
                                        max_delta_step=0,
                                        subsample=0.7,
                                        colsample_bytree=0.7,
                                        silent=1,
                                        seed=101,
                                        l2_reg=1,
                                        l1_reg=0,
                                        n_estimators=450)

    # Order matters: column i of the stacked outputs is indexed below by
    # the position of the model in this list.
    clfs = knn_models + [
        logistic, naive_bayes, svc, random_forest, extra_trees,
        gradient_boosting, sgd, xgb_base
    ]

    # ---- stacked datasets ----------------------------------------------
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values, clfs, n_folds)

    # ---- data for uniform boosting -------------------------------------
    # One meta-feature column per stacked model; the trailing space is part
    # of the column name.
    columns = ['p%s ' % idx for idx in range(n_stack)]
    meta_train = pd.DataFrame(
        {name: train_probs[:, pos] for pos, name in enumerate(columns)})
    meta_test = pd.DataFrame(
        {name: test_probs[:, pos] for pos, name in enumerate(columns)})
    # NOTE(review): concat along axis=1 pairs rows by index label - assumes
    # train/test carry a default 0..n-1 index; confirm against LoadData.
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # UGB trains on the original features plus the meta-features.
    features_ugb = features + columns

    # ---- second-level models -------------------------------------------
    flatness_loss = BinFlatnessLossFunction(['mass'],
                                            n_bins=20,
                                            power=1,
                                            fl_coefficient=3,
                                            uniform_label=0)

    clf_ugb = UGradientBoostingClassifier(loss=flatness_loss,
                                          n_estimators=275,
                                          max_depth=11,
                                          min_samples_leaf=3,
                                          learning_rate=0.03,
                                          train_features=features_ugb,
                                          subsample=0.85,
                                          random_state=101)

    clf_xgb = models.XGBoostClassifier(nthread=6,
                                       eta=0.0225,
                                       gamma=1.225,
                                       max_depth=11,
                                       min_child_weight=10,
                                       max_delta_step=0,
                                       subsample=0.8,
                                       colsample_bytree=0.3,
                                       silent=1,
                                       seed=101,
                                       l2_reg=1,
                                       l1_reg=0,
                                       n_estimators=1100)

    clf_rf = RandomForestClassifier(n_estimators=375,
                                    criterion='gini',
                                    max_depth=10,
                                    min_samples_split=6,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=0.6,
                                    max_leaf_nodes=None,
                                    bootstrap=True,
                                    oob_score=False,
                                    n_jobs=4,
                                    random_state=101,
                                    verbose=0,
                                    warm_start=False,
                                    class_weight=None)

    # ---- training --------------------------------------------------------
    target = train['signal']

    print("Training a Uniform Gradient Boosting model")
    # 'mass' is needed by the flatness loss although it is not a feature.
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, target)
    # NOTE(review): unlike the other two models this output is used without
    # selecting column 1 - presumably the wrapper already returns a 1-D
    # vector; confirm against models.XGBoostClassifier.
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, target)
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # ---- ensemble --------------------------------------------------------
    # Weighted geometric mix of XGBoost and RF, blended linearly with UGB.
    stacked_part = 0.3 * (preds_xgb ** 0.65) * (preds_rf ** 0.35)
    return stacked_part + 0.7 * preds_ugb
def stacked_models(train, features, test, in_sample=True):
    """
    Build stacked generalization models.

    First-level classifiers are trained with 5-fold cross-validation; their
    out-of-fold class-1 probabilities form ``blend_train`` and their
    fold-averaged probabilities on the evaluation rows form ``blend_test``.
    Four blenders (logistic regression, gradient boosting, a neural net and
    AdaBoost) are then fitted on ``blend_train``.

    Parameters:
        train: DataFrame with the feature columns, ``'mass'`` and
            ``'signal'`` (and ``'min_ANNmuon'`` when ``in_sample`` is true).
        features: list of feature column names used by the classifiers.
        test: DataFrame to predict; ignored when ``in_sample`` is true.
        in_sample: if true, hold out the last 25% of the shuffled training
            rows (restricted to ``min_ANNmuon > 0.4``) for scoring instead
            of predicting ``test``. Set to False to predict on the test set.

    Returns:
        ``(blend_train, Y_dev, blend_test, Y_test)`` when ``in_sample`` is
        true; otherwise the mean of the four blenders' class-1
        probabilities for ``test``.
    """
    # Reproducible shuffle of the training rows (identical in both modes).
    np.random.seed(1)
    new_indices = np.asarray(train.index.copy())
    np.random.shuffle(new_indices)
    train = train.iloc[new_indices].reset_index(drop=True).copy()

    if in_sample:
        # not used in CV testing..
        del test

        cutoff = int(new_indices.shape[0] * 0.75)
        dev_part = train[:cutoff]
        holdout = train[cutoff:]
        holdout = holdout[holdout['min_ANNmuon'] > 0.4]

        X_dev = dev_part.reset_index(drop=True).copy()
        Y_dev = dev_part['signal'].reset_index(drop=True).copy()

        X_test = holdout.reset_index(drop=True).copy()
        Y_test = holdout['signal'].reset_index(drop=True).copy()
    else:
        X_dev = train.reset_index(drop=True).copy()
        Y_dev = train['signal'].reset_index(drop=True).copy()

        X_test = test.reset_index(drop=True).copy()
        Y_test = None

    n_folds = 5

    # Put your parameter-tuned classifiers in this list.
    clfs = [
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1),
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1,
                               max_depth=6),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1,
                             max_depth=6),
        Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        XGBoostClassifier(eval_metric='auc',
                          objective='binary:logistic',
                          num_class=2,
                          nthread=4,
                          silent=1,
                          colsample_bytree=0.6,
                          eta=0.005,
                          max_depth=6,
                          min_child_weight=13,
                          seed=1337,
                          subsample=0.7),
        NN1(len(features)),
        NN2(len(features)),
        NN3(len(features)),
        NN4(len(features))
    ]

    skf = list(StratifiedKFold(Y_dev, n_folds))

    # Number of training data x Number of classifiers
    blend_train = np.zeros((X_dev.shape[0], len(clfs)))
    # Number of testing data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs)))

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('X_test.shape = %s' % (str(X_test.shape)))
    print('blend_train.shape = %s' % (str(blend_train.shape)))
    print('blend_test.shape = %s' % (str(blend_test.shape)))

    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print('Training classifier [%s]' % (j))
        # Number of testing data x Number of folds , we will take the mean of
        # the predictions later
        blend_test_j = np.zeros((X_test.shape[0], len(skf)))

        # The hep_ml boosters need 'mass' in the training frame for their
        # flatness loss although it is not one of the training features.
        if isinstance(clf, UGradientBoostingClassifier):
            fit_columns = features + ['mass']
        else:
            fit_columns = features

        for i, (train_index, cv_index) in enumerate(skf):
            print('Fold [%s]' % (i))

            # This is the training and validation set
            X_train = X_dev.iloc[train_index].copy()
            Y_train = Y_dev.iloc[train_index].copy()
            X_cv = X_dev.iloc[cv_index].copy()

            clf.fit(X_train[fit_columns], Y_train.values.astype(np.int32))

            # This output will be the basis for our blended classifier to train against,
            # which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv[features])[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test[features])[:, 1]
        # Take the mean of the per-fold predictions on the evaluation rows
        blend_test[:, j] = blend_test_j.mean(1)

    # str() keeps this consistent with the shape prints above; formatting
    # the bare shape tuple only worked because it has exactly one element.
    print('Y_dev.shape = %s' % (str(Y_dev.shape)))

    # blend with LR...
    bclf = LogisticRegression()
    bclf.fit(blend_train, Y_dev)

    # ... with gradient boosting ...
    bclf2 = GradientBoostingClassifier(n_estimators=150,
                                       learning_rate=0.02,
                                       max_depth=4,
                                       subsample=0.9,
                                       verbose=3,
                                       random_state=1337)
    bclf2.fit(blend_train, Y_dev)

    # ... with a one-hidden-layer neural net ...
    bclf3 = NeuralNet(
        layers=[('input', layers.InputLayer), ('hidden', layers.DenseLayer),
                ('output', layers.DenseLayer)],

        # layer parameters:
        input_shape=(None, blend_train.shape[1]),
        hidden_num_units=blend_train.shape[1],
        output_nonlinearity=nonlinearities.softmax,  # softmax output layer
        output_num_units=2,  # 2 target values

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        regression=False,  # this is a classification problem
        max_epochs=53,  # TRY 50 and 46 epochs!
        verbose=1,
        eval_size=0.10)

    bclf3.fit(blend_train.astype(np.float32), Y_dev.astype(np.int32))

    # ... and with AdaBoost.
    bclf4 = AdaBoostClassifier(n_estimators=400, random_state=88)
    bclf4.fit(blend_train, Y_dev)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    Y_test_predict2 = bclf2.predict_proba(blend_test)[:, 1]
    Y_test_predict3 = bclf3.predict_proba(blend_test.astype(np.float32))[:, 1]
    Y_test_predict4 = bclf4.predict_proba(blend_test)[:, 1]

    print('Logit Coefs: %s' % (bclf.coef_,))
    if in_sample:
        score = evaluation.roc_auc_truncated(Y_test, Y_test_predict)
        score2 = evaluation.roc_auc_truncated(Y_test, Y_test_predict2)
        score3 = evaluation.roc_auc_truncated(Y_test, blend_test.mean(1))
        score4 = evaluation.roc_auc_truncated(
            Y_test, scipy_opt(blend_train, Y_dev, blend_test))
        score5 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2) / 2.0)
        score6 = evaluation.roc_auc_truncated(Y_test, Y_test_predict3)
        score7 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3) / 3.0)
        score8 = evaluation.roc_auc_truncated(Y_test, Y_test_predict4)
        score9 = evaluation.roc_auc_truncated(
            Y_test,
            (Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 3.0)
        score10 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
                     Y_test_predict4) / 4.0)

        print('LR Score = %s' % (score))
        print('GB Score = %s' % (score2))
        print('MEAN Score = %s' % (score3))
        print('Scipy Score = %s' % (score4))
        print('LR + GB score = %s' % (score5))
        print('ANN Score= %s' % (score6))
        print('LR + GB + ANN Score = %s' % (score7))
        print('ADA Score = %s' % (score8))
        print('GB + ANN + ADA Score = %s' % (score9))
        print('LR + GB + ANN + ADA Score = %s' % (score10))
        return blend_train, Y_dev, blend_test, Y_test

    # average of all four blenders: LR, GBM, ANN and ADA.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
            Y_test_predict4) / 4.0