Ejemplo n.º 1
0
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss function
    """
    # Samples are correlated with the first variable; the booster is asked
    # to keep its response uniform along 'column0'.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    uniform_features = ['column0']

    # Plain losses followed by uniformity-aware ones (single label and both labels).
    loss_functions = [
        LogLossFunction(),
        AdaLossFunction(),
        CompositeLossFunction(),
        KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1),
        KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]),
        BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0),
        BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
        KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1),
        KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
    ]

    for loss in loss_functions:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20,
                                          max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25,
                                          train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)
Ejemplo n.º 2
0
def flatnessloss(X,y,test):
    """Train a flatness-constrained gradient boosting and return test predictions.

    X: pandas DataFrame containing a 'mass' column (excluded from training
       features but used by the flatness loss). y: labels. test: DataFrame
       of the same features to predict on.
    Returns a 1-D array: per-row signal probabilities for `test`, averaged
    over the 7 CV folds.
    """
    features = list(X.columns)
    features.remove('mass')
    # Penalize non-uniformity of the classifier response along 'mass'
    # for the uniform_label=0 class, binned into 15 bins.
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss, n_estimators=300, subsample=0.7, 
                                  max_depth=9, min_samples_leaf=8,
                                  learning_rate=0.1, train_features=features, random_state=11)

    # Shuffle rows before stratified splitting.
    # NOTE(review): DataFrame.ix is deprecated/removed in modern pandas;
    # .iloc would be required there -- confirm the pinned pandas version.
    arr = np.random.permutation(X.shape[0])    
    X = X.ix[arr,]
    y = y[arr]

    skf = cross_validation.StratifiedKFold(y,n_folds = 7)
    # Out-of-fold train predictions (filled below but never returned).
    blend_train = np.zeros(X.shape[0])
    prediction = []
    # One column of test-set predictions per fold.
    blend_test_j = np.zeros((test.shape[0], len(skf)))

    for i,(train_index,cv_index) in enumerate(skf):
            print "Fold:",i
            X_train = X.ix[train_index,]
            y_train = y[train_index]
            X_cv = X.ix[cv_index,]
            #y_cv = y[cv_index]
            clf.fit(X_train,y_train)

            blend_train[cv_index] = clf.predict_proba(X_cv)[:,1]
            blend_test_j[:,i] = clf.predict_proba(test)[:,1]
    # Average the per-fold test predictions column-wise.
    prediction = blend_test_j.mean(1)

    return prediction
Ejemplo n.º 3
0
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in the terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    # Class-1 events get weight 10001, class-0 events weight 1.
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss_function in (LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()):
        classifier = UGradientBoostingClassifier(
            loss=loss_function, min_samples_split=20, max_depth=5,
            learning_rate=.2, subsample=0.7, n_estimators=10, train_features=None)
        classifier.fit(trainX, trainY, sample_weight=trainW)
        proba = classifier.predict_proba(testX)
        auc = roc_auc_score(testY, proba[:, 1], sample_weight=testW)
        assert auc > 0.8, 'quality is too low'
Ejemplo n.º 4
0
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in the terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    # Make the weight of one class four orders of magnitude larger.
    trainW, testW = trainY * 10000 + 1, testY * 10000 + 1
    checked_losses = [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]
    for loss in checked_losses:
        boosting = UGradientBoostingClassifier(loss=loss, min_samples_split=20,
                                               max_depth=5, learning_rate=.2,
                                               subsample=0.7, n_estimators=10,
                                               train_features=None)
        boosting.fit(trainX, trainY, sample_weight=trainW)
        predictions = boosting.predict_proba(testX)
        assert roc_auc_score(testY, predictions[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
Ejemplo n.º 5
0
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss function
    """
    # Samples correlated with the first variable; uniformity of the
    # response is requested along 'column0'.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    uniform_features = ['column0']

    classification_losses = [
        LogLossFunction(),
        AdaLossFunction(),
        losses.CompositeLossFunction(),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
    ]

    for loss in classification_losses:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20,
                                          max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25,
                                          train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)

    # Regression losses; RankBoost needs a query/request column, and
    # train_features drops the first column of trainX.
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    regression_losses = [
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.RankBoostLossFunction(request_column='fake_request'),
    ]
    for loss in regression_losses:
        print(loss)
        regressor = UGradientBoostingRegressor(loss=loss, max_depth=3,
                                               n_estimators=50, learning_rate=0.01,
                                               subsample=0.5,
                                               train_features=list(trainX.columns[1:]))
        regressor.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, regressor.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(
            roc_auc, loss)
Ejemplo n.º 6
0
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing with two main classification losses.
    Also testing copying
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss_function in (LogLossFunction(), AdaLossFunction()):
        clf = UGradientBoostingClassifier(loss=loss_function, min_samples_split=20,
                                          max_depth=5, learning_rate=.2, subsample=0.7,
                                          n_estimators=10, train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # Every staged probability matrix has the proper shape, and the
        # final stage must coincide with predict_proba.
        for stage_proba in clf.staged_predict_proba(testX):
            assert stage_proba.shape == (n_samples, 2)
        assert numpy.all(stage_proba == clf.predict_proba(testX))
        assert roc_auc_score(testY, stage_proba[:, 1]) > 0.8, 'quality is too low'
        # A fitted model must survive clone() and deepcopy() unchanged.
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        same = numpy.all(clf.predict_proba(trainX) == clf_copy.predict_proba(trainX))
        assert same, 'copied classifier is different'
Ejemplo n.º 7
0
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss function
    """
    # Samples correlated with the first variable; the classifier response
    # should stay flat along 'column0'.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    uniform_features = ['column0']

    all_losses = [
        LogLossFunction(),
        AdaLossFunction(),
        CompositeLossFunction(),
        KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1),
        KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1]),
        BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0),
        BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
        KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1),
        KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1]),
    ]

    for loss in all_losses:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20,
                                          max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25,
                                          train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)
Ejemplo n.º 8
0
def test_gradient_boosting(n_samples=1000, distance=0.6):
    """
    Testing workability of GradientBoosting with different loss function
    """
    # Samples correlated with the first variable; flatness is requested
    # along 'column0'.
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    uniform_features = ['column0']

    classification_losses = [
        LogLossFunction(),
        AdaLossFunction(),
        losses.CompositeLossFunction(),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, knn=5, uniform_label=1),
        losses.KnnAdaLossFunction(uniform_features=uniform_features, knn=5, uniform_label=[0, 1]),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=0),
        losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=[0, 1]),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=1),
        losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=[0, 1]),
    ]
    for loss in classification_losses:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20,
                                          max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25,
                                          train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    # Regression losses; RankBoost requires a query/request column, and
    # train_features drops the first column of trainX.
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in (losses.MSELossFunction(),
                 losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')):
        print(loss)
        regressor = UGradientBoostingRegressor(loss=loss, max_depth=3,
                                               n_estimators=50, learning_rate=0.01,
                                               subsample=0.5,
                                               train_features=list(trainX.columns[1:]))
        regressor.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, regressor.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
Ejemplo n.º 9
0
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """Check UGradientBoostingClassifier with log-loss and Ada-loss:
    fit, staged probabilities, quality, and clone/deepcopy behavior."""
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss_function in (LogLossFunction(), AdaLossFunction()):
        clf = UGradientBoostingClassifier(loss=loss_function, min_samples_split=20,
                                          max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10,
                                          train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # Staged probabilities keep the proper shape; the last stage must
        # agree with predict_proba exactly.
        for staged in clf.staged_predict_proba(testX):
            assert staged.shape == (n_samples, 2)
        assert numpy.all(staged == clf.predict_proba(testX))
        assert roc_auc_score(testY, staged[:, 1]) > 0.8, 'quality is too low'
        # The model must survive clone() and a deep copy.
        _ = clone(clf)
        duplicate = copy.deepcopy(clf)
        assert (clf.predict_proba(trainX) == duplicate.predict_proba(trainX)).all(), 'copied classifier is different'
Ejemplo n.º 10
0
def flatnessloss(X, y, test):
    """Train a flatness-constrained gradient boosting and return test predictions.

    X: pandas DataFrame containing a 'mass' column (removed from the
    training features but consumed by the flatness loss). y: labels.
    test: DataFrame of the same features. Returns a 1-D array of signal
    probabilities for `test`, averaged over the 7 CV folds.
    """
    features = list(X.columns)
    features.remove('mass')
    # Penalize non-uniform response along 'mass' for the uniform_label=0
    # class, using 15 bins.
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss,
                                      n_estimators=300,
                                      subsample=0.7,
                                      max_depth=9,
                                      min_samples_leaf=8,
                                      learning_rate=0.1,
                                      train_features=features,
                                      random_state=11)

    # Shuffle rows before the stratified split.
    # NOTE(review): DataFrame.ix is deprecated/removed in modern pandas;
    # .iloc would be required here -- confirm the pinned pandas version.
    arr = np.random.permutation(X.shape[0])
    X = X.ix[arr, ]
    y = y[arr]

    skf = cross_validation.StratifiedKFold(y, n_folds=7)
    # Out-of-fold train predictions (filled below but never returned).
    blend_train = np.zeros(X.shape[0])
    prediction = []
    # One column of test-set predictions per fold.
    blend_test_j = np.zeros((test.shape[0], len(skf)))

    for i, (train_index, cv_index) in enumerate(skf):
        print "Fold:", i
        X_train = X.ix[train_index, ]
        y_train = y[train_index]
        X_cv = X.ix[cv_index, ]
        #y_cv = y[cv_index]
        clf.fit(X_train, y_train)

        blend_train[cv_index] = clf.predict_proba(X_cv)[:, 1]
        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]
    # Average the per-fold test predictions column-wise.
    prediction = blend_test_j.mean(1)

    return prediction
# 5-fold stratified split on the 'signal' label. Not consumed below: the
# CalibratedClassifierCV line that would use it is commented out.
skf = StratifiedKFold(train['signal'],
                      n_folds=5,
                      indices=None,
                      shuffle=True,
                      random_state=42)

print("Eliminate SPDhits, which makes the agreement check fail")
# Keep columns 1..-6 as features; presumably drops id/label/service
# columns at the edges -- verify against the actual CSV layout.
features = list(train.columns[1:-5])
print("Train a UGradientBoostingClassifier")
# Flatness loss keeps the response uniform along 'mass' for label 0.
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(
    loss=loss,
    n_estimators=150,
    subsample=0.1,  # n_estimators = 75
    max_depth=7,
    min_samples_leaf=10,
    learning_rate=0.1,
    train_features=features,
    random_state=11)

# clf = CalibratedClassifierCV(clf, method='isotonic', cv = skf)

# 'mass' must be present at fit time for the flatness loss even though it
# is not in train_features.
clf.fit(train[features + ['mass']], train['signal'])

# Signal probabilities for the test set, dumped to CSV for later blending.
fb_preds = clf.predict_proba(test[features])[:, 1]
print 'saving fb'
temp = pd.DataFrame({'id': test['id'], 'prediction': fb_preds})
temp.to_csv('parts/fb.csv', index=False)

print("Train a Random Forest model")
Ejemplo n.º 12
0
# Earlier RandomForest configurations kept for reference (commented out).
#rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy", max_depth=6, random_state=1)
#rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy", 
#      oob_score = True, class_weight = "subsample",
#max_depth=10, max_features=6, min_samples_leaf=2, random_state=1)
# Active RandomForest: 600 fully-grown entropy trees on 4 workers.
rf1 = RandomForestClassifier(n_estimators=600, n_jobs=4, criterion="entropy", 
      #oob_score = True, class_weight = "subsample",
      max_depth=None, max_features=9, min_samples_leaf=2, 
      min_samples_split=2, random_state=1)
rf1.fit(train[features], train["signal"])
#rf = ensemble.AdaBoostClassifier(n_estimators=50, learning_rate=0.098643,base_estimator=rf1)
#uniform_features  = ["mass"]
print("Train a UGradientBoostingClassifier")
# Flatness loss keeps the response uniform along 'mass' for label 0.
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
#loss = KnnFlatnessLossFunction(uniform_features, uniform_label=0)
rf = UGradientBoostingClassifier(loss=loss, n_estimators=500,  
                                  max_depth=6,
                                  #max_depth=7, min_samples_leaf=10,
                                  learning_rate=0.15, train_features=features, subsample=0.7, random_state=369)
# 'mass' must be passed at fit time for the flatness loss even though it
# is not among train_features.
rf.fit(train[features + ['mass']], train['signal'])

#loss_funct=LogLossFunction()
#rf=UGradientBoostingClassifier(loss=loss_funct,n_estimators=200, random_state=3,learning_rate=0.2,subsample=0.7)
#rf.fit(train[features],train["signal"])

print("Train a XGBoost model")
# XGBoost parameter dict kept for reference (commented out).
#params = {"objective": "binary:logistic",
#          "learning_rate": 0.2,
#          "max_depth": 6,
#          "min_child_weight": 3,
#          "silent": 1,
#          "subsample": 0.7,
#          "colsample_bytree": 0.7,
Ejemplo n.º 13
0
    rf1 = RandomForestClassifier(n_estimators=500,
                                 n_jobs=-1,
                                 criterion="entropy",
                                 max_depth=10,
                                 max_features=6,
                                 min_samples_leaf=2)

    rf1.fit(train[features], train["signal"])
    print("Train a UGradientBoostingClassifier")
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)

    rf = UGradientBoostingClassifier(loss=loss,
                                     n_estimators=200,
                                     max_depth=6,
                                     learning_rate=0.15,
                                     train_features=features,
                                     subsample=0.7,
                                     random_state=369)
    rf.fit(train[features + ['mass']], train['signal'])

    print("Train a XGBoost model")
    params = {
        "objective": "binary:logistic",
        "learning_rate": 0.2,
        "max_depth": 6,
        "min_child_weight": 3,
        "silent": 1,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "seed": 1
# Scale X and take the keras model's class-1 probabilities.
X, scaler = preprocess_data(X, scaler)
predskeras = model.predict(X, batch_size=256)[:, 1]

print("Load the training/test data using pandas")
train = pd.read_csv("../DATA/training.csv")
test  = pd.read_csv("../DATA/test.csv")

# Project-specific feature engineering (defined elsewhere in the project).
train = mypreprocessing(train)
test = mypreprocessing(test)

print("Eliminate SPDhits, which makes the agreement check fail")
# Keep columns 1..-6 as features; presumably drops id/label/service
# columns at the edges -- verify against the actual CSV layout.
features = list(train.columns[1:-5])
print("Train a UGradientBoostingClassifier")
# Flatness loss keeps the response uniform along 'mass' for label 0.
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=50, subsample=0.1, 
                                  max_depth=6, min_samples_leaf=10,
                                  learning_rate=0.1, train_features=features, random_state=11)
# 'mass' must be present at fit time for the flatness loss even though it
# is not in train_features.
clf.fit(train[features + ['mass']], train['signal'])
fb_preds = clf.predict_proba(test[features])[:,1]
print("Train a Random Forest model")
rf = RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion="entropy", random_state=1)
rf.fit(train[features], train["signal"])

print("Train a XGBoost model")
params = {"objective": "binary:logistic",
          "eta": 0.2,
          "max_depth": 6,
          "min_child_weight": 1,
          "silent": 1,
          "colsample_bytree": 0.8,
          "seed": 1}
Ejemplo n.º 15
0
X_train = train[features]
y_train = train['signal']

X_val = test[features]
y_val = test['signal']

# Flatness loss along 'mass' for label 0 with a strong flatness weight
# (fl_coefficient=15) and quadratic penalty (power=2).
loss = BinFlatnessLossFunction(['mass'],
                               n_bins=15,
                               uniform_label=0,
                               fl_coefficient=15,
                               power=2)
ugbc = UGradientBoostingClassifier(loss=loss,
                                   n_estimators=550,
                                   max_depth=6,
                                   learning_rate=0.15,
                                   train_features=features,
                                   subsample=0.7,
                                   random_state=123)
# 'mass' must be present at fit time for the flatness loss even though it
# is not among train_features.
ugbc.fit(train[features + ['mass']], train['signal'])
# Hard class predictions (0/1), not probabilities.
pred_raw = ugbc.predict(test[features])
#print(pred_raw)
pred = pd.DataFrame(data={'signal': pred_raw})
#print(pred.head(5))
#accuracy_fn(pred,y_val)
#print(pred_raw.sum())
# Count of true positives: predicted 1 where the label is 1.
print(((pred_raw == y_val) & y_val).sum())
print(y_val.sum())
#print(pred['signal'].sum())
#print((pred['signal']==test['signal']).sum())
# print 'saving keras'
# temp = pd.DataFrame({'id': test['id'], 'prediction': predskeras})
# temp.to_csv('parts/keras.csv', index=False)


skf = StratifiedKFold(train["signal"], n_folds=5, indices=None, shuffle=True, random_state=42)

print ("Eliminate SPDhits, which makes the agreement check fail")
features = list(train.columns[1:-5])
print ("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(["mass"], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(
    loss=loss,
    n_estimators=150,
    subsample=0.1,  # n_estimators = 75
    max_depth=7,
    min_samples_leaf=10,
    learning_rate=0.1,
    train_features=features,
    random_state=11,
)


# clf = CalibratedClassifierCV(clf, method='isotonic', cv = skf)

clf.fit(train[features + ["mass"]], train["signal"])

fb_preds = clf.predict_proba(test[features])[:, 1]
print "saving fb"
temp = pd.DataFrame({"id": test["id"], "prediction": fb_preds})
temp.to_csv("parts/fb.csv", index=False)
Ejemplo n.º 17
0
print("Train a Random Fores and gradient boos model model")
# The triple-quoted block below is dead code kept as a string literal.
"""
gd = GradientBoostingClassifier(n_estimators=100, random_state=5,learning_rate=0.25123,subsample=0.7,max_features=34)


rf = RandomForestClassifier(n_estimators=100,random_state=5)
ada= AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100,random_state=5),
                        n_estimators=600, random_state=5,learning_rate=0.2)
ada.fit(train[features],train["signal"])
rf.fit(train[features],train["signal"])

"""
print("train a UBoost classifier")
# Flatness loss keeps the response uniform along 'mass' for label 0.
loss_funct=BinFlatnessLossFunction(uniform_features=["mass"],uniform_label=0,n_bins=10)
ub=UGradientBoostingClassifier(loss=loss_funct,n_estimators=100, random_state=3,learning_rate=0.2,subsample=0.7)
ub.fit(train[features],train["signal"])

print("train a Gradientboost classifier")
gb=GradientBoostingClassifier(n_estimators=120, random_state=3,learning_rate=0.2,subsample=0.7,max_features=34)
# Note: gb is trained on features[0:-1] while ub uses all features.
gb.fit(train[features[0:-1]],train["signal"])

print("loading aggrement data")
check_agreement = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_agreement.csv', index_col='id')

print("calculating agreement probs")
# 50/50 blend of the two classifiers' signal probabilities.
agreement_probs = 0.5*ub.predict_proba(check_agreement[features[0:-1]])[:, 1]+0.5*gb.predict_proba(check_agreement[features[0:-1]])[:, 1] 

# KS statistic between the blended scores of the two agreement classes
# (call is truncated in this view).
ks = evaluation.compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
Ejemplo n.º 18
0
  # Shuffle the training rows before fitting.
  train = shuffle(train)
  print("Train a Random Forest model")

  rf1 = RandomForestClassifier(n_estimators=500, 
    n_jobs=-1, 
    criterion="entropy", 
    max_depth=10, 
    max_features=6, 
    min_samples_leaf=2)

  rf1.fit(train[features], train["signal"])
  print("Train a UGradientBoostingClassifier")
  # Flatness loss keeps the response uniform along 'mass' for label 0.
  loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)

  rf = UGradientBoostingClassifier(loss=loss, n_estimators=200,  
                                    max_depth=6,
                                    learning_rate=0.15, train_features=features, subsample=0.7, random_state=369)
  # 'mass' must be present at fit time for the flatness loss even though
  # it is not among train_features.
  rf.fit(train[features + ['mass']], train['signal'])

  print("Train a XGBoost model")
  params = {"objective": "binary:logistic",
            "learning_rate": 0.2,
            "max_depth": 6,
            "min_child_weight": 3,
            "silent": 1,
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "seed": 1}
            
  num_trees=400
Ejemplo n.º 19
0
def stacked_models(train, features, test, in_sample=True):
    """
    Build stacked generalization models, set in_sample to False
    to predict on test set.

    in_sample=True: split `train` 75/25, fit base models on the first
    part and score several blenders on the held-out part (rows with
    min_ANNmuon > 0.4 only); returns (blend_train, Y_dev, blend_test, Y_test).
    in_sample=False: fit on all of `train` and return blended test-set
    probabilities (average of LR is excluded -- see final return).
    """

    if in_sample:

        # Deterministic shuffle of the training rows.
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)

        train = train.iloc[new_indices].reset_index(drop=True).copy()

        # not used in CV testing..
        del test

        # 75% development split, 25% held-out evaluation split.
        cutoff = int(new_indices.shape[0] * 0.75)

        X_dev = train[:cutoff].reset_index(drop=True).copy()
        Y_dev = train[:cutoff]['signal'].reset_index(drop=True).copy()

        # Evaluation restricted to rows with min_ANNmuon > 0.4.
        X_test = train[cutoff:][
            train[cutoff:]['min_ANNmuon'] > 0.4].reset_index(drop=True).copy()
        Y_test = train[cutoff:][
            train[cutoff:]['min_ANNmuon'] > 0.4]['signal'].reset_index(
                drop=True).copy()

    else:
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)

        train = train.iloc[new_indices].reset_index(drop=True).copy()

        X_dev = train.reset_index(drop=True).copy()
        Y_dev = train['signal'].reset_index(drop=True).copy()

        X_test = test.reset_index(drop=True).copy()
        Y_test = None

    n_folds = 5

    # put ur parameter tuned CLFs in this list.

    clfs = [
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1),
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1,
                               max_depth=6),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1,
                             max_depth=6),
        Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
        # Uniform gradient boostings: flatness along 'mass' (binned and
        # kNN variants), two subsample/estimator settings each.
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        XGBoostClassifier(eval_metric='auc',
                          objective='binary:logistic',
                          num_class=2,
                          nthread=4,
                          silent=1,
                          colsample_bytree=0.6,
                          eta=0.005,
                          max_depth=6,
                          min_child_weight=13,
                          seed=1337,
                          subsample=0.7),
        NN1(len(features)),
        NN2(len(features)),
        NN3(len(features)),
        NN4(len(features))
    ]

    skf = list(StratifiedKFold(Y_dev, n_folds))

    # Number of training data x Number of classifiers
    blend_train = np.zeros((X_dev.shape[0], len(clfs)))
    # Number of testing data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs)))

    print 'X_test.shape = %s' % (str(X_test.shape))
    print 'blend_train.shape = %s' % (str(blend_train.shape))
    print 'blend_test.shape = %s' % (str(blend_test.shape))

    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print 'Training classifier [%s]' % (j)
        # Number of testing data x Number of folds , we will take the mean of
        # the predictions later
        blend_test_j = np.zeros((X_test.shape[0], len(skf)))
        for i, (train_index, cv_index) in enumerate(skf):
            print 'Fold [%s]' % (i)

            # This is the training and validation set
            X_train = X_dev.iloc[train_index].copy()
            Y_train = Y_dev.iloc[train_index].copy()
            X_cv = X_dev.iloc[cv_index].copy()
            Y_cv = Y_dev.iloc[cv_index].copy()

            # handle the case of hep.ml stuff
            # NOTE(review): isinstance() would be the idiomatic check here.
            if type(clf) == type(UGradientBoostingClassifier()):
                clf.fit(X_train[features + ['mass']],
                        Y_train.values.astype(np.int32))
            else:
                clf.fit(X_train[features], Y_train.values.astype(np.int32))

            # This output will be the basis for our blended classifier to train against,
            # which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv[features])[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test[features])[:, 1]
        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)

    print 'Y_dev.shape = %s' % (Y_dev.shape)

    # blend with LR...
    bclf = LogisticRegression()
    bclf.fit(blend_train, Y_dev)

    # Second-level gradient boosting blender.
    bclf2 = GradientBoostingClassifier(n_estimators=150,
                                       learning_rate=0.02,
                                       max_depth=4,
                                       subsample=0.9,
                                       verbose=3,
                                       random_state=1337)
    bclf2.fit(blend_train, Y_dev)

    # Third-level blender: a single-hidden-layer softmax network.
    bclf3 = NeuralNet(
        layers=[('input', layers.InputLayer), ('hidden', layers.DenseLayer),
                ('output', layers.DenseLayer)],

        # layer parameters:
        input_shape=(None, blend_train.shape[1]),
        hidden_num_units=blend_train.shape[1],
        output_nonlinearity=nonlinearities.
        softmax,  # output layer uses identity function
        output_num_units=2,  # 2 target values

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        regression=
        False,  # flag to indicate we're dealing with regression problem
        max_epochs=53,  # TRY 50 and 46 epochs!
        verbose=1,
        eval_size=0.10)

    bclf3.fit(blend_train.astype(np.float32), Y_dev.astype(np.int32))

    # Fourth-level blender: AdaBoost over the meta-features.
    bclf4 = AdaBoostClassifier(n_estimators=400, random_state=88)
    bclf4.fit(blend_train, Y_dev)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    Y_test_predict2 = bclf2.predict_proba(blend_test)[:, 1]
    Y_test_predict3 = bclf3.predict_proba(blend_test.astype(np.float32))[:, 1]
    Y_test_predict4 = bclf4.predict_proba(blend_test)[:, 1]

    print 'Logit Coefs:', bclf.coef_
    if in_sample:
        # Score every blender and several averages on the held-out split.
        score = evaluation.roc_auc_truncated(Y_test, Y_test_predict)
        score2 = evaluation.roc_auc_truncated(Y_test, Y_test_predict2)
        score3 = evaluation.roc_auc_truncated(Y_test, blend_test.mean(1))
        score4 = evaluation.roc_auc_truncated(
            Y_test, scipy_opt(blend_train, Y_dev, blend_test))
        score5 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2) / 2.0)
        score6 = evaluation.roc_auc_truncated(Y_test, Y_test_predict3)
        score7 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3) / 3.0)
        score8 = evaluation.roc_auc_truncated(Y_test, Y_test_predict4)
        score9 = evaluation.roc_auc_truncated(
            Y_test,
            (Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 3.0)
        score10 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
                     Y_test_predict4) / 4.0)

        print 'LR Score = %s' % (score)
        print 'GB Score = %s' % (score2)
        print 'MEAN Score = %s' % (score3)
        print 'Scipy Score = %s' % (score4)
        print 'LR + GB score = %s' % (score5)
        print 'ANN Score= %s' % (score6)
        print 'LR + GB + ANN Score = %s' % (score7)
        print 'ADA Score = %s' % (score8)
        print 'GB + ANN + ADA Score = %s' % (score9)
        print 'LR + GB + ANN + ADA Score = %s' % (score10)
        return blend_train, Y_dev, blend_test, Y_test

    # average of ADA, ANN and GBM.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
            Y_test_predict4) / 4.0
def Model1():
    """Ensemble of XGBoost, Random Forest and Uniform Gradient Boosting
    classifiers trained on stacked (meta-feature) data for model 1.

    Returns a 1-d array of blended signal probabilities for the test set.
    """

    model = 1    # feature-engineering variant passed to utils.LoadData
    n_folds = 3  # CV folds used when generating out-of-fold meta-features
    n_stack = 15  # number of first-level models in the stack

    # Load data and obtain the list of features used for estimation.
    train, test, features = utils.LoadData(model)

    # --- First-level models for stacking ---------------------------------
    # Seven KNN classifiers over a doubling grid of neighbourhood sizes.
    stackers = [
        KNeighborsClassifier(n_neighbors=k, weights='uniform',
                             algorithm='auto', leaf_size=30, p=2,
                             metric='minkowski', metric_params=None)
        for k in (5, 10, 20, 40, 80, 160, 320)
    ]

    stackers.append(LogisticRegression(
        penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=101,
        solver='lbfgs', max_iter=200, multi_class='ovr', verbose=0))

    stackers.append(GaussianNB())

    stackers.append(SVC(
        C=5.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.008,
        shrinking=True, probability=True, tol=0.001, cache_size=200,
        class_weight=None, verbose=False, max_iter=-1, random_state=101))

    stackers.append(RandomForestClassifier(
        n_estimators=250, criterion='gini', max_depth=6,
        min_samples_split=2, min_samples_leaf=5,
        min_weight_fraction_leaf=0.0, max_features=0.7,
        max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
        random_state=101, verbose=0, warm_start=False, class_weight=None))

    stackers.append(ExtraTreesClassifier(
        n_estimators=250, criterion='gini', max_depth=6,
        min_samples_split=2, min_samples_leaf=5,
        min_weight_fraction_leaf=0.0, max_features=0.7,
        max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=2,
        random_state=101, verbose=0, warm_start=False, class_weight=None))

    stackers.append(GradientBoostingClassifier(
        loss='deviance', learning_rate=0.2, n_estimators=450,
        subsample=0.7, min_samples_split=2, min_samples_leaf=5,
        min_weight_fraction_leaf=0.0, max_depth=6, init=None,
        random_state=101, max_features=None, verbose=0,
        max_leaf_nodes=None, warm_start=False))

    stackers.append(SGDClassifier(
        loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15,
        fit_intercept=True, n_iter=10, shuffle=True, verbose=0,
        epsilon=0.1, n_jobs=2, random_state=101,
        learning_rate='optimal', eta0=0.0, power_t=0.5,
        class_weight=None, warm_start=False, average=False))

    stackers.append(models.XGBoostClassifier(
        nthread=2, eta=.2, gamma=0, max_depth=6, min_child_weight=3,
        max_delta_step=0, subsample=0.7, colsample_bytree=0.7, silent=1,
        seed=101, l2_reg=1, l1_reg=0, n_estimators=450))

    # Build the stacked datasets (out-of-fold class probabilities).
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values,
        stackers, n_folds)

    # --- Data for uniform boosting: original + meta-features -------------
    # NOTE(review): the column labels carry a trailing space ('p0 ', 'p1 ',
    # ...) exactly as in the original code; kept byte-for-byte.
    columns = ['p%s ' % i for i in range(n_stack)]
    meta_train = pd.DataFrame(
        {name: train_probs[:, j] for j, name in enumerate(columns)})
    meta_test = pd.DataFrame(
        {name: test_probs[:, j] for j, name in enumerate(columns)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # Features used for UGB training: original features + meta-features.
    features_ugb = features + columns

    # --- Second-level models for the ensemble ----------------------------
    flatness = BinFlatnessLossFunction(['mass'], n_bins=20, power=1,
                                       fl_coefficient=3, uniform_label=0)

    clf_ugb = UGradientBoostingClassifier(
        loss=flatness, n_estimators=275, max_depth=11,
        min_samples_leaf=3, learning_rate=0.03,
        train_features=features_ugb, subsample=0.85, random_state=101)

    clf_xgb = models.XGBoostClassifier(
        nthread=6, eta=.0225, gamma=1.225, max_depth=11,
        min_child_weight=10, max_delta_step=0, subsample=0.8,
        colsample_bytree=0.3, silent=1, seed=101, l2_reg=1, l1_reg=0,
        n_estimators=1100)

    clf_rf = RandomForestClassifier(
        n_estimators=375, criterion='gini', max_depth=10,
        min_samples_split=6, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=0.6,
        max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
        random_state=101, verbose=0, warm_start=False, class_weight=None)

    # --- Train models and predict -----------------------------------------
    print("Training a Uniform Gradient Boosting model")
    # 'mass' is appended so the flatness loss can bin on it; it is not in
    # train_features, so the trees do not split on it.
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    # NOTE(review): no [:, 1] here, unlike the other models -- presumably
    # models.XGBoostClassifier.predict_proba returns a 1-d array; confirm.
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # Ensemble: geometric mix of XGB/RF blended linearly with UGB.
    return 0.3 * (preds_xgb ** 0.65) * (preds_rf ** 0.35) + 0.7 * preds_ugb
def Model1():
    """Ensemble of XGBoost, Random Forest and Uniform Gradient Boosting
    classifiers trained on stacked (meta-feature) data for model 1.

    Returns a 1-d array of blended signal probabilities for the test set.
    """

    # NOTE(review): this is a (reformatted) duplicate of the Model1 defined
    # earlier in this module; being defined later, it shadows that earlier
    # definition. One of the two copies should probably be removed.

    # Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient Boosting Classifiers
    # which are trained using the stacked data

    model = 1  # set the model number for feature engineering
    n_folds = 3  # set the number of folders for generating meta-features
    n_stack = 15  # number of models used for stacking

    train, test, features = utils.LoadData(
        model)  # load data and obtain the list of features for estimation

    # Initialize models for stacking

    # Seven KNN classifiers over a doubling grid of neighbourhood sizes
    # (5 ... 320); all other KNN parameters are sklearn defaults.
    clf1 = KNeighborsClassifier(n_neighbors=5,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf2 = KNeighborsClassifier(n_neighbors=10,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf3 = KNeighborsClassifier(n_neighbors=20,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf4 = KNeighborsClassifier(n_neighbors=40,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf5 = KNeighborsClassifier(n_neighbors=80,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf6 = KNeighborsClassifier(n_neighbors=160,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf7 = KNeighborsClassifier(n_neighbors=320,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None)

    clf8 = LogisticRegression(penalty='l2',
                              dual=False,
                              tol=0.0001,
                              C=5.0,
                              fit_intercept=True,
                              intercept_scaling=1,
                              class_weight=None,
                              random_state=101,
                              solver='lbfgs',
                              max_iter=200,
                              multi_class='ovr',
                              verbose=0)

    clf9 = GaussianNB()

    clf10 = SVC(C=5.0,
                kernel='rbf',
                degree=3,
                gamma=0.0,
                coef0=0.008,
                shrinking=True,
                probability=True,
                tol=0.001,
                cache_size=200,
                class_weight=None,
                verbose=False,
                max_iter=-1,
                random_state=101)

    clf11 = RandomForestClassifier(n_estimators=250,
                                   criterion='gini',
                                   max_depth=6,
                                   min_samples_split=2,
                                   min_samples_leaf=5,
                                   min_weight_fraction_leaf=0.0,
                                   max_features=0.7,
                                   max_leaf_nodes=None,
                                   bootstrap=False,
                                   oob_score=False,
                                   n_jobs=2,
                                   random_state=101,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight=None)

    clf12 = ExtraTreesClassifier(n_estimators=250,
                                 criterion='gini',
                                 max_depth=6,
                                 min_samples_split=2,
                                 min_samples_leaf=5,
                                 min_weight_fraction_leaf=0.0,
                                 max_features=0.7,
                                 max_leaf_nodes=None,
                                 bootstrap=False,
                                 oob_score=False,
                                 n_jobs=2,
                                 random_state=101,
                                 verbose=0,
                                 warm_start=False,
                                 class_weight=None)

    clf13 = GradientBoostingClassifier(loss='deviance',
                                       learning_rate=0.2,
                                       n_estimators=450,
                                       subsample=0.7,
                                       min_samples_split=2,
                                       min_samples_leaf=5,
                                       min_weight_fraction_leaf=0.0,
                                       max_depth=6,
                                       init=None,
                                       random_state=101,
                                       max_features=None,
                                       verbose=0,
                                       max_leaf_nodes=None,
                                       warm_start=False)

    clf14 = SGDClassifier(loss='log',
                          penalty='l2',
                          alpha=0.0001,
                          l1_ratio=0.15,
                          fit_intercept=True,
                          n_iter=10,
                          shuffle=True,
                          verbose=0,
                          epsilon=0.1,
                          n_jobs=2,
                          random_state=101,
                          learning_rate='optimal',
                          eta0=0.0,
                          power_t=0.5,
                          class_weight=None,
                          warm_start=False,
                          average=False)

    clf15 = models.XGBoostClassifier(nthread=2,
                                     eta=.2,
                                     gamma=0,
                                     max_depth=6,
                                     min_child_weight=3,
                                     max_delta_step=0,
                                     subsample=0.7,
                                     colsample_bytree=0.7,
                                     silent=1,
                                     seed=101,
                                     l2_reg=1,
                                     l1_reg=0,
                                     n_estimators=450)

    clfs = [
        clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, clf11,
        clf12, clf13, clf14, clf15
    ]

    # Construct stacked datasets
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values, clfs, n_folds)

    # Construct data for uniform boosting
    # NOTE(review): the column labels carry a trailing space ('p0 ', 'p1 ',
    # ...); downstream code uses the same labels, so this is consistent,
    # but it looks accidental -- confirm before renaming.
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    meta_train = pd.DataFrame(
        {columns[i]: train_probs[:, i]
         for i in range(0, n_stack)})
    meta_test = pd.DataFrame(
        {columns[i]: test_probs[:, i]
         for i in range(0, n_stack)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    features_ugb = features + columns  # features used for UGB training (original features + meta-features)

    # Initialize models for ensemble
    loss = BinFlatnessLossFunction(['mass'],
                                   n_bins=20,
                                   power=1,
                                   fl_coefficient=3,
                                   uniform_label=0)

    clf_ugb = UGradientBoostingClassifier(loss=loss,
                                          n_estimators=275,
                                          max_depth=11,
                                          min_samples_leaf=3,
                                          learning_rate=0.03,
                                          train_features=features_ugb,
                                          subsample=0.85,
                                          random_state=101)

    clf_xgb = models.XGBoostClassifier(nthread=6,
                                       eta=.0225,
                                       gamma=1.225,
                                       max_depth=11,
                                       min_child_weight=10,
                                       max_delta_step=0,
                                       subsample=0.8,
                                       colsample_bytree=0.3,
                                       silent=1,
                                       seed=101,
                                       l2_reg=1,
                                       l1_reg=0,
                                       n_estimators=1100)

    clf_rf = RandomForestClassifier(n_estimators=375,
                                    criterion='gini',
                                    max_depth=10,
                                    min_samples_split=6,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=0.6,
                                    max_leaf_nodes=None,
                                    bootstrap=True,
                                    oob_score=False,
                                    n_jobs=4,
                                    random_state=101,
                                    verbose=0,
                                    warm_start=False,
                                    class_weight=None)

    # Train models
    print("Training a Uniform Gradient Boosting model")
    # 'mass' is appended so the flatness loss can bin on it; it is not in
    # train_features, so the trees themselves do not split on it.
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    # NOTE(review): no [:, 1] here, unlike the other models -- presumably
    # models.XGBoostClassifier.predict_proba returns a 1-d array; confirm.
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # Compute ensemble predictions
    # Geometric mix of XGB/RF blended linearly with the UGB prediction.
    preds = 0.3 * (preds_xgb**(0.65)) * (preds_rf**(0.35)) + 0.7 * preds_ugb

    return preds