def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions.
    """
    # Generating some samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

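# None of the hep_ml test snippets in this collection show their imports; a minimal header
# they appear to assume (generate_sample comes from hep_ml.commonutils) would look roughly
# like this:
import copy

import numpy
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

from hep_ml import losses
from hep_ml.commonutils import generate_sample
from hep_ml.gradientboosting import UGradientBoostingClassifier, UGradientBoostingRegressor
from hep_ml.losses import (AdaLossFunction, BinFlatnessLossFunction, CompositeLossFunction,
                           KnnAdaLossFunction, KnnFlatnessLossFunction, LogLossFunction)
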
def flatnessloss(X, y, test):
    features = list(X.columns)
    features.remove('mass')
    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss, n_estimators=300, subsample=0.7, max_depth=9,
                                      min_samples_leaf=8, learning_rate=0.1,
                                      train_features=features, random_state=11)
    # shuffle the rows before building the out-of-fold predictions
    arr = np.random.permutation(X.shape[0])
    X = X.ix[arr, ]  # pandas .ix is a legacy (pre-0.20) indexer
    y = y[arr]
    skf = cross_validation.StratifiedKFold(y, n_folds=7)  # legacy sklearn.cross_validation API
    blend_train = np.zeros(X.shape[0])
    prediction = []
    blend_test_j = np.zeros((test.shape[0], len(skf)))
    for i, (train_index, cv_index) in enumerate(skf):
        print("Fold:", i)
        X_train = X.ix[train_index, ]
        y_train = y[train_index]
        X_cv = X.ix[cv_index, ]
        # y_cv = y[cv_index]
        clf.fit(X_train, y_train)
        blend_train[cv_index] = clf.predict_proba(X_cv)[:, 1]
        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]
    prediction = blend_test_j.mean(1)
    return prediction

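# A hypothetical call of flatnessloss(), assuming `training` and `testing` are DataFrames that
# contain the 'mass' column plus the training features (these names are illustrative only,
# they are not part of the original script):
# test_predictions = flatnessloss(training.drop('signal', axis=1), training['signal'].values, testing)
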
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'

def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions.
    """
    # Generating some samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    # regression losses are checked on the same data, with a fake request column for the ranking loss
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(), losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01,
                                         subsample=0.5, train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)

def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing with two main classification losses. Also testing copying
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # checking that predict proba works
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # checking clonability
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(clf.predict_proba(trainX) == clf_copy.predict_proba(trainX)), 'copied classifier is different'

def test_gradient_boosting(n_samples=1000, distance=0.6):
    """
    Testing workability of GradientBoosting with different loss functions.
    """
    # Generating some samples correlated with the first variable
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, knn=5, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, knn=5, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=1., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(), losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01,
                                         subsample=0.5, train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)

skf = StratifiedKFold(train['signal'], n_folds=5, indices=None, shuffle=True, random_state=42)

print("Eliminate SPDhits, which makes the agreement check fail")
features = list(train.columns[1:-5])

print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=150, subsample=0.1,  # n_estimators = 75
                                  max_depth=7, min_samples_leaf=10, learning_rate=0.1,
                                  train_features=features, random_state=11)
# clf = CalibratedClassifierCV(clf, method='isotonic', cv=skf)
clf.fit(train[features + ['mass']], train['signal'])
fb_preds = clf.predict_proba(test[features])[:, 1]

print('saving fb')
temp = pd.DataFrame({'id': test['id'], 'prediction': fb_preds})
temp.to_csv('parts/fb.csv', index=False)

print("Train a Random Forest model")

# rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy", max_depth=6, random_state=1)
# rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy",
#                              oob_score=True, class_weight="subsample",
#                              max_depth=10, max_features=6, min_samples_leaf=2, random_state=1)
rf1 = RandomForestClassifier(n_estimators=600, n_jobs=4, criterion="entropy",
                             # oob_score=True, class_weight="subsample",
                             max_depth=None, max_features=9, min_samples_leaf=2,
                             min_samples_split=2, random_state=1)
rf1.fit(train[features], train["signal"])
# rf = ensemble.AdaBoostClassifier(n_estimators=50, learning_rate=0.098643, base_estimator=rf1)

# uniform_features = ["mass"]
print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
# loss = KnnFlatnessLossFunction(uniform_features, uniform_label=0)
rf = UGradientBoostingClassifier(loss=loss, n_estimators=500, max_depth=6,
                                 # max_depth=7, min_samples_leaf=10,
                                 learning_rate=0.15, train_features=features,
                                 subsample=0.7, random_state=369)
rf.fit(train[features + ['mass']], train['signal'])
# loss_funct = LogLossFunction()
# rf = UGradientBoostingClassifier(loss=loss_funct, n_estimators=200, random_state=3, learning_rate=0.2, subsample=0.7)
# rf.fit(train[features], train["signal"])

print("Train a XGBoost model")
# params = {"objective": "binary:logistic",
#           "learning_rate": 0.2,
#           "max_depth": 6,
#           "min_child_weight": 3,
#           "silent": 1,
#           "subsample": 0.7,
#           "colsample_bytree": 0.7,

rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy",
                             max_depth=10, max_features=6, min_samples_leaf=2)
rf1.fit(train[features], train["signal"])

print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
rf = UGradientBoostingClassifier(loss=loss, n_estimators=200, max_depth=6, learning_rate=0.15,
                                 train_features=features, subsample=0.7, random_state=369)
rf.fit(train[features + ['mass']], train['signal'])

print("Train a XGBoost model")
params = {"objective": "binary:logistic",
          "learning_rate": 0.2,
          "max_depth": 6,
          "min_child_weight": 3,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "seed": 1}

X, scaler = preprocess_data(X, scaler)
predskeras = model.predict(X, batch_size=256)[:, 1]

print("Load the training/test data using pandas")
train = pd.read_csv("../DATA/training.csv")
test = pd.read_csv("../DATA/test.csv")
train = mypreprocessing(train)
test = mypreprocessing(test)

print("Eliminate SPDhits, which makes the agreement check fail")
features = list(train.columns[1:-5])

print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=50, subsample=0.1, max_depth=6,
                                  min_samples_leaf=10, learning_rate=0.1,
                                  train_features=features, random_state=11)
clf.fit(train[features + ['mass']], train['signal'])
fb_preds = clf.predict_proba(test[features])[:, 1]

print("Train a Random Forest model")
rf = RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion="entropy", random_state=1)
rf.fit(train[features], train["signal"])

print("Train a XGBoost model")
params = {"objective": "binary:logistic",
          "eta": 0.2,
          "max_depth": 6,
          "min_child_weight": 1,
          "silent": 1,
          "colsample_bytree": 0.8,
          "seed": 1}

X_train = train[features]
y_train = train['signal']
X_val = test[features]
y_val = test['signal']

loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0, fl_coefficient=15, power=2)
ugbc = UGradientBoostingClassifier(loss=loss, n_estimators=550, max_depth=6, learning_rate=0.15,
                                   train_features=features, subsample=0.7, random_state=123)
ugbc.fit(train[features + ['mass']], train['signal'])

pred_raw = ugbc.predict(test[features])
# print(pred_raw)
pred = pd.DataFrame(data={'signal': pred_raw})
# print(pred.head(5))
# accuracy_fn(pred, y_val)
# print(pred_raw.sum())
print(((pred_raw == y_val) & y_val).sum())
print(y_val.sum())
# print(pred['signal'].sum())
# print((pred['signal'] == test['signal']).sum())

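# predict() above returns hard 0/1 labels; for a ranking-style check one would usually score
# the signal probability instead. A minimal sketch, assuming scikit-learn is available:
from sklearn.metrics import roc_auc_score

proba = ugbc.predict_proba(test[features])[:, 1]
print(roc_auc_score(y_val, proba))
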
# print 'saving keras'
# temp = pd.DataFrame({'id': test['id'], 'prediction': predskeras})
# temp.to_csv('parts/keras.csv', index=False)

skf = StratifiedKFold(train["signal"], n_folds=5, indices=None, shuffle=True, random_state=42)

print("Eliminate SPDhits, which makes the agreement check fail")
features = list(train.columns[1:-5])

print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(["mass"], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=150, subsample=0.1,  # n_estimators = 75
                                  max_depth=7, min_samples_leaf=10, learning_rate=0.1,
                                  train_features=features, random_state=11)
# clf = CalibratedClassifierCV(clf, method='isotonic', cv=skf)
clf.fit(train[features + ["mass"]], train["signal"])
fb_preds = clf.predict_proba(test[features])[:, 1]

print("saving fb")
temp = pd.DataFrame({"id": test["id"], "prediction": fb_preds})
temp.to_csv("parts/fb.csv", index=False)

print("Train a Random Fores and gradient boos model model") """ gd = GradientBoostingClassifier(n_estimators=100, random_state=5,learning_rate=0.25123,subsample=0.7,max_features=34) rf = RandomForestClassifier(n_estimators=100,random_state=5) ada= AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100,random_state=5), n_estimators=600, random_state=5,learning_rate=0.2) ada.fit(train[features],train["signal"]) rf.fit(train[features],train["signal"]) """ print("train a UBoost classifier") loss_funct=BinFlatnessLossFunction(uniform_features=["mass"],uniform_label=0,n_bins=10) ub=UGradientBoostingClassifier(loss=loss_funct,n_estimators=100, random_state=3,learning_rate=0.2,subsample=0.7) ub.fit(train[features],train["signal"]) print("train a Gradientboost classifier") gb=GradientBoostingClassifier(n_estimators=120, random_state=3,learning_rate=0.2,subsample=0.7,max_features=34) gb.fit(train[features[0:-1]],train["signal"]) print("loading aggrement data") check_agreement = pd.read_csv('C:/Users/sony/Downloads/Compressed/CERN/check_agreement.csv', index_col='id') print("calculating agreement probs") agreement_probs = 0.5*ub.predict_proba(check_agreement[features[0:-1]])[:, 1]+0.5*gb.predict_proba(check_agreement[features[0:-1]])[:, 1] ks = evaluation.compute_ks( agreement_probs[check_agreement['signal'].values == 0], agreement_probs[check_agreement['signal'].values == 1],
train = shuffle(train)

print("Train a Random Forest model")
rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy",
                             max_depth=10, max_features=6, min_samples_leaf=2)
rf1.fit(train[features], train["signal"])

print("Train a UGradientBoostingClassifier")
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
rf = UGradientBoostingClassifier(loss=loss, n_estimators=200, max_depth=6, learning_rate=0.15,
                                 train_features=features, subsample=0.7, random_state=369)
rf.fit(train[features + ['mass']], train['signal'])

print("Train a XGBoost model")
params = {"objective": "binary:logistic",
          "learning_rate": 0.2,
          "max_depth": 6,
          "min_child_weight": 3,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "seed": 1}
num_trees = 400

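# num_trees above is presumably passed to xgboost's learning API as the number of boosting
# rounds; a sketch of that step (the DMatrix construction is an assumption, it is not shown
# in the original snippet):
import xgboost as xgb

dtrain = xgb.DMatrix(train[features], label=train["signal"])
gbm = xgb.train(params, dtrain, num_boost_round=num_trees)
xgb_preds = gbm.predict(xgb.DMatrix(test[features]))
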
def stacked_models(train, features, test, in_sample=True):
    """
    Build stacked generalization models; set in_sample to False to predict on the test set.
    """
    if in_sample:
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)
        train = train.iloc[new_indices].reset_index(drop=True).copy()
        # not used in CV testing..
        del test
        cutoff = int(new_indices.shape[0] * 0.75)
        X_dev = train[:cutoff].reset_index(drop=True).copy()
        Y_dev = train[:cutoff]['signal'].reset_index(drop=True).copy()
        X_test = train[cutoff:][train[cutoff:]['min_ANNmuon'] > 0.4].reset_index(drop=True).copy()
        Y_test = train[cutoff:][train[cutoff:]['min_ANNmuon'] > 0.4]['signal'].reset_index(drop=True).copy()
    else:
        np.random.seed(1)
        new_indices = np.asarray(train.index.copy())
        np.random.shuffle(new_indices)
        train = train.iloc[new_indices].reset_index(drop=True).copy()
        X_dev = train.reset_index(drop=True).copy()
        Y_dev = train['signal'].reset_index(drop=True).copy()
        X_test = test.reset_index(drop=True).copy()
        Y_test = None

    n_folds = 5
    # put your parameter-tuned CLFs in this list.
    clfs = [
        RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=20, n_jobs=-1),
        RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=20, n_jobs=-1, max_depth=6),
        ExtraTreesClassifier(n_estimators=200, criterion='entropy', random_state=50, n_jobs=-1),
        ExtraTreesClassifier(n_estimators=200, criterion='entropy', random_state=50, n_jobs=-1, max_depth=6),
        Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=150, subsample=0.1, max_depth=6, min_samples_leaf=10,
                                    learning_rate=0.1, train_features=features, random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=150, subsample=0.1, max_depth=6, min_samples_leaf=10,
                                    learning_rate=0.1, train_features=features, random_state=11),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=100, subsample=0.8, max_depth=6, min_samples_leaf=10,
                                    learning_rate=0.1, train_features=features, random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=100, subsample=0.8, max_depth=6, min_samples_leaf=10,
                                    learning_rate=0.1, train_features=features, random_state=11),
        XGBoostClassifier(eval_metric='auc', objective='binary:logistic', num_class=2, nthread=4,
                          silent=1, colsample_bytree=0.6, eta=0.005, max_depth=6, min_child_weight=13,
                          seed=1337, subsample=0.7),
        NN1(len(features)),
        NN2(len(features)),
        NN3(len(features)),
        NN4(len(features)),
    ]

    skf = list(StratifiedKFold(Y_dev, n_folds))
    # Number of training data x Number of classifiers
    blend_train = np.zeros((X_dev.shape[0], len(clfs)))
    # Number of testing data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs)))

    print('X_test.shape = %s' % (str(X_test.shape)))
    print('blend_train.shape = %s' % (str(blend_train.shape)))
    print('blend_test.shape = %s' % (str(blend_test.shape)))

    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print('Training classifier [%s]' % (j))
        # Number of testing data x Number of folds, we will take the mean of
        # the predictions later
        blend_test_j = np.zeros((X_test.shape[0], len(skf)))
        for i, (train_index, cv_index) in enumerate(skf):
            print('Fold [%s]' % (i))
            # This is the training and validation set
            X_train = X_dev.iloc[train_index].copy()
            Y_train = Y_dev.iloc[train_index].copy()
            X_cv = X_dev.iloc[cv_index].copy()
            Y_cv = Y_dev.iloc[cv_index].copy()

            # handle the case of hep.ml stuff
            if type(clf) == type(UGradientBoostingClassifier()):
                clf.fit(X_train[features + ['mass']], Y_train.values.astype(np.int32))
            else:
                clf.fit(X_train[features], Y_train.values.astype(np.int32))

            # This output will be the basis for our blended classifier to train against,
            # which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv[features])[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test[features])[:, 1]

        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)

    print('Y_dev.shape = %s' % (Y_dev.shape))

    # blend with LR...
    bclf = LogisticRegression()
    bclf.fit(blend_train, Y_dev)

    bclf2 = GradientBoostingClassifier(n_estimators=150, learning_rate=0.02, max_depth=4,
                                       subsample=0.9, verbose=3, random_state=1337)
    bclf2.fit(blend_train, Y_dev)

    bclf3 = NeuralNet(
        layers=[('input', layers.InputLayer),
                ('hidden', layers.DenseLayer),
                ('output', layers.DenseLayer)],
        # layer parameters:
        input_shape=(None, blend_train.shape[1]),
        hidden_num_units=blend_train.shape[1],
        output_nonlinearity=nonlinearities.softmax,  # output layer uses identity function
        output_num_units=2,  # 2 target values
        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        regression=False,  # flag to indicate we're dealing with regression problem
        max_epochs=53,  # TRY 50 and 46 epochs!
        verbose=1,
        eval_size=0.10)
    bclf3.fit(blend_train.astype(np.float32), Y_dev.astype(np.int32))

    bclf4 = AdaBoostClassifier(n_estimators=400, random_state=88)
    bclf4.fit(blend_train, Y_dev)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    Y_test_predict2 = bclf2.predict_proba(blend_test)[:, 1]
    Y_test_predict3 = bclf3.predict_proba(blend_test.astype(np.float32))[:, 1]
    Y_test_predict4 = bclf4.predict_proba(blend_test)[:, 1]

    print('Logit Coefs:', bclf.coef_)

    if in_sample:
        score = evaluation.roc_auc_truncated(Y_test, Y_test_predict)
        score2 = evaluation.roc_auc_truncated(Y_test, Y_test_predict2)
        score3 = evaluation.roc_auc_truncated(Y_test, blend_test.mean(1))
        score4 = evaluation.roc_auc_truncated(Y_test, scipy_opt(blend_train, Y_dev, blend_test))
        score5 = evaluation.roc_auc_truncated(Y_test, (Y_test_predict + Y_test_predict2) / 2.0)
        score6 = evaluation.roc_auc_truncated(Y_test, Y_test_predict3)
        score7 = evaluation.roc_auc_truncated(Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3) / 3.0)
        score8 = evaluation.roc_auc_truncated(Y_test, Y_test_predict4)
        score9 = evaluation.roc_auc_truncated(Y_test, (Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 3.0)
        score10 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 4.0)
        print('LR Score = %s' % (score))
        print('GB Score = %s' % (score2))
        print('MEAN Score = %s' % (score3))
        print('Scipy Score = %s' % (score4))
        print('LR + GB score = %s' % (score5))
        print('ANN Score = %s' % (score6))
        print('LR + GB + ANN Score = %s' % (score7))
        print('ADA Score = %s' % (score8))
        print('GB + ANN + ADA Score = %s' % (score9))
        print('LR + GB + ANN + ADA Score = %s' % (score10))
        return blend_train, Y_dev, blend_test, Y_test

    # average of ADA, ANN and GBM.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 4.0

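# A hypothetical in-sample run of the stacking pipeline above (the train/test DataFrames and
# the feature list come from the surrounding script; these names are illustrative):
# blend_train, Y_dev, blend_test, Y_test = stacked_models(train, features, test, in_sample=True)
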
def Model1():
    # Model 1 is an ensemble of XGBoost, Random Forest and Uniform Gradient Boosting classifiers,
    # which are trained using the stacked data
    model = 1     # set the model number for feature engineering
    n_folds = 3   # set the number of folds for generating meta-features
    n_stack = 15  # number of models used for stacking
    # load data and obtain the list of features for estimation
    train, test, features = utils.LoadData(model)

    # Initialize models for stacking
    clf1 = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf2 = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf3 = KNeighborsClassifier(n_neighbors=20, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf4 = KNeighborsClassifier(n_neighbors=40, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf5 = KNeighborsClassifier(n_neighbors=80, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf6 = KNeighborsClassifier(n_neighbors=160, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf7 = KNeighborsClassifier(n_neighbors=320, weights='uniform', algorithm='auto', leaf_size=30,
                                p=2, metric='minkowski', metric_params=None)
    clf8 = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=5.0, fit_intercept=True,
                              intercept_scaling=1, class_weight=None, random_state=101,
                              solver='lbfgs', max_iter=200, multi_class='ovr', verbose=0)
    clf9 = GaussianNB()
    clf10 = SVC(C=5.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.008, shrinking=True,
                probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False,
                max_iter=-1, random_state=101)
    clf11 = RandomForestClassifier(n_estimators=250, criterion='gini', max_depth=6,
                                   min_samples_split=2, min_samples_leaf=5,
                                   min_weight_fraction_leaf=0.0, max_features=0.7,
                                   max_leaf_nodes=None, bootstrap=False, oob_score=False,
                                   n_jobs=2, random_state=101, verbose=0, warm_start=False,
                                   class_weight=None)
    clf12 = ExtraTreesClassifier(n_estimators=250, criterion='gini', max_depth=6,
                                 min_samples_split=2, min_samples_leaf=5,
                                 min_weight_fraction_leaf=0.0, max_features=0.7,
                                 max_leaf_nodes=None, bootstrap=False, oob_score=False,
                                 n_jobs=2, random_state=101, verbose=0, warm_start=False,
                                 class_weight=None)
    clf13 = GradientBoostingClassifier(loss='deviance', learning_rate=0.2, n_estimators=450,
                                       subsample=0.7, min_samples_split=2, min_samples_leaf=5,
                                       min_weight_fraction_leaf=0.0, max_depth=6, init=None,
                                       random_state=101, max_features=None, verbose=0,
                                       max_leaf_nodes=None, warm_start=False)
    clf14 = SGDClassifier(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                          fit_intercept=True, n_iter=10, shuffle=True, verbose=0, epsilon=0.1,
                          n_jobs=2, random_state=101, learning_rate='optimal', eta0=0.0,
                          power_t=0.5, class_weight=None, warm_start=False, average=False)
    clf15 = models.XGBoostClassifier(nthread=2, eta=.2, gamma=0, max_depth=6, min_child_weight=3,
                                     max_delta_step=0, subsample=0.7, colsample_bytree=0.7,
                                     silent=1, seed=101, l2_reg=1, l1_reg=0, n_estimators=450)
    clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10,
            clf11, clf12, clf13, clf14, clf15]

    # Construct stacked datasets
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values, clfs, n_folds)

    # Construct data for uniform boosting
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    meta_train = pd.DataFrame({columns[i]: train_probs[:, i] for i in range(0, n_stack)})
    meta_test = pd.DataFrame({columns[i]: test_probs[:, i] for i in range(0, n_stack)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # features used for UGB training (original features + meta-features)
    features_ugb = features + columns

    # Initialize models for the ensemble
    loss = BinFlatnessLossFunction(['mass'], n_bins=20, power=1, fl_coefficient=3, uniform_label=0)
    clf_ugb = UGradientBoostingClassifier(loss=loss, n_estimators=275, max_depth=11,
                                          min_samples_leaf=3, learning_rate=0.03,
                                          train_features=features_ugb, subsample=0.85,
                                          random_state=101)
    clf_xgb = models.XGBoostClassifier(nthread=6, eta=.0225, gamma=1.225, max_depth=11,
                                       min_child_weight=10, max_delta_step=0, subsample=0.8,
                                       colsample_bytree=0.3, silent=1, seed=101, l2_reg=1,
                                       l1_reg=0, n_estimators=1100)
    clf_rf = RandomForestClassifier(n_estimators=375, criterion='gini', max_depth=10,
                                    min_samples_split=6, min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0, max_features=0.6,
                                    max_leaf_nodes=None, bootstrap=True, oob_score=False,
                                    n_jobs=4, random_state=101, verbose=0, warm_start=False,
                                    class_weight=None)

    # Train models
    print("Training a Uniform Gradient Boosting model")
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # Compute ensemble predictions
    preds = 0.3 * (preds_xgb ** 0.65) * (preds_rf ** 0.35) + 0.7 * preds_ugb
    return preds

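# Model1() returns the blended test-set probabilities directly; a hypothetical way to persist
# them (the output filename is an assumption, not part of the original function):
# preds = Model1()
# pd.DataFrame({'prediction': preds}).to_csv('model1_predictions.csv', index=False)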