def test_gradient_boosting(n_samples=1000):
    """Smoke-test uGradientBoostingClassifier with several loss functions.

    One classifier is trained per loss on generated data, and the test-set
    score of each is asserted to be at least 0.7.
    """
    # Samples are correlated with the first variable; the test sample is
    # drawn before the training sample.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)

    # Uniformity is requested along this variable.
    uniform_variables = ['column0']
    n_estimators = 20

    # NOTE: further loss variants (pairwise / random / distance-based KNN and
    # NewFlatnessLossFunction) were commented out in this test and are not
    # exercised.
    losses = [
        SimpleKnnLossFunction(uniform_variables),
        BinomialDevianceLossFunction(),
        BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5),
        BinFlatnessLossFunction(uniform_variables, ada_coefficient=0.5,
                                uniform_label=[0, 1]),
        KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5),
        KnnFlatnessLossFunction(uniform_variables, ada_coefficient=0.5,
                                uniform_label=[0, 1]),
    ]

    for loss in losses:
        classifier = uGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=.2,
            subsample=0.7,
            n_estimators=n_estimators,
            train_variables=None)
        fitted = classifier.fit(trainX[:n_samples], trainY[:n_samples])
        result = fitted.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: %.3f" % result
def test_gradient_boosting(n_samples=1000):
    """
    Check that UGradientBoostingClassifier trains with each loss function.

    Every loss is used to fit a classifier on generated data; the test-set
    score of each fitted classifier must be at least 0.7.
    """
    # Samples are correlated with the first variable; the test sample is
    # drawn before the training sample.
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)

    # Uniformity is requested along this variable.
    uniform_features = ['column0']

    # All losses are constructed up front, in this order, before any training.
    losses = [
        LogLossFunction(),
        AdaLossFunction(),
        CompositeLossFunction(),
        KnnAdaLossFunction(uniform_features=uniform_features,
                           uniform_label=1),
        KnnAdaLossFunction(uniform_features=uniform_features,
                           uniform_label=[0, 1]),
        BinFlatnessLossFunction(uniform_features, fl_coefficient=2.,
                                uniform_label=0),
        BinFlatnessLossFunction(uniform_features, fl_coefficient=2.,
                                uniform_label=[0, 1]),
        KnnFlatnessLossFunction(uniform_features, fl_coefficient=2.,
                                uniform_label=1),
        KnnFlatnessLossFunction(uniform_features, fl_coefficient=2.,
                                uniform_label=[0, 1]),
    ]

    for loss in losses:
        booster = UGradientBoostingClassifier(
            loss=loss,
            min_samples_split=20,
            max_depth=5,
            learning_rate=0.2,
            subsample=0.7,
            n_estimators=25,
            train_features=None)
        clf = booster.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)
def flatnessloss(X, y, test):
    """Fit a mass-flat gradient boosting model over 7 folds and average it.

    A UGradientBoostingClassifier with a BinFlatnessLossFunction on ``mass``
    is refitted on the training part of each stratified fold; the
    positive-class probabilities predicted for ``test`` are averaged over
    the folds.

    Parameters:
        X: pandas DataFrame of training rows; must contain a ``mass`` column
            (every other column is used as a training feature).
        y: labels aligned row-by-row with ``X`` (array-like or Series).
        test: data passed unchanged to ``clf.predict_proba``.

    Returns:
        1-D numpy array with one averaged probability per row of ``test``.

    Raises:
        ValueError: if ``X`` has no ``mass`` column.
    """
    features = list(X.columns)
    features.remove('mass')

    loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss,
                                      n_estimators=300,
                                      subsample=0.7,
                                      max_depth=9,
                                      min_samples_leaf=8,
                                      learning_rate=0.1,
                                      train_features=features,
                                      random_state=11)

    # Shuffle rows and labels together. Everything below indexes strictly by
    # position: the fold indices produced by StratifiedKFold are positions,
    # and a label-based lookup on a shuffled integer index would silently
    # pick different rows for X than for y.
    order = np.random.permutation(X.shape[0])
    X = X.iloc[order]
    y = np.asarray(y)[order]

    skf = cross_validation.StratifiedKFold(y, n_folds=7)
    blend_test_j = np.zeros((test.shape[0], len(skf)))
    for i, (train_index, _) in enumerate(skf):
        print("Fold: %s" % i)
        clf.fit(X.iloc[train_index], y[train_index])
        blend_test_j[:, i] = clf.predict_proba(test)[:, 1]

    # Average the per-fold test predictions.
    return blend_test_j.mean(1)
for i in range(100): print 'shuffling' train = shuffle(train) print("Train a Random Forest model") rf1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion="entropy", max_depth=10, max_features=6, min_samples_leaf=2) rf1.fit(train[features], train["signal"]) print("Train a UGradientBoostingClassifier") loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0) rf = UGradientBoostingClassifier(loss=loss, n_estimators=200, max_depth=6, learning_rate=0.15, train_features=features, subsample=0.7, random_state=369) rf.fit(train[features + ['mass']], train['signal']) print("Train a XGBoost model") params = { "objective": "binary:logistic", "learning_rate": 0.2, "max_depth": 6,
] features = list(f for f in df.columns if f not in features_out) print("Split train/test") train, test = train_test_split(df, test_size=0.33) X_train = train[features] y_train = train['signal'] X_val = test[features] y_val = test['signal'] loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0, fl_coefficient=15, power=2) ugbc = UGradientBoostingClassifier(loss=loss, n_estimators=550, max_depth=6, learning_rate=0.15, train_features=features, subsample=0.7, random_state=123) ugbc.fit(train[features + ['mass']], train['signal']) pred_raw = ugbc.predict(test[features]) #print(pred_raw) pred = pd.DataFrame(data={'signal': pred_raw}) #print(pred.head(5)) #accuracy_fn(pred,y_val)
def Model1():
    """Build model 1: a blend of XGBoost, Random Forest and uniform boosting.

    Fifteen base classifiers are stacked through ``utils.StackModels``. The
    XGBoost and Random Forest ensemble members are trained on the blended
    data, while the uniform gradient boosting member is trained on the
    original features plus the stacked probabilities as meta-features.

    Returns:
        ``0.3 * xgb**0.65 * rf**0.35 + 0.7 * ugb`` computed from the three
        models' test predictions.
    """
    model = 1      # model number used for feature engineering
    n_folds = 3    # number of folds for generating meta-features
    n_stack = 15   # number of models used for stacking

    # Load data and the list of features used for estimation.
    train, test, features = utils.LoadData(model)

    # --- Base models for stacking -------------------------------------
    # Seven KNN models that differ only in the neighbourhood size.
    knn_models = [
        KNeighborsClassifier(n_neighbors=k,
                             weights='uniform',
                             algorithm='auto',
                             leaf_size=30,
                             p=2,
                             metric='minkowski',
                             metric_params=None)
        for k in (5, 10, 20, 40, 80, 160, 320)
    ]

    logreg = LogisticRegression(penalty='l2',
                                dual=False,
                                tol=0.0001,
                                C=5.0,
                                fit_intercept=True,
                                intercept_scaling=1,
                                class_weight=None,
                                random_state=101,
                                solver='lbfgs',
                                max_iter=200,
                                multi_class='ovr',
                                verbose=0)
    naive_bayes = GaussianNB()
    svc = SVC(C=5.0,
              kernel='rbf',
              degree=3,
              gamma=0.0,
              coef0=0.008,
              shrinking=True,
              probability=True,
              tol=0.001,
              cache_size=200,
              class_weight=None,
              verbose=False,
              max_iter=-1,
              random_state=101)

    # Random Forest and Extra Trees share exactly the same settings.
    forest_kwargs = dict(n_estimators=250,
                         criterion='gini',
                         max_depth=6,
                         min_samples_split=2,
                         min_samples_leaf=5,
                         min_weight_fraction_leaf=0.0,
                         max_features=0.7,
                         max_leaf_nodes=None,
                         bootstrap=False,
                         oob_score=False,
                         n_jobs=2,
                         random_state=101,
                         verbose=0,
                         warm_start=False,
                         class_weight=None)
    random_forest = RandomForestClassifier(**forest_kwargs)
    extra_trees = ExtraTreesClassifier(**forest_kwargs)

    gbm = GradientBoostingClassifier(loss='deviance',
                                     learning_rate=0.2,
                                     n_estimators=450,
                                     subsample=0.7,
                                     min_samples_split=2,
                                     min_samples_leaf=5,
                                     min_weight_fraction_leaf=0.0,
                                     max_depth=6,
                                     init=None,
                                     random_state=101,
                                     max_features=None,
                                     verbose=0,
                                     max_leaf_nodes=None,
                                     warm_start=False)
    sgd = SGDClassifier(loss='log',
                        penalty='l2',
                        alpha=0.0001,
                        l1_ratio=0.15,
                        fit_intercept=True,
                        n_iter=10,
                        shuffle=True,
                        verbose=0,
                        epsilon=0.1,
                        n_jobs=2,
                        random_state=101,
                        learning_rate='optimal',
                        eta0=0.0,
                        power_t=0.5,
                        class_weight=None,
                        warm_start=False,
                        average=False)
    xgb_base = models.XGBoostClassifier(nthread=2,
                                        eta=.2,
                                        gamma=0,
                                        max_depth=6,
                                        min_child_weight=3,
                                        max_delta_step=0,
                                        subsample=0.7,
                                        colsample_bytree=0.7,
                                        silent=1,
                                        seed=101,
                                        l2_reg=1,
                                        l1_reg=0,
                                        n_estimators=450)

    clfs = knn_models + [
        logreg, naive_bayes, svc, random_forest, extra_trees, gbm, sgd,
        xgb_base
    ]

    # --- Stacked datasets ---------------------------------------------
    train_blend, test_blend, train_probs, test_probs = utils.StackModels(
        train[features], test[features], train.signal.values, clfs, n_folds)

    # --- Data for uniform boosting ------------------------------------
    # One meta-feature column per stacked model (note the trailing space in
    # the column names).
    columns = ['p%s ' % (i) for i in range(0, n_stack)]
    meta_train = pd.DataFrame(
        {name: train_probs[:, k] for k, name in enumerate(columns)})
    meta_test = pd.DataFrame(
        {name: test_probs[:, k] for k, name in enumerate(columns)})
    train_ugb = pd.concat([train, meta_train], axis=1)
    test_ugb = pd.concat([test, meta_test], axis=1)
    # Original features + meta-features.
    features_ugb = features + columns

    # --- Ensemble members ---------------------------------------------
    loss = BinFlatnessLossFunction(['mass'],
                                   n_bins=20,
                                   power=1,
                                   fl_coefficient=3,
                                   uniform_label=0)
    clf_ugb = UGradientBoostingClassifier(loss=loss,
                                          n_estimators=275,
                                          max_depth=11,
                                          min_samples_leaf=3,
                                          learning_rate=0.03,
                                          train_features=features_ugb,
                                          subsample=0.85,
                                          random_state=101)
    clf_xgb = models.XGBoostClassifier(nthread=6,
                                       eta=.0225,
                                       gamma=1.225,
                                       max_depth=11,
                                       min_child_weight=10,
                                       max_delta_step=0,
                                       subsample=0.8,
                                       colsample_bytree=0.3,
                                       silent=1,
                                       seed=101,
                                       l2_reg=1,
                                       l1_reg=0,
                                       n_estimators=1100)
    clf_rf = RandomForestClassifier(n_estimators=375,
                                    criterion='gini',
                                    max_depth=10,
                                    min_samples_split=6,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=0.6,
                                    max_leaf_nodes=None,
                                    bootstrap=True,
                                    oob_score=False,
                                    n_jobs=4,
                                    random_state=101,
                                    verbose=0,
                                    warm_start=False,
                                    class_weight=None)

    # --- Training -----------------------------------------------------
    print("Training a Uniform Gradient Boosting model")
    # 'mass' is supplied for the flatness loss on top of the training
    # features.
    clf_ugb.fit(train_ugb[features_ugb + ['mass']], train_ugb['signal'])
    preds_ugb = clf_ugb.predict_proba(test_ugb[features_ugb])[:, 1]

    print("Training a XGBoost model")
    clf_xgb.fit(train_blend, train['signal'])
    preds_xgb = clf_xgb.predict_proba(test_blend)

    print("Training a Random Forest model")
    clf_rf.fit(train_blend, train['signal'])
    preds_rf = clf_rf.predict_proba(test_blend)[:, 1]

    # --- Ensemble predictions -----------------------------------------
    geometric_part = (preds_xgb**(0.65)) * (preds_rf**(0.35))
    return 0.3 * geometric_part + 0.7 * preds_ugb
def stacked_models(train, features, test, in_sample=True):
    """
    Build stacked generalization models.

    Every base classifier is trained on each of 5 stratified folds of the
    development data. Its out-of-fold probabilities form one column of
    ``blend_train``; its fold-averaged probabilities on the evaluation data
    form one column of ``blend_test``. Four blenders (logistic regression,
    gradient boosting, a neural net and AdaBoost) are then fitted on
    ``blend_train``.

    Parameters:
        train: DataFrame with the ``features`` columns plus ``signal`` and
            ``mass`` (and ``min_ANNmuon`` when ``in_sample`` is True).
        features: list of feature column names used for training.
        test: DataFrame to predict on; ignored when ``in_sample`` is True.
        in_sample: if True, hold out the last 25% of the shuffled training
            data (rows with ``min_ANNmuon > 0.4``) for scoring; set to False
            to predict on ``test``.

    Returns:
        If ``in_sample``: the tuple ``(blend_train, Y_dev, blend_test,
        Y_test)``, after printing truncated-AUC scores of several blends.
        Otherwise: the mean of the four blenders' positive-class
        probabilities on ``test``.
    """
    # Both modes shuffle the training frame identically with a fixed seed.
    # NOTE(review): the shuffled index values are used positionally, which
    # presumably relies on ``train`` having a default 0..n-1 index - confirm.
    np.random.seed(1)
    new_indices = np.asarray(train.index.copy())
    np.random.shuffle(new_indices)
    train = train.iloc[new_indices].reset_index(drop=True).copy()

    if in_sample:
        # not used in CV testing..
        del test
        cutoff = int(new_indices.shape[0] * 0.75)
        dev_part = train[:cutoff]
        holdout = train[cutoff:]
        # Only hold-out rows with min_ANNmuon > 0.4 are scored.
        holdout = holdout[holdout['min_ANNmuon'] > 0.4]
        X_dev = dev_part.reset_index(drop=True).copy()
        Y_dev = dev_part['signal'].reset_index(drop=True).copy()
        X_test = holdout.reset_index(drop=True).copy()
        Y_test = holdout['signal'].reset_index(drop=True).copy()
    else:
        X_dev = train.reset_index(drop=True).copy()
        Y_dev = train['signal'].reset_index(drop=True).copy()
        X_test = test.reset_index(drop=True).copy()
        Y_test = None

    n_folds = 5

    # put ur parameter tuned CLFs in this list.
    clfs = [
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1),
        RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               random_state=20,
                               n_jobs=-1,
                               max_depth=6),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1),
        ExtraTreesClassifier(n_estimators=200,
                             criterion='entropy',
                             random_state=50,
                             n_jobs=-1,
                             max_depth=6),
        Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())]),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=150,
                                    subsample=0.1,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=BinFlatnessLossFunction(
            ['mass'], n_bins=15, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        UGradientBoostingClassifier(loss=KnnFlatnessLossFunction(
            ['mass'], n_neighbours=30, uniform_label=0),
                                    n_estimators=100,
                                    subsample=0.8,
                                    max_depth=6,
                                    min_samples_leaf=10,
                                    learning_rate=0.1,
                                    train_features=features,
                                    random_state=11),
        XGBoostClassifier(eval_metric='auc',
                          objective='binary:logistic',
                          num_class=2,
                          nthread=4,
                          silent=1,
                          colsample_bytree=0.6,
                          eta=0.005,
                          max_depth=6,
                          min_child_weight=13,
                          seed=1337,
                          subsample=0.7),
        NN1(len(features)),
        NN2(len(features)),
        NN3(len(features)),
        NN4(len(features))
    ]

    skf = list(StratifiedKFold(Y_dev, n_folds))

    # Number of training data x Number of classifiers
    blend_train = np.zeros((X_dev.shape[0], len(clfs)))
    # Number of testing data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs)))

    print('X_test.shape = %s' % (str(X_test.shape)))
    print('blend_train.shape = %s' % (str(blend_train.shape)))
    print('blend_test.shape = %s' % (str(blend_test.shape)))

    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print('Training classifier [%s]' % (j))

        # The hep_ml uniform boosting models additionally need the 'mass'
        # column at fit time for their flatness loss.
        if isinstance(clf, UGradientBoostingClassifier):
            fit_columns = features + ['mass']
        else:
            fit_columns = features

        # Number of testing data x Number of folds , we will take the mean of
        # the predictions later
        blend_test_j = np.zeros((X_test.shape[0], len(skf)))
        for i, (train_index, cv_index) in enumerate(skf):
            print('Fold [%s]' % (i))

            # This is the training and validation set
            X_train = X_dev.iloc[train_index].copy()
            Y_train = Y_dev.iloc[train_index].copy()
            X_cv = X_dev.iloc[cv_index].copy()

            clf.fit(X_train[fit_columns], Y_train.values.astype(np.int32))

            # This output will be the basis for our blended classifier to
            # train against, which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv[features])[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test[features])[:, 1]

        # Take the mean of the per-fold predictions on the evaluation set
        blend_test[:, j] = blend_test_j.mean(1)

    print('Y_dev.shape = %s' % (Y_dev.shape))

    # blend with LR...
    bclf = LogisticRegression()
    bclf.fit(blend_train, Y_dev)

    # ... with gradient boosting ...
    bclf2 = GradientBoostingClassifier(n_estimators=150,
                                       learning_rate=0.02,
                                       max_depth=4,
                                       subsample=0.9,
                                       verbose=3,
                                       random_state=1337)
    bclf2.fit(blend_train, Y_dev)

    # ... with a one-hidden-layer neural net ...
    bclf3 = NeuralNet(
        layers=[('input', layers.InputLayer),
                ('hidden', layers.DenseLayer),
                ('output', layers.DenseLayer)],
        # layer parameters:
        input_shape=(None, blend_train.shape[1]),
        hidden_num_units=blend_train.shape[1],
        output_nonlinearity=nonlinearities.softmax,  # softmax output layer
        output_num_units=2,  # 2 target values
        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        regression=False,  # classification, not regression
        max_epochs=53,  # TRY 50 and 46 epochs!
        verbose=1,
        eval_size=0.10)
    bclf3.fit(blend_train.astype(np.float32), Y_dev.astype(np.int32))

    # ... and with AdaBoost.
    bclf4 = AdaBoostClassifier(n_estimators=400, random_state=88)
    bclf4.fit(blend_train, Y_dev)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    Y_test_predict2 = bclf2.predict_proba(blend_test)[:, 1]
    Y_test_predict3 = bclf3.predict_proba(blend_test.astype(np.float32))[:, 1]
    Y_test_predict4 = bclf4.predict_proba(blend_test)[:, 1]

    print('Logit Coefs: %s' % (bclf.coef_,))

    if in_sample:
        score = evaluation.roc_auc_truncated(Y_test, Y_test_predict)
        score2 = evaluation.roc_auc_truncated(Y_test, Y_test_predict2)
        score3 = evaluation.roc_auc_truncated(Y_test, blend_test.mean(1))
        score4 = evaluation.roc_auc_truncated(
            Y_test, scipy_opt(blend_train, Y_dev, blend_test))
        score5 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2) / 2.0)
        score6 = evaluation.roc_auc_truncated(Y_test, Y_test_predict3)
        score7 = evaluation.roc_auc_truncated(
            Y_test,
            (Y_test_predict + Y_test_predict2 + Y_test_predict3) / 3.0)
        score8 = evaluation.roc_auc_truncated(Y_test, Y_test_predict4)
        score9 = evaluation.roc_auc_truncated(
            Y_test,
            (Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 3.0)
        score10 = evaluation.roc_auc_truncated(
            Y_test, (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
                     Y_test_predict4) / 4.0)

        print('LR Score = %s' % (score))
        print('GB Score = %s' % (score2))
        print('MEAN Score = %s' % (score3))
        print('Scipy Score = %s' % (score4))
        print('LR + GB score = %s' % (score5))
        print('ANN Score= %s' % (score6))
        print('LR + GB + ANN Score = %s' % (score7))
        print('ADA Score = %s' % (score8))
        print('GB + ANN + ADA Score = %s' % (score9))
        print('LR + GB + ANN + ADA Score = %s' % (score10))
        return blend_train, Y_dev, blend_test, Y_test

    # average of LR, GBM, ANN and ADA.
    return (Y_test_predict + Y_test_predict2 + Y_test_predict3 +
            Y_test_predict4) / 4.0