import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score

import write_to_csv


def model(X, Y, X_test, X_dev):
    xgdmat = xgb.DMatrix(X, Y)
    our_params = {
        'eta': 0.001,
        'seed': 40,
        'subsample': 0.3,
        'colsample_bytree': 0.5,
        'gamma': 0,
        'nthread': 4,
        'scale_pos_weight': 1,
        'reg_alpha': 0.002,
        'objective': 'binary:logistic',
        'max_depth': 3,
        'min_child_weight': 9,
        'cv': 20,  # not an xgb.train parameter; the booster ignores it
    }
    final_gb = xgb.train(our_params, xgdmat, num_boost_round=6000)

    # Evaluate on the training data.
    y_pred = final_gb.predict(xgdmat)
    print('AuC score on training data:', roc_auc_score(Y, y_pred))
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    print('Accuracy on training data:', accuracy_score(Y, y_pred))

    # Predict on the test and dev sets; test predictions go to predBoost.csv.
    testdmat = xgb.DMatrix(X_test)
    y_pred = final_gb.predict(testdmat)
    devdmat = xgb.DMatrix(X_dev)
    y_preddev = final_gb.predict(devdmat)

    write_to_csv.writeToCSV('predBoost.csv', y_pred)
    return y_preddev
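# Hedged driver sketch (not part of the original file): it assumes the
# pre_process.preprocessData and load_test_data.loadTestData helpers used by
# the other scripts in this repo, which return the same train/dev/test splits.
if __name__ == "__main__":
    import load_test_data
    import pre_process

    X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
    X_test, Y_test = load_test_data.loadTestData('test.csv')

    # Train on the training split, write test predictions to predBoost.csv,
    # and score the returned dev predictions against the dev labels.
    y_preddev = model(X_train, Y_train, X_test, X_dev)
    print('AuC score on dev data:', roc_auc_score(Y_dev, y_preddev))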
from sklearn.metrics import auc, roc_curve

import write_to_csv


def LogisticRegression(X, Y, XDev, YDev, XTest, YTest, lmda, learningRate, maxIter=100):
    # Train the weights with SGD, then score the train, dev, and test splits.
    W = SgdLogistic(X, Y, maxIter, learningRate, lmda)

    # Training accuracy.
    nCorrect = 0.
    nIncorrect = 0.
    pTr = []
    for i in range(len(Y)):
        y_hat = predict(W, X[i, ])
        pTr.append(y_hat)
        if y_hat >= 0.5:
            y_hat = 1
        else:
            y_hat = -1
        # y_hat = np.sign(X[i,].dot(W))
        if y_hat == Y[i]:
            nCorrect += 1
        else:
            nIncorrect += 1
    trainAccuracy = nCorrect / (nCorrect + nIncorrect)

    # Dev accuracy.
    nCorrect = 0.
    nIncorrect = 0.
    pDev = []
    for i in range(len(YDev)):
        y_hat = predict(W, XDev[i, ])
        pDev.append(y_hat)
        if y_hat >= 0.5:
            y_hat = 1
        else:
            y_hat = -1
        # y_hat = np.sign(XDev[i,].dot(W))
        if y_hat == YDev[i]:
            nCorrect += 1
        else:
            nIncorrect += 1
    devAccuracy = nCorrect / (nCorrect + nIncorrect)

    # Test accuracy; raw test probabilities are written to predictions.csv.
    prob = []
    nCorrect = 0.
    nIncorrect = 0.
    for i in range(len(YTest)):
        y_hat = predict(W, XTest[i, ])
        prob.append(y_hat)
        if y_hat >= 0.5:
            y_hat = 1
        else:
            y_hat = -1
        # y_hat = np.sign(XTest[i,].dot(W))
        if y_hat == YTest[i]:
            nCorrect += 1
        else:
            nIncorrect += 1
    testAccuracy = nCorrect / (nCorrect + nIncorrect)
    write_to_csv.writeToCSV('predictions.csv', prob)

    # ROC AUC on the train and dev splits.
    false_positive_rate, true_positive_rate, _ = roc_curve(Y, pTr)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("ROC _ Train -- ", roc_auc)
    false_positive_rate, true_positive_rate, _ = roc_curve(YDev, pDev)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("ROC _ Dev -- ", roc_auc)
    return trainAccuracy, devAccuracy, testAccuracy
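# SgdLogistic and predict are helpers defined elsewhere in this repo and are
# not shown here. As a rough, hedged sketch only (the real helper may differ),
# predict is assumed to return the logistic sigmoid of the linear score W.x:
import numpy as np


def predict_sketch(W, x):
    # Probability of the positive class under a logistic model with weights W.
    return 1.0 / (1.0 + np.exp(-np.dot(W, x)))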
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

seed = 7
num_trees = 100
max_features = 'sqrt'  # the classifier default; older scikit-learn called this 'auto'
# scikit-learn requires shuffle=True when random_state is set on KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
model.fit(X_train, Y_train)

# Write predicted probabilities for the test set.
# Note: column 0 of predict_proba is the first class in model.classes_;
# the other scripts in this repo use column 1 for the positive class.
prob = model.predict_proba(X_test)
y_pred = []
for x in prob:
    y_pred.append(x[0])
write_to_csv.writeToCSV('predRF.csv', y_pred)
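# The kfold splitter above is constructed but never used. As a hedged sketch
# (not part of the original script), it could estimate out-of-sample AUC for
# the same random forest configuration before relying on the test predictions:
cv_auc = model_selection.cross_val_score(
    RandomForestClassifier(n_estimators=num_trees, max_features=max_features),
    X_train, Y_train, cv=kfold, scoring='roc_auc')
print('10-fold CV AuC on training data:', cv_auc.mean())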
# Bagged Decision Trees for Classification
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

if __name__ == "__main__":
    seed = 1729
    # scikit-learn requires shuffle=True when random_state is set on KFold.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    # Bag 200 decision trees.
    cart = DecisionTreeClassifier()
    num_trees = 200
    # The estimator argument was named base_estimator in older scikit-learn.
    model = BaggingClassifier(estimator=cart, n_estimators=num_trees, random_state=seed)
    model.fit(X_train, Y_train)

    # AUC on the training data (column 1 is the positive-class probability).
    probs_tr = model.predict_proba(X_train)
    # precision, recall, thresholds = precision_recall_curve(Y_train, probs_tr[:, 1])
    print('AuC score on training data:', roc_auc_score(Y_train, probs_tr[:, 1]))

    probs_test = model.predict_proba(X_test)
    # probs_test = model_selection.cross_val_predict(model, X_test, cv=kfold, method='predict_proba')
    # write_to_csv.writeToCSV('preds_bagg_cv.csv', probs_test[:, 1])
from sklearn import ensemble
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

if __name__ == "__main__":
    model = ensemble.GradientBoostingClassifier(learning_rate=0.01, max_depth=20)
    model.fit(X_train, Y_train)

    probs_tr = model.predict_proba(X_train)
    precision, recall, thresholds = precision_recall_curve(Y_train, probs_tr[:, 1])
    print('AuC score on training data:', roc_auc_score(Y_train, probs_tr[:, 1]))

    p_test = model.predict_proba(X_test)
    write_to_csv.writeToCSV('preds_bagg.csv', p_test[:, 1])
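    # Hedged sketch (not part of the original script): with max_depth=20 the
    # boosted trees can fit the training split almost perfectly, so an AUC
    # check on the held-out dev split gives a less optimistic estimate.
    probs_dev = model.predict_proba(X_dev)
    print('AuC score on dev data:', roc_auc_score(Y_dev, probs_dev[:, 1]))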
import numpy as np
from sklearn import ensemble
from sklearn import model_selection
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

if __name__ == "__main__":
    model = ensemble.GradientBoostingClassifier()

    # 40-fold cross-validated AUC on the training data
    # (model_selection replaces the removed sklearn.cross_validation module).
    print("40 Fold CV Score: ", np.mean(
        model_selection.cross_val_score(model, X_train, Y_train, cv=40, scoring='roc_auc')))

    model.fit(X_train, Y_train)
    probs = model.predict_proba(X_train)
    precision, recall, thresholds = precision_recall_curve(Y_train, probs[:, 1])
    print('AuC score on training data:', roc_auc_score(Y_train, probs[:, 1]))

    probs_test = model.predict_proba(X_test)
    write_to_csv.writeToCSV('preds_gb_40.csv', probs_test[:, 1])