def model(X, Y, X_test, X_dev):
    """Train an XGBoost binary classifier on (X, Y) and predict on held-out data.

    Prints training AUC and accuracy, writes the test-set probabilities to
    'predBoost.csv', and returns the dev-set probabilities.

    Args:
        X, Y: training features and binary labels.
        X_test: test features (no labels needed for prediction).
        X_dev: dev features.

    Returns:
        Array of predicted probabilities for X_dev.
    """
    xgdmat = xgb.DMatrix(X, Y)

    our_params = {
        'eta': 0.001,
        'seed': 40,
        'subsample': 0.3,
        'colsample_bytree': 0.5,
        'gamma': 0,
        'nthread': 4,
        'scale_pos_weight': 1,
        'reg_alpha': 0.002,
        'objective': 'binary:logistic',
        'max_depth': 3,
        'min_child_weight': 9,
        # NOTE: the original passed 'cv': 20 here, but 'cv' is not a valid
        # xgb.train parameter and was silently ignored; removed so the dict
        # does not suggest cross-validation happens in this call.
    }

    final_gb = xgb.train(our_params, xgdmat, num_boost_round=6000)

    # Evaluate against the labels actually used for fitting.  The original
    # compared against the module-level global Y_train, which is wrong
    # whenever the caller passes a different split as Y.
    y_pred = final_gb.predict(xgdmat)
    print('AuC score on training data:', roc_auc_score(Y, y_pred))

    # Threshold the probabilities at 0.5 to get hard labels for accuracy.
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    # sklearn convention is accuracy_score(y_true, y_pred).
    print(accuracy_score(Y, y_pred))

    testdmat = xgb.DMatrix(X_test)
    y_pred = final_gb.predict(testdmat)

    devdmat = xgb.DMatrix(X_dev)
    y_preddev = final_gb.predict(devdmat)

    write_to_csv.writeToCSV('predBoost.csv', y_pred)
    return y_preddev
def LogisticRegression(X,
                       Y,
                       XDev,
                       YDev,
                       XTest,
                       YTest,
                       lmda,
                       learningRate,
                       maxIter=100):
    """Train logistic regression with SGD and evaluate on train/dev/test splits.

    Labels are expected in {-1, +1}; `predict` is assumed to return a
    probability that is thresholded at 0.5.  Test-set probabilities are
    written to 'predictions.csv' and train/dev ROC AUC values are printed.

    Args:
        X, Y: training features/labels.
        XDev, YDev: dev features/labels.
        XTest, YTest: test features/labels.
        lmda: L2 regularization strength passed to SgdLogistic.
        learningRate: SGD step size.
        maxIter: number of SGD passes (default 100).

    Returns:
        (trainAccuracy, devAccuracy, testAccuracy) tuple of floats.
    """

    def _evaluate(W, features, labels):
        """Return (accuracy, list of predicted probabilities) for one split."""
        probs = []
        nCorrect = 0.
        for i in range(len(labels)):
            p = predict(W, features[i, ])
            probs.append(p)
            # Threshold at 0.5 and map to the {-1, +1} label convention.
            predicted_label = 1 if p >= 0.5 else -1
            if predicted_label == labels[i]:
                nCorrect += 1
        return nCorrect / len(labels), probs

    W = SgdLogistic(X, Y, maxIter, learningRate, lmda)

    # The original repeated this evaluation loop verbatim for each split.
    trainAccuracy, pTr = _evaluate(W, X, Y)
    devAccuracy, pDev = _evaluate(W, XDev, YDev)
    testAccuracy, prob = _evaluate(W, XTest, YTest)

    write_to_csv.writeToCSV('predictions.csv', prob)

    # Score against the labels passed in, not the module-level Y_train /
    # Y_dev globals the original referenced (a latent bug whenever callers
    # pass different splits).
    false_positive_rate, true_positive_rate, _ = roc_curve(Y, pTr)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("ROC _ Train  -- ", roc_auc)

    false_positive_rate, true_positive_rate, _ = roc_curve(YDev, pDev)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("ROC _ Dev  -- ", roc_auc)

    return trainAccuracy, devAccuracy, testAccuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
import load_test_data
import pre_process
import write_to_csv

# Random forest: fit on the full training split and write test-set
# probabilities to 'predRF.csv'.
X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

num_trees = 100
max_features = 'auto'
# NOTE: the original built an unused KFold here; removed.  (KFold with
# random_state but shuffle=False also raises in modern sklearn.)
model = RandomForestClassifier(n_estimators=num_trees,
                               max_features=max_features)

model.fit(X_train, Y_train)
prob = model.predict_proba(X_test)
# Column 1 of predict_proba is the positive class — every other script in
# this file writes probs[:, 1].  The original took x[0] (the negative-class
# probability), which is inconsistent with the sibling scripts.
y_pred = [row[1] for row in prob]

write_to_csv.writeToCSV('predRF.csv', y_pred)
# Bagged Decision Trees for Classification
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

if __name__ == "__main__":
    # Bagged decision trees: fit on the full training split, report
    # training AUC, and write positive-class test probabilities to CSV.
    # (The original also built an unused KFold and computed a
    # precision/recall curve it never used; both removed.)
    seed = 1729
    cart = DecisionTreeClassifier()
    num_trees = 200
    model = BaggingClassifier(base_estimator=cart,
                              n_estimators=num_trees,
                              random_state=seed)
    model.fit(X_train, Y_train)

    probs_tr = model.predict_proba(X_train)
    print('AuC score on training data:', roc_auc_score(Y_train, probs_tr[:, 1]))

    probs_test = model.predict_proba(X_test)
    write_to_csv.writeToCSV('preds_bagg_cv.csv', probs_test[:, 1])
from sklearn import ensemble
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

if __name__ == "__main__":
    # Gradient boosting: fit on the full training split, report training
    # AUC, and write positive-class test probabilities to CSV.
    model = ensemble.GradientBoostingClassifier(learning_rate=0.01,
                                                max_depth=20)
    model.fit(X_train, Y_train)

    probs_tr = model.predict_proba(X_train)
    # The original computed precision_recall_curve here and discarded the
    # result; only the AUC is actually reported.
    print('AuC score on training data:',
          roc_auc_score(Y_train, probs_tr[:, 1]))

    p_test = model.predict_proba(X_test)
    write_to_csv.writeToCSV('preds_bagg.csv', p_test[:, 1])
# Esempio n. 6
# 0
# (extraction artifact commented out: the two lines above are leftover
# scraper text, not Python, and broke the file's syntax)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score

import load_test_data
import pre_process
import write_to_csv

X_train, Y_train, X_dev, Y_dev = pre_process.preprocessData('train.csv')
X_test, Y_test = load_test_data.loadTestData('test.csv')

if __name__ == "__main__":
    # Gradient boosting with a 40-fold cross-validated AUC report.
    model = ensemble.GradientBoostingClassifier()

    # The original called the long-removed sklearn.cross_validation module
    # and an un-imported np, and its py2 `print "...",` statement never
    # printed the computed mean.  Use model_selection (imported above) and
    # actually print the score.
    scores = model_selection.cross_val_score(model,
                                             X_train,
                                             Y_train,
                                             cv=40,
                                             scoring='roc_auc')
    print("40 Fold CV Score: ", scores.mean())

    model.fit(X_train, Y_train)
    probs = model.predict_proba(X_train)
    # precision_recall_curve was computed and discarded in the original;
    # only the AUC is reported.
    print('AuC score on training data:', roc_auc_score(Y_train, probs[:, 1]))

    probs_test = model.predict_proba(X_test)
    write_to_csv.writeToCSV('preds_gb_40.csv', probs_test[:, 1])