Example #1
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

import data_io
import utils

# min_samples_split must be at least 2; 'squared_error' is the current name
# of the loss spelled 'ls' in older scikit-learn releases.
gbr1 = GradientBoostingRegressor(loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                 min_samples_split=2, min_samples_leaf=3, max_depth=15, init=None,
                                 random_state=None, max_features=None, alpha=0.9, verbose=2)

# Keep only the greedily selected feature columns of Xt_orig1
Xt_orig1 = Xt_orig1[:, [0, 1, 3, 5, 7, 9, 12, 13, 14, 16, 19, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36]]
# alternative greedy feature set:
#Xt_orig1 = Xt_orig1[:,[0, 1, 3, 5, 8, 9, 11, 12, 13, 16, 18, 21, 22, 23, 25, 26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 38]]
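# A minimal sketch (an assumption; the search code is not part of this
# snippet) of the greedy forward selection that typically produces index
# lists like the ones above: keep adding whichever remaining feature most
# improves a cross-validated score, and stop when no feature helps.
def greedy_select(X, y, score_fn):
    """Greedy forward selection; score_fn(X_subset, y) returns a CV score."""
    selected, best = [], 0.0
    while True:
        candidates = [f for f in range(X.shape[1]) if f not in selected]
        if not candidates:
            return selected
        score, f = max((score_fn(X[:, selected + [f]], y), f) for f in candidates)
        if score <= best:
            return selected
        selected.append(f)
        best = score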

print "encoding original data..."
Xt_orig, keymap = utils.OneHotEncoder(Xt_orig1)
train_orig = Xt_orig[:num_train]
test_orig = Xt_orig[num_train:]
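# utils.OneHotEncoder is a project helper that is not shown here. A rough
# sketch of its (matrix, keymap) contract, assuming integer category codes
# per column; the real helper likely returns a sparse matrix instead of the
# dense one used here for simplicity:
def one_hot_sketch(data):
    """Return a 0/1 matrix plus a per-column {category: offset} keymap."""
    keymap = [{v: i for i, v in enumerate(np.unique(col))} for col in data.T]
    out = np.zeros((data.shape[0], sum(len(k) for k in keymap)))
    offset = 0
    for col, kmap in zip(data.T, keymap):
        for row, v in enumerate(col):
            out[row, offset + kmap[v]] = 1.0
        offset += len(kmap)
    return out, keymap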

#print "use ACTION for data generation..."
y, X = data_io.load_data_pd('../data/train_orig.csv', use_labels=True)
_, X_test = data_io.load_data_pd('../data/test_orig.csv', use_labels=False)

oldTest = loadData('../data/test_orig.csv')
oldTrain = loadData('../data/train_orig.csv')
oldTrain = oldTrain[1:, 1:]  # drop the header row and the leading ID column
oldTest = oldTest[1:, 1:]
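# loadData is another unshown helper; the slicing above suggests it reads the
# raw CSV, header row included, into a string ndarray. A plausible stand-in
# (a sketch, not the original implementation):
def loadData_sketch(path):
    with open(path) as f:
        return np.array([line.rstrip('\n').split(',') for line in f])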

# # print "Grouping Data"
# # xd2 = utils.group_data2(oldTrain[:,:-1], degree=2)  #skip last column 
# # xd3 = utils.group_data2(oldTrain[:,:-1], degree=3)  #skip last column

# # xtestd2 = utils.group_data2(oldTest[:,:-1], degree=2) 
# # xtestd3 = utils.group_data2(oldTest[:,:-1], degree=3)    

# # X_old = np.hstack((oldTrain, xd2, xd3))    
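# utils.group_data2 is also not shown. In pipelines like this one, such a
# helper usually hashes every combination of `degree` categorical columns
# into a single new feature; a hedged sketch of that idea:
from itertools import combinations

def group_data_sketch(data, degree=2):
    """Hash each tuple of `degree` column values into one combined feature."""
    new_cols = [[hash(tuple(row[c] for c in cols)) for row in data]
                for cols in combinations(range(data.shape[1]), degree)]
    return np.array(new_cols).T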
Example #2
import numpy as np
from sklearn import ensemble, linear_model, metrics, model_selection, preprocessing

import data_io
# getRFX / getRFX_test are project helpers (not shown here) that build the
# numeric feature matrix used by the random forest.


def doCV():
    SEED = 42
    rnd = np.random.RandomState(SEED)

    model_lr = linear_model.LogisticRegression(C=3)
    # compute_importances was removed from scikit-learn; importances are
    # available via feature_importances_ after fitting.
    model_rf = ensemble.RandomForestClassifier(
        n_estimators=10, min_samples_split=10, n_jobs=2, random_state=rnd, verbose=2
    )

    print "loading data for random forest..."
    y, X = data_io.load_data_pd("train_orig.csv", use_labels=True)
    _, X_test = data_io.load_data_pd("test_orig.csv", use_labels=False)

    xtrain = getRFX(X)
    xtest = getRFX_test(X_test)
    xtrain = xtrain[:, 1:]
    xtest = xtest[:, 1:]

    xtrain.dump("num_train.dat")
    xtest.dump("num_test.dat")
    print "dumped..!"
    print "loading data for logistic regression..."
    ysp, Xsp = data_io.load_data("train_orig.csv")
    y_testsp, X_testsp = data_io.load_data("test_orig.csv", use_labels=False)
    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    # print Xsp.shape, X_testsp.shape
    encoder.fit(np.vstack((Xsp, X_testsp)))
    Xsp = encoder.transform(Xsp)  # returns a scipy.sparse matrix
    X_testsp = encoder.transform(X_testsp)

    print "starting cross validation..."
    nFeatures = X.shape[0]
    niter = 10
    cv = cross_validation.ShuffleSplit(nFeatures, n_iter=niter, test_size=0.2, random_state=rnd)
    mean_auc = 0.0
    i = 0
    for train, test in cv:
        xtrain = X.ix[train]
        ytrain = y[train]
        xtest = X.ix[test]
        ytest = y[test]

        xtrain_sp = Xsp[train]
        xtest_sp = Xsp[test]  # bug fix: slice the training matrix, not X_testsp
        ytrainsp = ysp[train]

        xtrain = getRFX(xtrain)
        xtest = getRFX_test(xtest)
        xtrain = xtrain[:, 1:]
        xtest = xtest[:, 1:]

        print "fitting random forest...."
        model_rf.fit(xtrain, ytrain)
        preds_rf = model_rf.predict_proba(xtest)[:, 1]

        print "fitting logistic regression..."
        model_lr.fit(xtrain_sp, ytrainsp)
        preds_lr = model_lr.predict_proba(xtest_sp)[:, 1]

        preds = (preds_rf + preds_lr) / 2.0  # simple average blend of the two models

        fpr, tpr, _ = metrics.roc_curve(ytest, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, niter, roc_auc)
        mean_auc += roc_auc
        i += 1
    print "Mean AUC: ", mean_auc / niter