Example No. 1
import MLScripts.Metrics as metrics
import MLScripts.Helpers as helper
import csv

# The snippet also relies on two imports not shown on this page; plausible forms
# (the MLScripts.DataClean path is an assumption, sklearn's tree module is certain):
from sklearn import tree
import MLScripts.DataClean as dc  # assumed module path for the dc helpers used below

train_frame = dc.dfCleanData(dc.loadTrainData(desc=False))
train_data = dc.convertPandasDataFrameToNumpyArray(train_frame)
test_frame = dc.dfCleanDataTest(dc.loadTestData())
test_data = dc.convertPandasDataFrameToNumpyArray(test_frame)
dc.describeDataframe(test_frame)

decision_tree = tree.DecisionTreeClassifier(max_depth=4)
train_x = train_data[:, 1:]
train_y = train_data[:, 0]

decision_tree.fit(train_x, train_y)
cv_score = metrics.crossValidationScore(decision_tree, train_x, train_y, cvCount=5)

xTrain, xTest, yTrain, yTest = metrics.traintestSplit(train_x, train_y, randomState=1)
cv_tree = tree.DecisionTreeClassifier(max_depth=4)
cv_tree.fit(xTrain,yTrain)
y_predict = cv_tree.predict(xTest)
ta = metrics.trainingAccuracy(yTest, y_predict)
rmse = metrics.rmse(yTest, y_predict)
nrmse = metrics.nrmse(yTest, y_predict)

predictors = dc.getColNames(train_frame)[1:]
kfoldAccuracy = metrics.measureKFoldAccuracy(train_frame, decision_tree, predictors, outputClass="Survived", outputClause="Survived", kFolds=10)

print("Max Cross Validation Score : ", cv_score.max(), "\nAverage Cross Validation Score : ", cv_score.mean(),
  "\nExtraTreeCLassifier Score : ", decision_tree.score(xTrain, yTrain),
  "\nTraining Accuracy : ", ta,
Example No. 2

# (snippet truncated at the top: randomForest, trainX, trainY, testData and the
#  imports, including sklearn's cross_val_score, are defined earlier in the source file)
#randomForest.fit(trainX, trainY)

testX = testData[:, 1:]
#resultsY = randomForest.predict(testX)
"""
Cross Validation
"""
# Cross Validation
cvCount = 10
crossvalidation = cross_val_score(randomForest,
                                  trainX,
                                  trainY,
                                  cv=cvCount,
                                  scoring="accuracy")
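# Illustrative addition (not in the original snippet): cross_val_score returns
# one accuracy per fold, so a quick summary is:
print("CV accuracy : ", crossvalidation.mean(), " +/- ", crossvalidation.std())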

xTrain, xTest, yTrain, yTest = metric.traintestSplit(trainX, trainY)
"""
accuracyScores = []
depths = range(1, 26)
for depth in depths:
    icvTree = xgb.XGBClassifier(max_depth=depth, n_estimators=100, nthread=4, seed=0)
    icvTree.fit(xTrain, yTrain)
    iyPreds = icvTree.predict(xTest)
    accuracyScores.append(metric.trainingAccuracy(yTest, iyPreds))
sns.plt.plot(depths, accuracyScores, alpha=0.7)
sns.plt.xlabel("Depth Values")
sns.plt.ylabel("Accuracy Scores")
sns.plt.show()
"""

crossvalidationTree = xgb.XGBClassifier(n_estimators=100)  # remaining constructor arguments are truncated in the source
Example No. 3
# (snippet truncated at the top: trainFrame, testFrame, trainData and the imports,
#  including dataclean, Metrics, ensemble and GridSearchCV, are defined earlier in the source file)
testData = dataclean.convertPandasDataFrameToNumpyArray(testFrame)

trainX = trainData[:, 1:]
trainY = trainData[:, 0]

testX = testData[:, 1:]
"""
Cross Validation
"""
crossvalidationTree = ensemble.RandomForestRegressor(n_estimators=500,
                                                     n_jobs=4,
                                                     random_state=1)
cvCount = 10
crossvalidation = Metrics.crossValidationScore(
    ensemble.GradientBoostingRegressor(random_state=1),
    trainX,
    trainY,
    cvCount=cvCount)

xTrain, xTest, yTrain, yTest = Metrics.traintestSplit(trainX,
                                                      trainY,
                                                      randomState=1)
"""
#{'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.01}

if __name__ == "__main__":
    params = {"max_depth" : [3,4,5,6,7,8], "n_estimators" : [100, 200, 300, 400], "learning_rate" : [0.01, 0.05, 0.1, 0.2, 0.5, 1]}
    clf = GridSearchCV(crossvalidationTree, params, verbose=1, n_jobs=2, cv=10)
    clf.fit(trainX, trainY)

    print("GridSearch : \n", "Best Estimator : ", clf.best_estimator_,
Example No. 4
import csv
import BikeSharingDemand.Combine.Model as model
import MLScripts.Metrics as Metrics

trainFrame = model.cleanTrainset(model.loadTrainData())
trainData = model.convertPandasDataFrameToNumpyArray(trainFrame)

trainX = trainData[:, 3:]
trainYCasReg = trainData[:, 0:2]  # [casual, registered]

xTrain, xTest, yTrain, yTest = Metrics.traintestSplit(trainX, trainYCasReg)

#xgboost = model.selectXGBoost()
#xgboost2 = model.selectXGBoost()

boostCount = 8
xgboosts = [model.selectXGBoost() for _ in range(boostCount)]
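# model.Combiner (used below) is not shown on this page; presumably it fits each
# booster and pools their per-target predictions into a single estimate.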

combinedRegressor = model.Combiner(xgboosts)
combinedRegressor.fit(xTrain, yTrain)

yPred = combinedRegressor.predict(xTest)

# recombine the two target columns into the total demand (casual + registered)
y = [casual + registered for casual, registered in yTest]

rmsle = Metrics.rmsle2(y, yPred)

print("RMSLE Score : ", rmsle)
"""
Example No. 5
# (snippet truncated at the top: the imports, including dataclean, Metrics, xgb,
#  ensemble and tree, are defined earlier in the source file)
trainFrame = dataclean.cleanDataset(dataclean.loadTrainData())
trainData = dataclean.convertPandasDataFrameToNumpyArray(trainFrame)

testFrame = dataclean.cleanDataset(dataclean.loadTestData(), True)
testData = dataclean.convertPandasDataFrameToNumpyArray(testFrame)

trainX = trainData[:, 1:]
trainY = trainData[:, 0]

testX = testData[:, 1:]

"""
Cross Validation
"""
xTrain, xTest, yTrain, yTest = Metrics.traintestSplit(trainX, trainY, randomState=1)

xgbtree = xgb.XGBRegressor(n_estimators=500, learning_rate=0.01, max_depth=10, seed=1, nthread=4)
# note: presort was deprecated and later removed from scikit-learn; drop it on current versions
gbtree = ensemble.GradientBoostingRegressor(n_estimators=400, learning_rate=0.01, max_depth=6, random_state=1, presort=True)
randomforest = ensemble.RandomForestRegressor(n_estimators=500, n_jobs=4, random_state=1)
decisionTree = tree.DecisionTreeRegressor(presort=True)

xgbtree.fit(trainX, trainY)
gbtree.fit(trainX, trainY)
randomforest.fit(trainX, trainY)
decisionTree.fit(trainX, trainY)

xgbPredict = xgbtree.predict(testX)
gbPredict = gbtree.predict(testX)
randomforestPredict = randomforest.predict(testX)
decisionTreePredict = decisionTree.predict(testX)
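The hold-out split created above is never consumed in the snippet; a sketch of how it could validate one of the models before predicting on testX (illustrative addition, not part of the original):

import numpy as np
from sklearn.metrics import mean_squared_error

holdoutModel = ensemble.RandomForestRegressor(n_estimators=500, n_jobs=4, random_state=1)
holdoutModel.fit(xTrain, yTrain)
holdoutRmse = np.sqrt(mean_squared_error(yTest, holdoutModel.predict(xTest)))
print("Holdout RMSE : ", holdoutRmse)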
Example No. 6
# (snippet truncated at the top: it opened inside a disabled block whose tail read
#  "n_estimators=100, learning_rate=0.1, random_state=600)"; the surrounding
#  definitions and imports appear earlier in the source file)

trainX = trainData[:, 2:]
trainY = trainData[:, 1]

#randomForest.fit(trainX, trainY)

testX = testData[:, 1:]
#resultsY = randomForest.predict(testX)
"""
Cross Validation
"""
# Cross Validation
cvCount = 10
crossvalidation = Metrics.crossValidationScore(randomForest,
                                               trainX,
                                               trainY,
                                               cvCount=cvCount)

xTrain, xTest, yTrain, yTest = Metrics.traintestSplit(trainX,
                                                      trainY,
                                                      randomState=1)
"""
accuracyScores = []
rngs = metrics.frange(0.01, 0.2)
for rng in rngs:
    icvTree = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=rng, max_depth=4, random_state=1,
                                                  warm_start=True)
    icvTree.fit(xTrain, yTrain)
    iyPreds = icvTree.predict(xTest)
    accuracyScores.append(metrics.trainingAccuracy(yTest, iyPreds))
sns.plt.plot(rngs, accuracyScores, alpha=0.7)
sns.plt.xlabel("Learning Rate Values")
sns.plt.ylabel("Accuracy Scores")
sns.plt.show()
"""
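metrics.frange is the project's float-range helper and is not shown on this page; numpy provides the same sequence directly (the 0.01 step is an assumption, since frange's default is not visible):

import numpy as np
rngs = np.arange(0.01, 0.2, 0.01)  # assumed step size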