Example #1
from sklearn import tree  # scikit-learn import needed for DecisionTreeClassifier below

# dc, metrics, and helper are project-specific helper modules; test_frame, train_frame,
# and train_data come from data-cleaning steps that are not shown in this snippet.
test_data = dc.convertPandasDataFrameToNumpyArray(test_frame)
dc.describeDataframe(test_frame)

decision_tree = tree.DecisionTreeClassifier(max_depth=4)
train_x = train_data[:, 1:]
train_y = train_data[:, 0]

decision_tree.fit(train_x, train_y)
cv_score = metrics.crossValidationScore(decision_tree, train_x, train_y, cvCount=5)

xTrain, xTest, yTrain, yTest = metrics.traintestSplit(train_x, train_y, randomState=1)
cv_tree = tree.DecisionTreeClassifier(max_depth=4)
cv_tree.fit(xTrain, yTrain)
y_predict = cv_tree.predict(xTest)
ta = metrics.trainingAccuracy(yTest, y_predict)
rmse = metrics.rmse(yTest, y_predict)
nrmse = metrics.nrmse(yTest, y_predict)

predictors = dc.getColNames(train_frame)[1:]
kfoldAccuracy = metrics.measureKFoldAccuracy(train_frame, decision_tree, predictors,
                                             outputClass="Survived", outputClause="Survived",
                                             kFolds=10)

print("Max Cross Validation Score : ", cv_score.max(), "\nAverage Cross Validation Score : ", cv_score.mean(),
  "\nDecisionTreeClassifier Score : ", decision_tree.score(xTrain, yTrain),
  "\nTraining Accuracy : ", ta,
  "\nRoot Mean Squared Error : ", rmse, "\nNormalized RMSE : ", nrmse,
  "\nKFold Accuracy : ", kfoldAccuracy)

featureNames = dc.getColNames(train_frame)[1:]
helper.printFeatureImportances(featureNames, decision_tree.feature_importances_)

final_y_pred = decision_tree.predict(test_data[:, 1:])
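
For reference, the dc/metrics/helper calls above appear to wrap standard pandas and scikit-learn functionality. Below is a minimal plain-scikit-learn sketch of the same workflow; the train.csv file name, the all-numeric columns, and Survived being the first column are assumptions, not something the snippet confirms.

# Reference sketch only -- plain pandas/scikit-learn equivalents of the helpers above.
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

train_frame = pd.read_csv("train.csv")                    # hypothetical input file
train_data = train_frame.to_numpy()                       # ~ dc.convertPandasDataFrameToNumpyArray
train_x, train_y = train_data[:, 1:], train_data[:, 0]    # assumes Survived is the first column

decision_tree = tree.DecisionTreeClassifier(max_depth=4)
cv_score = cross_val_score(decision_tree, train_x, train_y, cv=5)   # ~ metrics.crossValidationScore

x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, random_state=1)
decision_tree.fit(x_train, y_train)
y_predict = decision_tree.predict(x_test)

accuracy = accuracy_score(y_test, y_predict)                        # ~ metrics.trainingAccuracy
rmse = mean_squared_error(y_test, y_predict) ** 0.5                 # ~ metrics.rmse
nrmse = rmse / (y_test.max() - y_test.min())                        # ~ metrics.nrmse (one common definition)

for name, importance in zip(train_frame.columns[1:], decision_tree.feature_importances_):
    print(name, ":", importance)                                    # ~ helper.printFeatureImportances
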
Example #2
# The grid-search code below is kept disabled in the original source by wrapping it in a
# string literal; the opening triple quote is restored here to match the closing one below.
"""
if __name__ == "__main__":
    params = {"max_depth" : [3, 4, 5, 6, 7, 8],
              "n_estimators" : [100, 200, 300, 400],
              "learning_rate" : [0.01, 0.05, 0.1, 0.2, 0.5, 1]}
    clf = GridSearchCV(crossvalidationTree, params, verbose=1, n_jobs=2, cv=10)
    clf.fit(trainX, trainY)

    print("GridSearch : \n", "Best Estimator : ", clf.best_estimator_,
        "\nBest Params : ", clf.best_params_, "\nBest Score : ", clf.best_score_)
"""

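# Reference sketch only (not part of the original example): one way to run the disabled
# grid search above end to end. The estimator type is an assumption -- the snippet never
# shows how crossvalidationTree is constructed, but the n_estimators/learning_rate grid
# suggests a gradient-boosting model -- and synthetic data plus a reduced grid keep the
# sketch self-contained.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier   # assumed estimator type
from sklearn.model_selection import GridSearchCV, train_test_split

demo_x, demo_y = make_classification(n_samples=400, n_features=8, random_state=1)  # stand-in data
demo_x_train, _, demo_y_train, _ = train_test_split(demo_x, demo_y, random_state=1)

demo_params = {"max_depth" : [3, 4], "n_estimators" : [100, 200], "learning_rate" : [0.05, 0.1]}
search = GridSearchCV(GradientBoostingClassifier(), demo_params, verbose=1, n_jobs=2, cv=10)
search.fit(demo_x_train, demo_y_train)

# The winning settings can then be fed back into the model the rest of the script trains.
tuned_model = GradientBoostingClassifier(**search.best_params_)
print("Best Params : ", search.best_params_, "\nBest Score : ", search.best_score_)
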
crossvalidationTree.fit(xTrain, yTrain)

yPredict = crossvalidationTree.predict(xTest)

#trainingAccuracy = metrics.trainingAccuracy(yTest, yPredict)
rmse = Metrics.rmse(yTest, yPredict)
nrmse = Metrics.nrmse(yTest, yPredict)

for i, x in enumerate(yPredict):
    if yPredict[i] < 0:
        #print("Yactucal : ", yTest[i], " Ypredict : ", yPredict[i])
        yPredict[i] = -yPredict[i]

logloss = Metrics.rmsle(yTest, yPredict)  # despite the variable name, this is RMSLE, not log loss
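
# Reference sketch only (not part of the original example): the standard RMSLE formula
# that Metrics.rmsle presumably wraps -- its own definition is not shown in this snippet.
# The sign-flip loop above keeps predictions non-negative so the log1p terms stay defined.
import numpy as np

def rmsle_sketch(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

print(rmsle_sketch(np.array([3.0, 5.0, 2.5]), np.array([2.5, 5.0, 3.0])))  # toy values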

print(
    "Max Cross Validation Score : ",
    crossvalidation.max(),
    "\nAverage Cross Validation Score : ",
    crossvalidation.mean(),
    #"\nGradient Boosting Forest Score : ", crossvalidationTree.score(xTrain, yTrain),
Example #3
# Grid-search block disabled via a string literal in the original source (opening quotes restored).
"""
if __name__ == "__main__":
    params = {"max_depth" : [3, 4, 5, 6, 7, 8],
              "n_estimators" : [50, 100, 150, 200, 250, 300, 350, 400],
              "learning_rate" : [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.5, 1],
              "gamma" : [0.1, 0.2, 0.5, 1]}
    clf = GridSearchCV(crossvalidationTree, params, verbose=1, n_jobs=4, cv=10)
    clf.fit(xTrain, yTrain)

    print("GridSearch : \n", "Best Estimator : ", clf.best_estimator_,
        "\nBest Params : ", clf.best_params_, "\nBest Score : ", clf.best_score_)
"""

evalSet = [(xTrain, yTrain), (xTest, yTest)]
crossvalidationTree.fit(xTrain, yTrain, eval_set=evalSet, eval_metric="auc")
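
# Reference sketch only (not part of the original example): the eval_set / eval_metric="auc"
# arguments above match the XGBoost scikit-learn wrapper, so crossvalidationTree is presumably
# an xgboost.XGBClassifier -- an assumption, since the snippet never shows its constructor.
# Synthetic data keeps the sketch self-contained; in recent xgboost releases eval_metric is
# passed to the constructor rather than to fit.
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

demo_x, demo_y = make_classification(n_samples=400, n_features=8, random_state=1)  # stand-in data
x_tr, x_te, y_tr, y_te = train_test_split(demo_x, demo_y, random_state=1)

booster = xgb.XGBClassifier(max_depth=4, n_estimators=200, learning_rate=0.1, eval_metric="auc")
booster.fit(x_tr, y_tr, eval_set=[(x_tr, y_tr), (x_te, y_te)], verbose=False)

# evals_result() holds the per-round AUC for each entry in eval_set.
print("final test AUC : ", booster.evals_result()["validation_1"]["auc"][-1])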

predictedY = crossvalidationTree.predict(xTest)

trainingAccuracy = metric.trainingAccuracy(yTest, predictedY)
rmse = metric.rmse(yTest, predictedY)
nrmse = metric.nrmse(yTest, predictedY)

predictors = dataclean.getFeatureNames()[2:]
kfoldAccuracy = metric.measureKFoldAccuracy(trainFrame,
                                            crossvalidationTree,
                                            predictors,
                                            outputClass="Survived",
                                            outputClause="Survived",
                                            kFolds=10)

print("Max Cross Validation Score : ",
      crossvalidation.max(), "\nAverage Cross Validation Score : ",
      crossvalidation.mean(), "\nTraining Accuracy : ", trainingAccuracy,
      "\nRoot Mean Squared Error : ", rmse, "\nNormalized RMSE : ", nrmse,
      "\nKFold Accuracy : ", kfoldAccuracy)
Example #4
# Grid-search block disabled via a string literal in the original source (opening quotes restored).
"""
if __name__ == "__main__":
    params = {"max_depth" : [2, 3, 4, 5, 6, 7, 8],
              "max_features" : [None, 2, 3, 4, 5, 6, 7, 8, 9, "auto", "log2"],
              "presort" : [True, False]}
    clf = GridSearchCV(decisionTree, params, verbose=1, n_jobs=4, cv=10)
    clf.fit(xTrain, yTrain)

    print("GridSearch : \n", "Best Estimator : ", clf.best_estimator_,
        "\nBest Params : ", clf.best_params_, "\nBest Score : ", clf.best_score_)

"""

decisionTree.fit(xTrain, yTrain)

predictedY = decisionTree.predict(xTest)

trainingAccuracy = Metrics.trainingAccuracy(yTest, predictedY)
rmse = Metrics.rmse(yTest, predictedY)
nrmse = Metrics.nrmse(yTest, predictedY)

predictors = dataclean.getFeatureNames()[2:]
kfoldAccuracy = Metrics.measureKFoldAccuracy(trainFrame,
                                             decisionTree,
                                             predictors,
                                             outputClass="Survived",
                                             outputClause="Survived",
                                             kFolds=10)
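
# Reference sketch only (not part of the original example): measureKFoldAccuracy is a
# project-specific helper; this is a plain scikit-learn approximation of the 10-fold
# accuracy it presumably reports. The train.csv file name, the all-numeric feature
# columns, and max_depth=4 are assumptions; the Survived target comes from the call above.
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

frame = pd.read_csv("train.csv")                               # hypothetical input file
feature_cols = [c for c in frame.columns if c != "Survived"]   # assumed feature selection

folds = KFold(n_splits=10, shuffle=True, random_state=1)
fold_scores = cross_val_score(DecisionTreeClassifier(max_depth=4),
                              frame[feature_cols], frame["Survived"],
                              cv=folds, scoring="accuracy")
print("KFold Accuracy : ", fold_scores.mean())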

print("Max Cross Validation Score : ",
      crossvalidation.max(), "\nAverage Cross Validation Score : ",
      crossvalidation.mean(), "\nDecisionTreeClassifier Score : ",
      decisionTree.score(xTrain, yTrain), "\nTraining Accuracy : ",
      trainingAccuracy, "\nRoot Mean Squared Error : ", rmse,