Example #1
    def get_single_models(self):
        """
        function to get best single models
        :return:
        """
        self.single_models = {}
        self.rf = randomForest(self.params["rf"], self.X_train, self.y_train)
        self.ada = adaBoost(self.params["ada"], self.X_train, self.y_train)
        self.svc = svcClf(self.params["svc"], self.X_train, self.y_train)
        self.lr = logisticRegression(self.params["lr"], self.X_train,
                                     self.y_train)

        self.single_models[self.rf.get_model_name()] = self.rf.get_best_model()
        logging.info("Best parameters for random forest: {}\n".format(
            self.rf.best_params))

        self.single_models[
            self.ada.get_model_name()] = self.ada.get_best_model()
        logging.info("Best parameters for ada boost: {}\n".format(
            self.ada.best_params))

        self.single_models[
            self.svc.get_model_name()] = self.svc.get_best_model()
        logging.info("Best parameters for svc: {}\n".format(
            self.svc.best_params))

        self.single_models[self.lr.get_model_name()] = self.lr.get_best_model()
        logging.info("Best parameters for logistic regression: {}\n".format(
            self.lr.best_params))
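The wrapper classes used here (randomForest, adaBoost, svcClf, logisticRegression) are not part of the snippet. A minimal sketch of what such a wrapper might look like, assuming scikit-learn's GridSearchCV underneath; only the constructor signature and the attributes used above come from the snippet, the rest is an assumption:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

class randomForest:
    def __init__(self, params, X_train, y_train):
        # exhaustive grid search over the supplied hyperparameter grid
        self.search = GridSearchCV(RandomForestClassifier(), params, cv=5)
        self.search.fit(X_train, y_train)
        self.best_params = self.search.best_params_

    def get_model_name(self):
        return "random_forest"

    def get_best_model(self):
        return self.search.best_estimator_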
Example #2
import statistics  # statistics.mean is used below; read_libsvm_default and randomForest come from the surrounding project

def crossValidateRandomForest():
    # capture numFeatures from the first fold so randomForest below is well-defined,
    # and pass it to the remaining reads so all feature matrices share a width
    f1Inputs, f1Labels, numFeatures = read_libsvm_default('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm_default('data/data_semeion/folds/fold2', numFeatures)
    f3Inputs, f3Labels, _ = read_libsvm_default('data/data_semeion/folds/fold3', numFeatures)
    f4Inputs, f4Labels, _ = read_libsvm_default('data/data_semeion/folds/fold4', numFeatures)
    f5Inputs, f5Labels, _ = read_libsvm_default('data/data_semeion/folds/fold5', numFeatures)
    allFoldInputArrays = [f1Inputs.toarray(), f2Inputs.toarray(),
                          f3Inputs.toarray(), f4Inputs.toarray(), f5Inputs.toarray()]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    forestSizes = [10, 50, 100]

    bestForestSize = None
    bestAccuracy = 0

    everyAccuracy = []

    for forestSize in forestSizes:
        allAccuracies = []
        for i in range(len(allFoldInputArrays)):
            allTrainData = []
            allTrainLabels = []
            for j in range(len(allFoldInputArrays)):
                if j != i:
                    allTrainData.extend(allFoldInputArrays[j])
                    allTrainLabels.extend(allFoldLabelArrays[j])

            print("Hyperparameters: forest size: " + str(forestSize))

            tempforest = randomForest(numFeatures, forestSize)
            tempforest.train(allTrainData, allTrainLabels)
            # evaluate on the held-out fold
            accuracy = tempforest.evaluate(allFoldInputArrays[i], allFoldLabelArrays[i])
            allAccuracies.append(accuracy)
            everyAccuracy.append(accuracy)

        meanAccuracy = statistics.mean(allAccuracies)
        if meanAccuracy > bestAccuracy:
            bestAccuracy = meanAccuracy
            bestForestSize = forestSize

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best forest size: " + str(bestForestSize))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
Example #3
import numpy as np  # used by the discretization step; read_libsvm_default and randomForest come from the surrounding project

## Setup Data:
trainingInputs, trainingLabels, numFeatures = read_libsvm_default(
    'data/data-splits/data.train')
testInputs, testLabels, _ = read_libsvm_default('data/data-splits/data.test',
                                                numFeatures)
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()


## Discretize data:
def discretizeData(nonDiscreteArr):
    # threshold each feature at its column mean: 0 if <= mean, else 1
    means = np.mean(nonDiscreteArr, axis=0)
    for i in range(len(nonDiscreteArr)):
        for j in range(len(nonDiscreteArr[i])):
            if nonDiscreteArr[i][j] <= means[j]:
                nonDiscreteArr[i][j] = 0
            else:
                nonDiscreteArr[i][j] = 1
    return nonDiscreteArr


# note: only the training inputs are discretized; the test set below is evaluated on raw features
discretizeData(trainingInputsArr)

hachi = randomForest(numFeatures, 50)
hachi.train(trainingInputsArr, trainingLabels)
print("training set: ")
print(hachi.evaluate(trainingInputsArr, trainingLabels))
print("test set: ")
print(hachi.evaluate(testInputsArr, testLabels))
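The per-element discretization loop above can be vectorized with NumPy. This sketch applies the same column-mean threshold but returns a new array instead of modifying its argument in place:

import numpy as np

def discretize_data_vectorized(arr):
    # 1 where a value exceeds its column mean, 0 otherwise
    return (arr > np.mean(arr, axis=0)).astype(arr.dtype)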
Example #4
# plot ROC curve for test set
perform_lr.roc_auc_curve(title="Logistic Regression Classifier (ROC)")

# plot confusion matrix
cm_lr = confusion_matrix(y_test, y_test_pred_lr)
perform_lr.confusion_matrix(cm_lr, title="Logistic Regression Classifier (Confusion Matrix)")


#########################################
# Random Forest
print("Running Random Forest Classifier...")

params_rf = {"n_estimators": [10, 30, 50, 70, 90, 100, 120, 160, 200, 240], 
             "min_samples_split": [2, 4, 6, 8], 
             "min_samples_leaf": [1, 2, 3, 4]}
rf = randomForest(params_rf, X_train, y_train)
best_rf = rf.get_best_model()
logging.info("Best parameters for random forest: {}\n".format(rf.best_params))
best_rf.fit(X_train, y_train)
y_train_pred_rf = best_rf.predict(X_train)
y_test_pred_rf = best_rf.predict(X_test)

AUC_train_rf = multiclass_roc_auc_score(y_train, y_train_pred_rf)
AUC_test_rf = multiclass_roc_auc_score(y_test, y_test_pred_rf)

print("AUC for training set is: " + str(AUC_train_rf))
print("AUC for test set is: " + str(AUC_test_rf))

logging.info("AUC of {} on training data: {}".format("random forest", AUC_train_rf))
logging.info("AUC of {} on test data: {}".format("random forest", AUC_test_rf))
Example #5
def rf(data, labels, test_features=None):

    from random_forest import randomForest
    from helpers import helpers as hp
    from decision_tree import decisionTree
    import pandas as pd
    h = hp()
    rf = randomForest()
    dt = decisionTree()

    data = pd.concat([data, labels], axis=1)

    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []

    foldSize = int(data.shape[0] / 5)
    for i in range(5):
        print("Running iteration " + str(i+1) + " of k cross validation")
        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
        # train on the other four folds (DataFrame.append is removed in recent pandas)
        trainData = pd.concat([data.loc[:foldSize*i-1], data.loc[foldSize*(i+1):]])
        forest = rf.forest(trainData)
        target = testData.iloc[:, -1].values.tolist()
        predicted = rf.predictForest(testData.iloc[:, :-1], forest)
        models.append(forest)
        # per-fold metrics, so the aggregate report below has data to work with
        truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(predicted, target)
        accuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
        tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
        tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))

    h.calculateMetrics(accuracy, precision, recall, f_score)

    # collect predictions from each stored model on the held-out test features
    predicted = pd.DataFrame()
    for root in models:
        pred = dt.predictData(test_features, root)
        predicted = pd.concat([predicted, pd.DataFrame(pred)], axis=1)

    print(predicted)

    # majority vote across the fold models for every test row
    p = []
    for idx, row in predicted.iterrows():
        p.append(row.value_counts().index[0])

    print(p)

    return p
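The value_counts loop at the end is a majority vote across the fold models. On a purely categorical prediction frame, DataFrame.mode gives the same result without an explicit loop; a small sketch on hypothetical data:

import pandas as pd

predicted = pd.DataFrame({"m1": [0, 1, 1], "m2": [0, 1, 0], "m3": [1, 1, 0]})
# column 0 of mode(axis=1) holds the most frequent label in each row
majority = predicted.mode(axis=1)[0].tolist()
print(majority)  # [0, 1, 0]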
Example #6
    def random_forest(self, kCrossValidation):
        print("\nRunning Random Forest Classifier ....................\n")
        from random_forest import randomForest
        from helpers import helpers as hp
        import pandas as pd
        h = hp()
        fileName = h.get_fileName()
        filePath = "../Data/" + fileName + ".txt"
        # filePath = "CSE-601/project3/Data/"+fileName+".txt"
        data, labels = h.readData(filePath)
        data = h.oneHotEncoding(data, labels)
        rf = randomForest()

        try:
            numTrees = int(input("\nEnter number of trees: "))
            numFeatures = int(input("Enter number of features to consider: "))
        except ValueError:
            print("\nExecution Failed - Wrong Input")
            exit()

        accuracy = []
        precision = []
        recall = []
        f_score = []
        models = []

        foldSize = int(data.shape[0] / kCrossValidation)
        for i in range(kCrossValidation):
            print("Running iteration " + str(i + 1) +
                  " of k cross validation .....")
            testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
            # train on every fold except the current one (append is removed in recent pandas)
            trainData = pd.concat(
                [data.loc[:foldSize * i - 1], data.loc[foldSize * (i + 1):]])
            forest = rf.forest(trainData,
                               numTrees=numTrees,
                               numFeatures=numFeatures)
            target = testData.iloc[:, -1].values.tolist()
            predicted = rf.predictForest(testData.iloc[:, :-1], forest)
            models.append(forest)
            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                predicted, target)
            accuracy.append(
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives))
            tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                           falsePositives, falseNegatives)
            tmpRecall = h.findRecall(truePositives, trueNegatives,
                                     falsePositives, falseNegatives)
            precision.append(tmpPrecision)
            recall.append(tmpRecall)
            f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))

        print("\nMetrics on train data with k-cross validation")
        h.calculateMetrics(accuracy, precision, recall, f_score)

        fileName = input(
            "\nEnter test data file name without extension (if no test file, just press enter): "
        )
        if fileName != '':
            filePath = "../Data/" + fileName + ".txt"
            # filePath = "CSE-601/project3/Data/"+fileName+".txt"
            testData, testLabels = h.readData(filePath)
            testData = h.oneHotEncoding(testData, testLabels)
            predLabels = []
            for forest in models:
                predLabels.append(rf.predictForest(testData, forest))
            predLabels = pd.DataFrame(predLabels)
            pred = []
            for _, colData in predLabels.items():
                pred.append(colData.value_counts().index[0])
            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                pred, testData.iloc[:, -1].values.tolist())
            accuracy = [
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives)
            ]
            precision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
            recall = h.findRecall(truePositives, trueNegatives, falsePositives,
                                  falseNegatives)
            f_score = [h.findFMeasure(precision, recall)]
            print("\nMetrics on test data with bagging")
            h.calculateMetrics(accuracy, [precision], [recall], f_score)
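The helpers module is not included in the snippet. Its metric helpers presumably implement the standard confusion-matrix formulas, sketched below for reference; the snake_case names are illustrative, not the project's actual API:

def find_accuracy(tp, tn, fp, fn):
    return (tp + tn) / (tp + tn + fp + fn)

def find_precision(tp, tn, fp, fn):
    return tp / (tp + fp) if (tp + fp) else 0.0

def find_recall(tp, tn, fp, fn):
    return tp / (tp + fn) if (tp + fn) else 0.0

def find_f_measure(precision, recall):
    # harmonic mean of precision and recall
    return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0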