    def test_all_file(self):
        options = {
            'df': pd.read_csv("benchmark.csv", sep=';'),
            'label_column': "Joga",
            'n_trees': 5,
            'bootstrap_size': 10
        }
        tr = RandomForest()
        model = tr.train(options)

        for _, row in options['df'].iterrows():
            target_label = row["Joga"]
            predicted = model.predict(row.drop("Joga"))
            self.assertEqual(target_label, predicted)

    def test_benchmark(self):
        options = {
            'df': pd.read_csv("benchmark.csv", sep=';'),
            'label_column': "Joga",
            'n_trees': 5,
            'bootstrap_size': 10
        }
        tr = RandomForest()
        model = tr.train(options)

        inf_data = pd.Series(
            ["Ensolarado", "Quente", "Normal", "Verdadeiro"],
            index=["Tempo", "Temperatura", "Umidade", "Ventoso"],
            name="InferenceData")
        self.assertEqual(model.predict(inf_data), 'Sim')
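# Note on the benchmark data used above: the column names are Portuguese
# ("Tempo" = weather, "Temperatura" = temperature, "Umidade" = humidity,
# "Ventoso" = windy); "Joga" ("plays") is the label column and the expected
# prediction "Sim" means "yes".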
def compare(binary=0, dataset_index=0, samples=10):
    """Cross-validate the custom classifiers on one of the bundled sklearn datasets."""
    datasets = [load_iris(), load_breast_cancer(), load_wine(), load_digits()]
    dataset = datasets[dataset_index]

    if binary == 1:
        print("Binary comparison only compares data from classes 0 and 1")
        x, y = make_two_class(dataset.data, dataset.target)
    else:
        x = dataset.data
        y = dataset.target

    #classifier = Tree()
    #name = "Tree"
    #run_cross_validation(name, classifier, x, y, samples)

    classifier = RandomForest()
    name = "RandomForest"
    run_cross_validation(name, classifier, x, y, samples)

    if binary == 1:
        classifier = AdaBoost()
        name = "AdaBoost"
        run_cross_validation(name, classifier, x, y, samples)
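# A minimal usage sketch (the argument values are arbitrary; only the signature is
# taken from compare above): cross-validate RandomForest and AdaBoost on the
# two-class version of the breast-cancer dataset with 10 samples.
compare(binary=1, dataset_index=1, samples=10)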
def manual_validation(type,
                      dataset,
                      samples,
                      stopping_criterion=0,
                      n_trees=10,
                      max_features=None,
                      n_stumps=100):
    """
    This is a manual version of the cross validation
    test. I'm using this so I can output both training
    and test error on a per iteration basis, used for
    the exploration functions
    """
    average_train = 0
    average_test = 0
    average_time = 0
    print("\nTrain\tTest\tTrain Time")
    for i in range(samples):
        data = dataset.data
        target = dataset.target
        if type == "adaboost":
            data, target = make_two_class(data, target)
        x_train, x_test, y_train, y_test = train_test_split(data,
                                                            target,
                                                            test_size=0.2)
        if type == "tree":
            classifier = Tree()
        elif type == "randomforest":
            classifier = RandomForest(n_trees=n_trees,
                                      max_features=max_features)
        else:
            classifier = AdaBoost(n_stumps=n_stumps)

        tic = time.perf_counter()
        if type == "tree":
            classifier.fit(x_train, y_train, stopping_criterion)
        else:
            classifier.fit(x_train, y_train)

        toc = time.perf_counter()
        train_score = classifier.score(x_train, y_train)
        test_score = classifier.score(x_test, y_test)

        average_train += train_score
        average_test += test_score
        time_diff = toc - tic
        average_time += time_diff

        print("%0.2f\t%0.2f\t%0.4f" %
              (train_score * 100, test_score * 100, time_diff))

    print("\n=== Averages ===")
    print("%0.2f\t%0.2f\t%0.4f" %
          (average_train * 100 / samples, average_test * 100 / samples,
           average_time / samples))
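# A minimal usage sketch (the dataset choice and argument values are assumptions;
# the signature is taken from manual_validation above):
from sklearn.datasets import load_iris
manual_validation("randomforest", load_iris(), samples=5, n_trees=10, max_features=2)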
def train_random_forest(train, test, model_parameters=[5, 10, 1, 1.0]):
    n_trees = int(model_parameters[0])
    max_depth = int(model_parameters[1])
    min_size = int(model_parameters[2])
    # sample_size is a bagging ratio (default 1.0), so keep it as a float;
    # int() would truncate a fractional ratio such as 0.8 down to 0.
    sample_size = float(model_parameters[3])
    num_classifier = len(train)
    predicted_label = numpy.zeros((num_classifier, test.shape[0]))
    predicted_max_label = numpy.zeros((test.shape[0]))
    # Train one binary random forest per pair of classes (one-vs-one) and
    # accumulate each forest's votes for the two classes it was trained on.
    for i in range(num_classifier):
        for j in range(i):
            randomforest = RandomForest(train[i], train[j])
            label_1, label_2 = randomforest.evaluate_algorithm(
                test[:, 0:-1], max_depth, min_size, sample_size, n_trees,
                (train[0].shape[1] - 1))
            predicted_label[i, :] = predicted_label[i, :] + label_1
            predicted_label[j, :] = predicted_label[j, :] + label_2
    # For every test sample, pick a class with the highest vote count
    # (ties keep the last matching class index).
    compare_matrix = (predicted_label == numpy.max(predicted_label, axis=0))
    for i in range(compare_matrix.shape[1]):
        for j in range(compare_matrix.shape[0]):
            if compare_matrix[j][i] == 1:
                predicted_max_label[i] = j
    print_report('Random forest', test[:, -1], predicted_max_label)
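# Usage note (an inference from the code above, not documented in the original):
# `train` is expected to be a list with one training array per class, since the
# nested loops build a pairwise (one-vs-one) ensemble of binary forests, while
# `test` is a single 2-D array whose last column holds the true labels.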
Example #6
def main():

    # Instance of RandomForest class with 100 decision trees.
    rf = RandomForest(100, "SSH")

    # Split dataset into training and testing data randomly.
    train_features, test_features, train_labels, test_labels = rf.splitDataset("Dataset/Bruteforce/SSH.csv")

    print('Training Features Shape:', train_features.shape)
    print('Training Labels Shape:', len(train_labels))
    print('Testing Features Shape:', test_features.shape)

    # Train the model and compute metrics.
    rf.trainModel(train_features, train_labels)
    rf.computeMetrics(test_features, test_labels)
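# Typical entry point (an assumption; the original snippet only defines main()):
if __name__ == "__main__":
    main()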
Example #7
def successful_example():
    forest = RandomForest("mushrooms",
                          n_boostrap=50,
                          n_features=10,
                          test_size=0.2,
                          n_trees=10,
                          tree_max_depth=10
                          )

    forest.test_model()
    forest.print_forest()
    print("Successful forest accuracy " + str(forest.accuracy * 100) + "%")
Example #8
def fail_example():
    forest = RandomForest("diabetes",
                          n_boostrap=50,
                          n_features=8,
                          test_size=0.2,
                          n_trees=20,
                          tree_max_depth=10
                          )

    forest.test_model()
    forest.print_forest()
    print("Unsuccessful forest accuracy " + str(forest.accuracy * 100) + "%")
Example #9
# decision tree
tree = DecisionTree(5, train_data.shape[0])
tree.train(train_data, train_label)
res = tree.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)


# random forest

forest = RandomForest(100, 5, train_data.shape[0], 6)
forest.train(train_data, train_label)
res = forest.predict(validation_data)

score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)
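# The manual accuracy loop above recurs throughout these examples; a small helper
# along these lines (the function name is an assumption, not part of the original
# code) computes the same value:
def accuracy(predictions, labels):
    # Fraction of predictions that match the corresponding true labels.
    return sum(p == t for p, t in zip(predictions, labels)) / len(predictions)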


# write to csv
# with open('titanic_prediction.csv', 'wt') as f:
#     writer = csv.writer(f, delimiter=',')
#     writer.writerow(['Id', 'Category'])
Example #10
from KFoldValidation import KFoldValidation
from randomForest import RandomForest
import pandas as pd
import random
import numpy as np

seed = 5
np.random.seed(seed)
random.seed(seed)

df = pd.read_csv('datasets/iris.data')

options = {
    'train_algorithm': RandomForest(),
    'df': df,
    'label_column': 'Y',
    'num_folds': 5,
    'n_trees': 15,
    'bootstrap_size': 2,
}

runner = KFoldValidation()
runner.train_with_kfold(options)
import pandas as pd
import numpy as np
from randomForest import RandomForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

train_x = pd.read_csv('./data/x_train.csv')
train_y = pd.read_csv('./data/y_train.csv')

train_data = pd.merge(train_x, train_y)
forest = RandomForest(depth=5, min_sample_leaf=13, min_gini=0.001, n_tree=20)
train_set, eval_set = train_test_split(train_data, test_size=0.2)
forest.fit(train_set)
result = forest.predict(eval_set)
forest.save()

print('ac ', accuracy_score(eval_set['label'], result))
print('precision ', precision_score(eval_set['label'], result))
print('recall ', recall_score(eval_set['label'], result))
print('f1_score ', f1_score(eval_set['label'], result))
Example #12
tree = DecisionTree(10, train_data.shape[0])
tree.train(train_data, train_label)
# Predict only the first validation row, so the score below is computed over a single sample.
res = tree.predict(validation_data[:1, :])


score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)


# random forest

rf = RandomForest(10, 10, train_data.shape[0], train_data.shape[1])
rf.train(train_data, train_label)
res = rf.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)

# with open('titanic_prediction.csv', 'wt') as f:
#     writer = csv.writer(f, delimiter=',')
#     writer.writerow(['Id', 'Category'])
#     for i, cat in enumerate(res):
#         writer.writerow([str(i + 1), str(cat)])
indices = np.arange(150)
np.random.shuffle(indices)
train_idx, test_idx = indices[:100], indices[100:]
Xtrain, Ytrain = X[train_idx], Y[train_idx]
Xtest, Ytest = X[test_idx], Y[test_idx]

# test of decision tree
# model = dtc(6,10)
# model.build_tree(Xtrain,Ytrain)
# print('-------------------------------------------------------------------------------------------------------')

# predicted_classes = model.predict(Xtest)

# score = 0
# for i in range(len(predicted_classes)):
#     score += (predicted_classes[i] == Ytest[i])

# print("the algorithm has an accuracy of ", score / len(predicted_classes))

# test of random forest
m = RandomForest(3, 2, 5)
m.build_forest(Xtrain, Ytrain)

predicted_classes = m.predict(Xtest)

score = 0
for i in range(len(predicted_classes)):
    score += (predicted_classes[i] == Ytest[i])

print("The algorithm has an accuracy of", score / len(predicted_classes))
Example #14
import pandas as pd
import numpy as np
from randomForest import RandomForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

train_x = pd.read_csv('./data/x_train.csv')
train_y = pd.read_csv('./data/y_train.csv')

train_data = pd.merge(train_x, train_y)
forest = RandomForest(depth=5, min_sample_leaf=13, min_gini=0.001, n_tree=20)
train_set, eval_set = train_test_split(train_data, test_size=0.2)
forest.load()
result = forest.predict(eval_set)

print('ac ', accuracy_score(eval_set['label'], result))
print('precision ', precision_score(eval_set['label'], result))
print('recall ', recall_score(eval_set['label'], result))
print('f1_score ', f1_score(eval_set['label'], result))
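# Note: precision_score, recall_score and f1_score are called with their default
# average='binary' above, which assumes a two-class 'label' column; a multiclass
# label would need an explicit average= argument (e.g. average='macro').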