Esempio n. 1
def bagging(X, Y, N, M, F):
    """Build N bootstrapped decision trees and return the best M of them.

    Args:
        X: feature rows of the full dataset.
        Y: labels parallel to X.
        N: number of bootstrap samples (one tree is built per sample).
        M: number of trees to keep; forwarded to ``best_M``.
        F: forwarded to the tree's ``fit`` call (presumably the number of
            candidate attributes per split -- confirm against the classifier).

    Returns:
        list: the tree classifiers selected by ``best_M``.

    NOTE(review): the extracted source had lost several statements (the
    ``fit`` call, the validation-set appends, ``forest.append`` and the
    final selection loop body).  They are restored here from the surrounding
    data flow; confirm against the upstream project.
    """
    # 1. Split the dataset into a test set and a "remainder" set.  The test
    #    portion is not used by this function.
    x_remainder, _x_test, y_r, _y_test = myevaluation.train_test_split(X, Y)

    # 2. Using the remainder set, draw N bootstrap samples and build one
    #    classifier per sample:
    #        ~63% of the remainder set will be sampled into the training set
    #        ~37% will be left over as this tree's validation set
    forest = []
    accuracies = {}
    for i in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)

        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)
        forest.append(tree)

        # Rows of the remainder that were never drawn form the validation set.
        x_v = []
        y_v = []
        for row, label in zip(x_remainder, y_r):
            if row not in x_train:
                x_v.append(row)
                y_v.append(label)

        pred = tree.predict(x_v)
        accuracies[str(i)] = get_accuracy(y_v, pred)  # {"i": accuracy, ...}

    # 3. Measure each tree on its validation set and select the best M of N.
    #    NOTE(review): assumes best_M returns a mapping keyed by the same
    #    stringified tree indexes used in `accuracies`.
    best_trees_dict = best_M(M, accuracies)
    return [forest[int(key)] for key in best_trees_dict]
Esempio n. 2
def tune_parameters(M, N, F, dataset):
    """Print random-forest accuracy over five random train/test splits.

    Predictions and true labels are compared after mapping both through
    ``get_useful_bin``.

    Args:
        M: forwarded to the forest's ``fit`` call.
        N: forwarded to the forest's ``fit`` call.
        F: forwarded to ``select_random_attributes``.
        dataset: the table to tune on.

    Returns:
        None.  Results are printed, one line per split.

    NOTE(review): the extracted source was truncated in three places.  The
    second argument of ``select_random_attributes``, the construction of
    ``remainder`` and the ``fit`` call are reconstructed -- confirm against
    the upstream project.
    """
    print("M =", M, "N =", N, "F =", F)
    # `dataset` is the only otherwise-unused parameter, so it is taken to be
    # the missing second argument.
    adjusted_dataset = select_random_attributes(F, dataset)
    for i in range(5):
        X, y = split_x_y_train(adjusted_dataset)
        x_train, x_test, y_train, y_test = myevaluation.train_test_split(
            X, y, shuffle=True)

        # Training rows with their label appended.  New lists are built so
        # the rows returned by the split are not mutated between iterations.
        remainder = [row + [label] for row, label in zip(x_train, y_train)]

        myRF = MyRandomForestClassifier()
        myRF.fit(remainder, M, N)
        y_predict_rf = myRF.predict(x_test)

        count = sum(
            1 for predicted, actual in zip(y_predict_rf, y_test)
            if get_useful_bin(predicted) == get_useful_bin(actual))

        total = len(y_predict_rf)
        accuracy = count / total
        error = (total - count) / total
        print(i, "-- accuracy =", accuracy, "error =", error)
Esempio n. 3
    def fit(self, X_train, y_train):
        """Fits a forest of TDIDT decision trees and keeps the most accurate ones.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Builds self.N trees with myutils.tdidt_random_forest, scores each
                one on a held-out 33% split, and stores the self.M most
                accurate trees in the trees attribute.
            Attribute indexes are used to construct default attribute names
                (e.g. "att0", "att1", ...).
            NOTE(review): the extracted source was missing the docstring
                terminator, the closing bracket of `train`, the start of the
                tree-building call, and the accuracy bookkeeping.  Those
                parts are reconstructed here -- confirm against upstream.
        """
        header = ['att' + str(i) for i in range(len(X_train[0]))]
        attribute_domains = {
            name: myutils.unique_index(X_train, index)
            for index, name in enumerate(header)
        }

        self.X_train = X_train
        self.y_train = y_train
        sample_X_train, sample_x_test, sample_y_train, sample_y_test = myevaluation.train_test_split(
            X_train, y_train, test_size=0.33, shuffle=True)
        # Training rows with the class label appended as the last column.
        train = [
            row + [label]
            for row, label in zip(sample_X_train, sample_y_train)
        ]

        # Start from an empty forest so repeated fit() calls do not accumulate.
        # NOTE(review): `train` as the first argument is inferred (it is
        # otherwise unused); verify the helper's signature.
        self.trees = []
        for _ in range(self.N):
            available_attributes = header.copy()
            self.trees.append(
                myutils.tdidt_random_forest(
                    train, available_attributes, attribute_domains, header, self.F))

        # Score every tree on the held-out split.  The test rows have the
        # same columns as X_train, so `header` is reused.
        accuracies = []
        for tree in self.trees:
            correct = sum(
                1 for row, actual in zip(sample_x_test, sample_y_test)
                if myutils.tdidt_predict(header, tree, row) == actual)
            accuracies.append(correct / len(sample_y_test))

        # find m most accurate: rank (accuracy, tree index) pairs best-first.
        # Slicing copes with self.M exceeding the number of trees.
        ranked = sorted(
            ((accuracy, index) for index, accuracy in enumerate(accuracies)),
            reverse=True)
        self.trees = [self.trees[index] for _, index in ranked[:self.M]]
Esempio n. 4
def bagging(X, Y, N, M, F):
    """Build a bagged forest, keep the best M trees, and score them by majority vote.

    Args:
        X: feature rows of the full dataset.
        Y: labels parallel to X.
        N: number of bootstrap samples (one tree is built per sample).
        M: number of trees to keep; forwarded to ``best_M``.
        F: forwarded to the tree's ``fit`` call (presumably the number of
            candidate attributes per split -- confirm against the classifier).

    Returns:
        tuple: ``(best_trees, voted_predictions, forest_accuracy)`` where
        ``voted_predictions`` holds one majority-vote label per test row and
        ``forest_accuracy`` is ``get_accuracy(y_test, voted_predictions)``.

    NOTE(review): the extracted source had lost several statements (the
    ``fit`` call, the validation-set appends, ``forest.append``, the
    selection loop body, the ``range`` argument and the append to
    ``voted_predictions``).  They are restored here from the surrounding
    data flow; confirm against the upstream project.
    """
    # 1. split your dataset into a test set and a "remainder set"
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)

    # 2. using the remainder set, sample N bootstrap samples and use each one
    #    to build a classifier; for each sample:
    #        ~63% of the remainder set will be sampled into training set
    #        ~37% will be leftover for this tree's validation set
    forest = []
    accuracies = {}
    for i in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)

        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)
        forest.append(tree)

        # Rows of the remainder that were never drawn form the validation set.
        x_v = []
        y_v = []
        for row, label in zip(x_remainder, y_r):
            if row not in x_train:
                x_v.append(row)
                y_v.append(label)

        pred = tree.predict(x_v)
        accuracies[str(i)] = get_accuracy(y_v, pred)  # {"i": accuracy, ...}

    # 3. measure the performance of each tree on its validation set and
    #    select the best M of N trees.
    #    NOTE(review): assumes best_M returns a mapping keyed by the same
    #    stringified tree indexes used in `accuracies`.
    best_trees_dict = best_M(M, accuracies)
    best_trees = [forest[int(key)] for key in best_trees_dict]

    # 4. using majority voting, make predictions from the M learners for
    #    each instance in the test set.  Each tree contributes one row of
    #    predictions; the table is then read column by column.
    all_predictions = [tree.predict(x_test) for tree in best_trees]
    pred_header = build_header(all_predictions)
    pred_mypy = MyPyTable(pred_header, all_predictions)

    voted_predictions = []
    for i in range(len(x_test)):
        pred_col = pred_mypy.get_column(i)
        vals, counts = get_freq_str(pred_col)
        # First value with the highest count wins.
        voted_predictions.append(vals[counts.index(max(counts))])

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
def test_random_forest_fit():
    """Smoke-test the random forest on the interview dataset.

    Fits a forest on a shuffled split and checks that it returns one
    prediction per test row and gets at least one of them right.

    NOTE(review): the extracted source had lost the statements that fill
    ``remainder`` and the ``fit`` call itself; both are reconstructed from
    the leftover ``, 10, 100)`` fragment -- confirm against upstream.
    """
    # interview dataset
    table = [["Senior", "Java", "no", "no", "False"],
             ["Senior", "Java", "no", "yes", "False"],
             ["Mid", "Python", "no", "no", "True"],
             ["Junior", "Python", "no", "no", "True"],
             ["Junior", "R", "yes", "no", "True"],
             ["Junior", "R", "yes", "yes", "False"],
             ["Mid", "R", "yes", "yes", "True"],
             ["Senior", "Python", "no", "no", "False"],
             ["Senior", "R", "yes", "no", "True"],
             ["Junior", "Python", "yes", "no", "True"],
             ["Senior", "Python", "yes", "yes", "True"],
             ["Mid", "Python", "no", "yes", "True"],
             ["Mid", "Java", "yes", "no", "True"],
             ["Junior", "Python", "no", "yes", "False"]]

    X, y = myutils.split_x_y_train(table)
    x_train, x_test, y_train, y_test = myevaluation.train_test_split(
        X, y, math.floor(len(table) * 0.33), shuffle=True)

    # Training rows with their label appended as the last column.
    remainder = [row + [label] for row, label in zip(x_train, y_train)]

    myRF = MyRandomForestClassifier()
    myRF.fit(remainder, 10, 100)

    y_predicted = myRF.predict(x_test)

    assert len(y_predicted) == len(y_test)

    count = sum(
        1 for predicted, actual in zip(y_predicted, y_test)
        if predicted == actual)

    assert count != 0
Esempio n. 6
import pickle

import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.mypytable import MyPyTable

# Load the star dataset and convert each raw column into categorical values
# using the project's helper functions.
stars_table = myutils.load_data("Stars.csv")
temperature = myutils.temp_bins(stars_table.get_column('Temperature'))
L = myutils.luminosity_bins(stars_table.get_column('L'))
R = myutils.get_radius(stars_table.get_column('R'))
a_m = myutils.get_magnitude(stars_table.get_column('A_M'))
color = myutils.categorize_colors(stars_table.get_column('Color'))
spectral_class = myutils.get_spectral_class(stars_table.get_column('Spectral_Class'))
star_type = stars_table.get_column('Type')

# One feature row per star.  The extracted source was cut off inside
# `range(len(`; zipping the parallel columns avoids guessing which column's
# length was used.
x_vals = [
    [temp, str(lum), str(radius), str(magnitude), col, spec]
    for temp, lum, radius, magnitude, col, spec
    in zip(temperature, L, R, a_m, color, spectral_class)
]
y_vals = star_type

xtr, xts, ytr, yts = myevaluation.train_test_split(x_vals, y_vals)

# NOTE(review): the fit call was truncated to `MyDecisionTreeClassifier(), ytr)`
# in the extracted source; reconstructed as fit(xtr, ytr).
my_tree = MyDecisionTreeClassifier()
my_tree.fit(xtr, ytr)

predicted = my_tree.predict(xts)
accuracy = myutils.compute_accuracy(predicted, yts)
print('My Decision Tree: Accuracy =', round(accuracy * 100, 3), 'Error Rate = ', round((1-accuracy) * 100, 3))

# pickle classifier
with open("decision_tree.p", "wb") as fout:
    # NOTE(review): pkl_obj (the bare tree) is assigned but the whole
    # classifier is what gets pickled -- confirm which one loaders expect.
    pkl_obj = my_tree.tree
    pickle.dump(my_tree, fout)
Esempio n. 7
    def fit(self, X_train, y_train, N, M, F):
        ''' Fits a random forest to the training data

        Args: X_train: the data to train the random forest
                y_train: the labels parallel to X_train
                N: number of trees to be generated
                M: "best M" trees
                F: Number of attributes to select from

        Returns: the held-out test set, each row being features + [label]

        NOTE(review): the extracted source had lost the docstring terminator,
        the bodies that fill the validation set and label lists, the tree's
        fit call, and the bookkeeping that records and selects trees. They
        are reconstructed here from the data flow; confirm against upstream.
        '''
        self.N = N
        self.M = M
        self.F = F

        # split into remainder and test sets
        xRemainder, xTest, yRemainder, yTest = myeval.train_test_split(X_train, y_train)

        # piece together test and remainder sets; remainder rows also carry
        # their index at the end so that duplicate instances stay distinct
        testSet = [row + [label] for row, label in zip(xTest, yTest)]
        remainderSet = [
            row + [label] + [index]
            for index, (row, label) in enumerate(zip(xRemainder, yRemainder))
        ]

        allTrees = []
        allAccuracies = []
        for _ in range(N):
            copySet = copy.deepcopy(remainderSet)
            # create a bootstrap sample training set
            bootstrapSample = myutils.computeBootstrappedSample(copySet)

            # rows never drawn into the bootstrap sample validate this tree
            heldOut = [row for row in copySet if row not in bootstrapSample]

            # each row is [features..., label, index]: the label is at -2
            # and the features are everything before it
            validationLabels = [row[-2] for row in heldOut]
            validationSet = [row[:-2] for row in heldOut]
            yTrain = [row[-2] for row in bootstrapSample]
            xTrain = [row[:-2] for row in bootstrapSample]

            decisionTree = MyDecisionTreeClassifier()
            decisionTree.fit(xTrain, yTrain, F)
            predictions = decisionTree.predict(validationSet)

            allTrees.append(decisionTree)
            allAccuracies.append(
                myutils.determineAccuracy(predictions, validationLabels))

        # keep the M most accurate trees; the sort is stable, so ties keep
        # generation order, and slicing copes with M > N
        ranked = sorted(
            range(len(allTrees)),
            key=lambda index: allAccuracies[index],
            reverse=True)
        self.bestM = [allTrees[index] for index in ranked[:M]]

        return testSet
Esempio n. 8
    def fit(self, X_train, y_train, M=7, N=20, F=2):
        """Fits a random forest classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            M(int): number of best-scoring trees to keep
            N(int): number of trees to generate
            F(int): forwarded to myutils.tdidt_random_forest

        Notes:
            Stores the M best entries in self.trees; each entry is a dict
                with the keys "accuracy" and "tree".
            NOTE(review): the extracted source had lost the docstring
                terminator, the bodies that append labels to the rows, the
                validation-set append, and the `trees.append({...})` wrapper.
                They are reconstructed from the data flow; confirm upstream.
        """
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)

        # hold out a test split; copies are passed so the caller's rows are
        # not modified when labels are appended below
        X_remainder, X_test, y_remainder, y_test = myevaluation.train_test_split(
            copy.deepcopy(X_train), copy.deepcopy(y_train))

        # append each label to its row so rows are [features..., label]
        for row, label in zip(X_remainder, y_remainder):
            row.append(label)
        for row, label in zip(X_test, y_test):
            row.append(label)

        # generate N random decision trees using bagging
        trees = []
        for _ in range(N):
            # get the sample and validation sets
            sample = myutils.compute_bootstrapped_sample(X_remainder)
            validation_set = [row for row in X_remainder if row not in sample]

            # get the tree from the sample
            n_features = len(sample[0]) - 1  # last column is the label
            available_attributes = myutils.get_available_attributes(sample)
            tree = myutils.tdidt_random_forest(
                sample, list(range(n_features)), available_attributes, F)

            # test against the validation set
            header = ["att" + str(col) for col in range(n_features)]
            hits = [
                int(myutils.tdidt_predict(header, tree, row[:-1]) == row[-1])
                for row in validation_set
            ]

            trees.append({
                # an empty validation set scores 0.0 instead of dividing by 0
                "accuracy": sum(hits) / len(hits) if hits else 0.0,
                "tree": tree
            })

        # get the best M of N trees
        trees = sorted(trees, key=lambda k: k["accuracy"], reverse=True)
        self.trees = trees[:M]
Esempio n. 9
def test_decision_tree_classifier_fit():
    # Splits the interview data and asserts that a fitted object exposes
    # exactly M trees in its `best_M_trees` attribute.
    # NOTE(review): this test is incomplete as extracted. The split call
    # below ends in a stray `, y_tr)`, which looks like the tail of a lost
    # `interviewTest.fit(..., y_tr)` call. The statements that create
    # `interviewTest` are missing, and `interviewData`, `interviewClasses`
    # and `M` are not defined in the visible code. Restore them from the
    # upstream project before relying on this test.

    X_tr, X_t, y_tr, y_t = myevaluation.train_test_split(
        interviewData, interviewClasses), y_tr)
    # The forest is expected to keep exactly M trees after fitting.
    assert len(interviewTest.best_M_trees) == M