Example #1
 def fit(self, X_train, y_train):
     """Grow self.N candidate trees and keep the self.M most accurate ones.

     Each candidate is built from a bootstrapped sample of the stitched
     training table, using a random subset of self.F attribute names, and
     is scored on the second table returned alongside that sample.

     Args:
         X_train (list of list of obj): Training rows. The first row is
             removed in place (see the note in the body).
         y_train (list of obj): Labels, paired with X_train by position.

     Side effects:
         Sets self.X_train, self.y_train and self.forest. self.forest is a
         list of dicts with keys 'atts', 'tree' and 'accuracy', sorted by
         descending accuracy and truncated to self.M entries.
     """
     self.X_train = X_train
     self.y_train = y_train

     # Default attribute names, one per column of the first row.
     header = ['att' + str(i) for i in range(len(self.X_train[0]))]

     # NOTE(review): self.X_train aliases X_train, so this also removes the
     # first row from the caller's list. The rows are then paired with
     # y_train by position below; if y_train does not have a matching
     # leading entry removed, every label is shifted by one -- confirm.
     del self.X_train[0]

     # Stitch each remaining row to its label (label in the last column).
     train = [row + [y_train[i]] for i, row in enumerate(X_train)]

     forest = []
     for _ in range(self.N):
         tree = {}
         # NOTE(review): header[:-1] / header[-1] treat the last generated
         # name as special even though header was built from X_train's
         # columns only -- confirm this matches what get_tree expects.
         tree['atts'] = myutils.compute_random_subset(header[:-1], self.F)
         train_set, test_set = myutils.compute_bootstrapped_sample(train)
         tree['tree'] = self.get_tree(train_set,
                                      tree['atts'] + [header[-1]])
         tree['accuracy'] = self.compute_tree_accuracy(tree, test_set)
         # Without this append the forest stayed empty and self.forest was
         # always [] regardless of N.
         forest.append(tree)

     # Best first, then keep the top M.
     ranked = sorted(forest, key=itemgetter('accuracy'), reverse=True)
     self.forest = ranked[:self.M]
def test_random_forest_fit():
    """Smoke-test MyRandomForestClassifier on the 14-row interview dataset.

    Bootstraps the table, holds out a third of the sample, fits the forest,
    prints accuracy / error rate, and finally asserts that every held-out
    prediction matches its label. Because the classifier is randomized, the
    final assertions are not guaranteed to pass on every run.
    """
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [["Senior", "Java", "no", "no", "False"],
                       ["Senior", "Java", "no", "yes", "False"],
                       ["Mid", "Python", "no", "no", "True"],
                       ["Junior", "Python", "no", "no", "True"],
                       ["Junior", "R", "yes", "no", "True"],
                       ["Junior", "R", "yes", "yes", "False"],
                       ["Mid", "R", "yes", "yes", "True"],
                       ["Senior", "Python", "no", "no", "False"],
                       ["Senior", "R", "yes", "no", "True"],
                       ["Junior", "Python", "yes", "no", "True"],
                       ["Senior", "Python", "yes", "yes", "True"],
                       ["Mid", "Python", "no", "yes", "True"],
                       ["Mid", "Java", "yes", "no", "True"],
                       ["Junior", "Python", "no", "yes", "False"]]
    myutils.prepend_attribute_label(interview_table, interview_header)

    # NOTE(review): the continuation line of this call was missing from the
    # source; `data=` is the presumed keyword -- confirm against
    # MyPyTable.__init__.
    interview_pytable = MyPyTable(column_names=interview_header,
                                  data=interview_table)
    y_col = interview_pytable.get_column("interviewed_well", False)
    x_cols = interview_pytable.drop_col("interviewed_well")

    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(x_cols, y_col)
    X_train, X_test, y_train, y_test = myutils.train_test_split(
        X_sample, y_sample, .33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    # Tally hits and misses separately; previously the miss counter was
    # bumped on every hit because the `else:` line was absent.
    numCorrectPredictions = 0
    numWrongPredictions = 0
    for i in range(len(y_test)):
        if y_predicted[i] == y_test[i]:
            numCorrectPredictions += 1
        else:
            numWrongPredictions += 1

    total = numCorrectPredictions + numWrongPredictions
    accuracy = np.round(numCorrectPredictions / total, 3)
    error_rate = np.round(numWrongPredictions / total, 3)

    print("Accuracy and Error Rate")
    print("Random Forest: accuracy = {}, error rate = {}".format(
        accuracy, error_rate))
    print(
        "Because of the random aspect of this classifier, this will not always pass the tests"
    )
    print("Predicted table: " + str(y_predicted))
    print("Testing set:     " + str(y_test))
    for i in range(len(y_test)):
        assert y_predicted[i] == y_test[i]
    def fit(self, X_train, y_train):
        """Fit a forest of decision trees and keep the self.M most accurate.

        NOTE(review): several physical lines appear to be missing from this
        block (a closing bracket, the call that builds each tree, the
        accuracy bookkeeping). The code is left untouched; each gap is
        flagged where it occurs.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Side effects:
            Stores X_train / y_train on self and replaces self.trees with the
            selected trees.
        """
        # Default attribute names "att0", "att1", ... - one per column of the first row.
        header = ['att' + str(i) for i in range(len(X_train[0]))]
        # Map each attribute name to whatever myutils.unique_index returns for
        # that column (presumably the distinct values - confirm in myutils).
        attribute_domains = {}
        for i, val in enumerate(header):
            attribute_domains[val] = myutils.unique_index(X_train, i)

        self.X_train = X_train
        self.y_train = y_train
        # Hold out a third of the data (shuffled) to score the trees later.
        sample_X_train, sample_x_test, sample_y_train, sample_y_test = myevaluation.train_test_split(
            X_train, y_train, test_size=0.33, shuffle=True)
        # Stitch each training row to its label (label in the last column).
        train = [
            sample_X_train[i] + [sample_y_train[i]]
            for i in range(len(sample_X_train))
        # NOTE(review): the closing "]" of this list is missing here.

        for _ in range(self.N):
            available_attributes = header.copy()
            # NOTE(review): the opening of the tree-building call is missing;
            # the line below is only its argument tail. Nothing visible
            # initialises self.trees either - confirm where that happens.
                    available_attributes, attribute_domains, header, self.F))

        accuracies = []
        for tree in self.trees:
            header = ['att' + str(i) for i in range(len(sample_x_test[0]))]
            # Predict every held-out row with this tree.
            prediction = []
            for row in sample_x_test:
                prediction.append(myutils.tdidt_predict(header, tree, row))
            # Fraction of held-out rows predicted correctly.
            accuracy = 0
            for i in range(len(prediction)):
                if prediction[i] == sample_y_test[i]:
                    accuracy += 1
            accuracy /= len(sample_y_test)
            # NOTE(review): `accuracy` is never stored in `accuracies` in the
            # visible code - an append appears to be missing.
        # find m most accurate
        m_trees = []
        for i in range(len(accuracies)):
        # NOTE(review): the body of the loop above is missing. The indexing
        # below (accuracies[...][1]) implies each entry is paired with its
        # tree index before sorting - confirm.
        accuracies = sorted(accuracies)
        # Walk the ascending-sorted list from the end to take the top M.
        for i in range(self.M):
            m_trees.append(self.trees[accuracies[-(i + 1)][1]])
        self.trees = m_trees
    def fit(self, X_tr, y_tr):
        """Fit a random forest: grow self.N bootstrapped TDIDT trees, keep the best self.M.

        The data is split once into a training part and a held-out part.
        Every tree is built from a bootstrapped sample of the training part
        and scored by how many held-out instances it predicts correctly.

        Args:
            X_tr(list of list of obj): The list of training instances (samples).
                The shape of X_tr is (n_train_samples, n_features)
            y_tr(list of obj): The target y values (parallel to X_tr)
                The shape of y_tr is n_train_samples

        Side effects:
            Sets self.header (default names "att0", "att1", ...), fills
            self.domain (which must already be a dict) with the distinct
            values of each attribute, and stores the selected trees in
            self.best_M_trees, most accurate first.
        """
        X_train, X_test, y_train, y_test = myutils.random_stratified_split(X_tr, y_tr)

        # Header, domains and the stitched table do not depend on the tree
        # being built, so compute them once instead of once per tree.
        self.header = ["att" + str(col) for col in range(len(X_train[0]))]
        for col, att_name in enumerate(self.header):
            seen = []
            for row in X_train:
                if row[col] not in seen:
                    seen.append(row[col])
            self.domain[att_name] = seen

        # Stitch X_train and y_train together (label in the last column).
        train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]

        trees = []
        for _ in range(self.N):
            bootstrapped_train = myutils.compute_bootstrapped_sample(train)
            # tdidt gets its own copy of the attribute list for each tree.
            available_attributes = self.header.copy()
            trees.append(self.tdidt(bootstrapped_train, available_attributes))

        # Score each tree by its number of correct held-out predictions.
        performances = []
        for tree in trees:
            correct = 0
            for instance, actual in zip(X_test, y_test):
                if self.tdidt_predict(self.header, tree, instance) == actual:
                    correct += 1
            performances.append(correct)

        # Rank tree indices by score, best first. Sorting indices with a key
        # avoids comparing the trees themselves on ties, and reverse=True
        # fixes the previous ascending order that kept the *worst* M trees.
        ranked = sorted(range(len(trees)),
                        key=lambda idx: performances[idx],
                        reverse=True)
        self.best_M_trees = [trees[idx] for idx in ranked[:self.M]]
Example #5
    def fit(self, X_train, y_train, X_test, y_test):
        """Fits many decision tree classifiers to X_train and y_train and keeps the M best.

        The caller is expected to have already split off the test data.
        self.N trees are each fit on a bootstrapped sample of the training
        data, scored on (X_test, y_test), and the self.M highest-scoring
        trees are stored for majority voting in predict.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            X_test(list of list of obj): The list of testing samples, used to determine accuracy of trees
            y_test(list of obj): The target y values (parallel to X_test), used to determine accuracy of trees

        Side effects:
            Sets self.X_train, self.y_train and self.trees.
        """
        self.X_train = X_train
        self.y_train = y_train

        # Build N trees, each from its own bootstrapped sample.
        N_trees = []
        for _ in range(self.N):
            new_X_train, new_y_train = myutils.compute_bootstrapped_sample(X_train, y_train)
            tree = MyDecisionTreeClassifier()
            tree.fit(new_X_train, new_y_train, is_forest=True)
            N_trees.append(tree)

        # Score every tree on the held-out test data.
        accuracy_list = []
        for tree in N_trees:
            predict_list = tree.predict(X_test)
            correct = sum(
                1 for predicted, actual in zip(predict_list, y_test)
                if predicted == actual)
            accuracy_list.append(
                myutils.calculateAccuracy(correct, len(y_test) - correct))

        # Keep the M best. Clamp M to the number of trees: argpartition
        # raises when asked for more elements than exist, and a slice of
        # [-0:] would silently keep everything.
        keep = min(self.M, len(N_trees))
        if keep <= 0:
            self.trees = []
        else:
            M_indexes = np.argpartition(accuracy_list, -keep)[-keep:]
            self.trees = [N_trees[index] for index in M_indexes]
Example #6
    def fit(self, X_train, y_train, M=7, N=20, F=2):
        """Fits a random forest classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        N trees are grown by bagging: each one is built from a bootstrapped
        sample of the remainder set and scored on the remainder rows that
        were left out of that sample. The M most accurate trees are kept.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            M(int): Number of trees to keep.
            N(int): Number of candidate trees to grow.
            F(int): Passed through to myutils.tdidt_random_forest.

        Side effects:
            Stores deep copies of the inputs on self and sets self.trees to a
            list of {"accuracy": float, "tree": tree} dicts, best first.
        """
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)

        # Split off a test set; copies keep the caller's rows untouched when
        # labels are appended below.
        X_remainder, X_test, y_remainder, y_test = myevaluation.train_test_split(
            copy.deepcopy(X_train), copy.deepcopy(y_train))

        # Stitch each label onto its row (label in the last column). The
        # validation code below relies on this for the remainder rows.
        for i, label in enumerate(y_remainder):
            X_remainder[i].append(label)
        # NOTE(review): the body of this loop was missing from the source;
        # mirroring the remainder stitching is the presumed intent.
        for i, label in enumerate(y_test):
            X_test[i].append(label)

        # Generate N random decision trees using bagging.
        trees = []
        for _ in range(N):
            sample = myutils.compute_bootstrapped_sample(X_remainder)
            # Out-of-bag rows: remainder rows that did not land in the sample.
            validation_set = [row for row in X_remainder if row not in sample]

            available_attributes = myutils.get_available_attributes(sample)
            tree = myutils.tdidt_random_forest(
                sample, list(range(len(sample[0]) - 1)),
                available_attributes, F)

            if validation_set:
                header = [
                    "att" + str(col)
                    for col in range(len(validation_set[0]) - 1)
                ]
                hits = sum(
                    1 for row in validation_set
                    if myutils.tdidt_predict(header, tree, row[:-1]) == row[-1])
                accuracy = hits / len(validation_set)
            else:
                # No out-of-bag rows to score against: rank this tree last
                # instead of crashing on an empty validation set.
                accuracy = 0.0

            trees.append({"accuracy": accuracy, "tree": tree})

        # Get the best M of N trees.
        trees = sorted(trees, key=lambda k: k["accuracy"], reverse=True)
        self.trees = trees[:M]
Example #7
    def fit(self, X_train, y_train):
        """Grow self.N bootstrapped trees, score each on its left-out rows, and rank them.

        NOTE(review): several physical lines appear to be missing from this
        block (bodies of an `if` and of several loops, including the final
        one). The code is left untouched; each gap is flagged where it
        occurs.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
        """
        # fit() accepts X_train and y_train
        # TODO: calculate the attribute domains dictionary

        # NOTE(review): the body of this `if` is missing (presumably it seeds
        # a random number generator with self.seed - confirm).
        if (self.seed != None):
        n_trees = []
        accuracies = []
        for i in range(self.N):
            header = []
            attribute_domains = {}
            #loops through X_train and creates header
            for i in range(len(X_train[0])) :
                header.append("att" + str(i))

            #loops though header to form attribute domains dictionary
            # NOTE(review): `count` is never incremented in the visible code,
            # so every attribute would read column 0 - an increment line
            # appears to be missing.
            count = 0
            for item in header:
                curr_col = myutils.get_column(X_train, count)
                values, counts = myutils.get_frequencies(curr_col)
                attribute_domains[item] = values

            #stitching together X_train and y_train and getting available attributes
            train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]
            available_attributes = header.copy()

            boot_train = myutils.compute_bootstrapped_sample(train)

            # Rows of `train` that did not land in the bootstrapped sample.
            validation_set = []
            for row in train:
                if row not in boot_train:
                # NOTE(review): the body of this `if` is missing (presumably
                # it appends `row` to validation_set).

            #forming tree
            tree = myutils.tdidt_forest(boot_train, available_attributes, attribute_domains, header, self.F)

            tree_dict = {}
            tree_dict["tree"] = tree
            y_test = []
            for row in validation_set:
            # NOTE(review): the body of the loop above is missing (presumably
            # it collects each row's label into y_test).
            y_predict = myutils.predict_tree(validation_set, tree)

            acc = myutils.get_accuracy(y_predict, y_test)
            tree_dict["acc"] = acc
            # NOTE(review): tree_dict is never added to n_trees in the visible
            # code - an append appears to be missing.

        # Rank the candidate trees by accuracy, best first.
        sorted_trees = sorted(n_trees, key=lambda k: k['acc'], reverse=True)
        # NOTE(review): the body of this loop is missing, so the attribute
        # that receives the top self.M trees cannot be determined from here.
        for i in range(self.M):
Example #8
    def fit(self, X_train, y_train):
        """Fit self.N decision trees on bootstrapped data and keep the self.M most accurate.

        For each tree: bootstrap the stitched table, split it into train and
        validation parts, pick self.F attribute indices, fit a
        MyDecisionTreeClassifier on the training part restricted to those
        attributes, and score it on the validation part.

        Args:
            X_train (list of list of obj): Training instances.
            y_train (list of obj): Labels, parallel to X_train.

        Side effects:
            Sets self.X_train, self.y_train, self.best_m_trees (fitted
            classifiers, best first) and self.M_attr_sets (the sorted
            attribute indices each kept tree was trained on, same order).
        """
        self.X_train = X_train
        self.y_train = y_train

        # Stitch together X_train and y_train so the label is the right-most column.
        train = [row + [self.y_train[i]] for i, row in enumerate(self.X_train)]

        trees = []
        attr_sets = []
        tree_accuracies = []
        for _ in range(self.N):
            # Bootstrap whole rows, then peel the label column back off.
            bootstrapped_table = myutils.compute_bootstrapped_sample(train)
            bootstrapped_y = myutils.get_column_by_index(
                bootstrapped_table, -1)
            bootstrapped_X = myutils.remove_column(bootstrapped_table, -1)

            tree_X_train, tree_X_validation, tree_y_train, tree_y_validation = myutils.train_test_split(
                bootstrapped_X, bootstrapped_y, 1 / 3)

            # Randomly choose F attribute indices for this tree.
            num_attributes = len(tree_X_train[0])
            attr_indices = sorted(
                myutils.generate_F_indices(num_attributes, self.F))

            subsetted_tree_X_train = myutils.attribute_subset_table(
                tree_X_train, attr_indices)
            # The validation rows must go through the same transformation as
            # the training rows, otherwise the tree is evaluated on columns
            # it was not trained on.
            subsetted_tree_X_validation = myutils.attribute_subset_table(
                tree_X_validation, attr_indices)

            decision_tree = MyDecisionTreeClassifier()
            decision_tree.fit(subsetted_tree_X_train, tree_y_train)

            # Calculate tree accuracy using the validation set.
            predicted = decision_tree.predict(subsetted_tree_X_validation)
            match_count = 0
            for index, prediction in enumerate(predicted):
                if prediction == tree_y_validation[index]:
                    match_count += 1
            accuracy = match_count / len(predicted)

            # Record the results; the ranking below zips these three lists.
            trees.append(decision_tree)
            attr_sets.append(attr_indices)
            tree_accuracies.append(accuracy)

        # Rank (tree, accuracy, attribute set) triples by accuracy, best first.
        ranked = sorted(zip(trees, tree_accuracies, attr_sets),
                        key=lambda entry: entry[1],
                        reverse=True)
        best = ranked[:self.M]

        self.best_m_trees = [entry[0] for entry in best]
        self.M_attr_sets = [entry[2] for entry in best]
Example #9
    def fit(self, X_train, y_train):
        ''' Fits the random forest model to a given training set

        self.N decision trees are each fit on a bootstrapped sample
        restricted to a random subset of self.F attributes and scored on the
        training rows that did not land in that sample. The self.M most
        accurate trees are kept.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                    The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train).
                The shape of y_train is n_samples

        Side effects:
            Stores deep copies of the inputs on self, sets self.learners and
            self.accuracies (parallel lists, best first), and increments
            self.seed twice per learner when a seed is set.
        '''
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)
        self.learners = []
        self.accuracies = []

        # generate N learners
        for _ in range(self.N):

            # create the bootstrap sample
            if self.seed is not None:
                X_sample, y_sample = myutils.compute_bootstrapped_sample(self.X_train, self.y_train, self.seed)
                self.seed += 1 # increment the seed so not all the trees are the same
            else:
                X_sample, y_sample = myutils.compute_bootstrapped_sample(self.X_train, self.y_train)

            # create the validation set from the rows left out of the sample;
            # walking rows and labels together keeps the right label even
            # when two training rows are identical (list.index would always
            # return the first match).
            held_out = [(x, y) for x, y in zip(self.X_train, self.y_train)
                        if x not in X_sample]
            X_val = [x for x, _ in held_out]
            y_val = [y for _, y in held_out]

            # get only a random subset of attributes for each sample
            values = list(range(len(self.X_train[0]))) # num of items in header

            if self.seed is not None:
                F_attributes = myutils.compute_random_subset(values, self.F, self.seed)
                self.seed += 1 # increment the seed so not all the trees are the same
            else:
                F_attributes = myutils.compute_random_subset(values, self.F)

            # NOTE(review): F_attributes is not recorded with the learner in
            # this method; confirm how predict knows which columns each tree
            # expects.

            # keep only the chosen attributes (original column order) in the
            # training sample and in the validation set
            X_sample = [[value for j, value in enumerate(row) if j in F_attributes]
                        for row in X_sample]
            X_val = [[value for j, value in enumerate(row) if j in F_attributes]
                     for row in X_val]

            # build a decision tree from the sample
            tree = MyDecisionTreeClassifier()
            tree.fit(X_sample, y_sample)

            # test the tree's accuracy on the validation set
            y_pred = tree.predict(X_val)

            self.learners.append(tree)
            self.accuracies.append(myutils.compute_accuracy(y_pred, y_val))

        # get only the best M learners: rank learner indices by accuracy,
        # highest first (an ascending sort followed by [:M] would keep the
        # worst M), and rebuild both lists in that order so they stay parallel
        ranked = sorted(range(len(self.learners)),
                        key=lambda idx: self.accuracies[idx],
                        reverse=True)[:self.M]
        self.learners = [self.learners[idx] for idx in ranked]
        self.accuracies = [self.accuracies[idx] for idx in ranked]