Example #1
    def fit(self, X_train, y_train):
        # store the training data (note: self.X_train aliases X_train)
        self.X_train = X_train
        self.y_train = y_train
        # build default attribute names "att0", "att1", ...
        header = ['att' + str(i) for i in range(len(self.X_train[0]))]
        # remove the prepended attribute-label row from the training data
        del self.X_train[0]
        # stitch X_train and y_train together so the label is the rightmost column
        train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]
        forest = []
        for _ in range(self.N):
            tree = {}
            # choose a random subset of F candidate attributes for this tree
            tree['atts'] = myutils.compute_random_subset(header[:-1], self.F)
            # bootstrap the train set; out-of-bag rows become the test set
            train_set, test_set = myutils.compute_bootstrapped_sample(train)
            tree['tree'] = self.get_tree(train_set,
                                         tree['atts'] + [header[-1]])
            tree['accuracy'] = self.compute_tree_accuracy(tree, test_set)
            forest.append(tree)
        # keep the M most accurate trees (itemgetter comes from the operator module)
        sorted_forest = sorted(forest, key=itemgetter('accuracy'), reverse=True)
        self.forest = sorted_forest[:self.M]
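
Most of these snippets lean on a myutils.compute_bootstrapped_sample helper whose signature varies from example to example. A minimal sketch of the two-value variant Example #1 calls (the bootstrap sample plus the out-of-bag rows as a test set) might look like this; the name matches the snippets, but the body is an assumption:

import random

def compute_bootstrapped_sample(table):
    # draw len(table) row indexes uniformly, with replacement
    n = len(table)
    sampled_indexes = [random.randrange(n) for _ in range(n)]
    sample = [table[i] for i in sampled_indexes]
    # rows never drawn (about a third on average) are "out of bag" and
    # serve as the test set for scoring this tree
    drawn = set(sampled_indexes)
    out_of_bag = [table[i] for i in range(n) if i not in drawn]
    return sample, out_of_bag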
Example #2
def test_random_forest_fit():
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [["Senior", "Java", "no", "no", "False"],
                       ["Senior", "Java", "no", "yes", "False"],
                       ["Mid", "Python", "no", "no", "True"],
                       ["Junior", "Python", "no", "no", "True"],
                       ["Junior", "R", "yes", "no", "True"],
                       ["Junior", "R", "yes", "yes", "False"],
                       ["Mid", "R", "yes", "yes", "True"],
                       ["Senior", "Python", "no", "no", "False"],
                       ["Senior", "R", "yes", "no", "True"],
                       ["Junior", "Python", "yes", "no", "True"],
                       ["Senior", "Python", "yes", "yes", "True"],
                       ["Mid", "Python", "no", "yes", "True"],
                       ["Mid", "Java", "yes", "no", "True"],
                       ["Junior", "Python", "no", "yes", "False"]]
    myutils.prepend_attribute_label(interview_table, interview_header)

    interview_pytable = MyPyTable(column_names=interview_header,
                                  data=interview_table)
    y_col = interview_pytable.get_column("interviewed_well", False)
    x_cols = interview_pytable.drop_col("interviewed_well")

    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(x_cols, y_col)
    X_train, X_test, y_train, y_test = myutils.train_test_split(
        X_sample, y_sample, .33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    numCorrectPredictions = 0
    numWrongPredictions = 0
    for i in range(len(y_test)):
        if y_predicted[i] == y_test[i]:  # predicted matches actual
            numCorrectPredictions += 1
        else:
            numWrongPredictions += 1

    accuracy = np.round((numCorrectPredictions) /
                        (numCorrectPredictions + numWrongPredictions), 3)
    error_rate = np.round(
        (numWrongPredictions) / (numCorrectPredictions + numWrongPredictions),
        3)

    print("-----------------------------------------------------------")
    print("Accuracy and Error Rate")
    print("-----------------------------------------------------------")
    print()
    print("Random Forest: accuracy = {}, error rate = {}".format(
        accuracy, error_rate))
    print()
    print(
        "Because of the randomness in this classifier, these assertions will not always pass"
    )
    print()
    print("Predicted table: " + str(y_predicted))
    print("Testing set:     " + str(y_test))
    for i in range(len(y_test)):
        assert y_predicted[i] == y_test[i]
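
The test above relies on a myutils.train_test_split(X, y, test_size) helper. A plausible sketch, assuming a simple shuffled split (the real helper may seed or stratify differently):

import random

def train_test_split(X, y, test_size):
    # shuffle parallel indexes, then split off the last test_size fraction
    indexes = list(range(len(X)))
    random.shuffle(indexes)
    split = int(len(X) * (1 - test_size))
    train_idx, test_idx = indexes[:split], indexes[split:]
    return ([X[i] for i in train_idx], [X[i] for i in test_idx],
            [y[i] for i in train_idx], [y[i] for i in test_idx])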
Example #3
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method builds a decision tree model
                from the training data.
            Build a decision tree using the nested list representation described in class.
            Store the best M trees in the trees attribute.
            Use attribute indexes to construct default attribute names (e.g. "att0", "att1", ...).
        """
        header = ['att' + str(i) for i in range(len(X_train[0]))]
        attribute_domains = {}
        for i, val in enumerate(header):
            attribute_domains[val] = myutils.unique_index(X_train, i)

        self.X_train = X_train
        self.y_train = y_train
        sample_X_train, sample_x_test, sample_y_train, sample_y_test = myevaluation.train_test_split(
            X_train, y_train, test_size=0.33, shuffle=True)
        train = [
            sample_X_train[i] + [sample_y_train[i]]
            for i in range(len(sample_X_train))
        ]

        for _ in range(self.N):
            available_attributes = header.copy()
            self.trees.append(
                myutils.tdidt_forest(
                    myutils.compute_bootstrapped_sample(train),
                    available_attributes, attribute_domains, header, self.F))

        accuracies = []
        for tree in self.trees:
            header = ['att' + str(i) for i in range(len(sample_x_test[0]))]
            prediction = []
            for row in sample_x_test:
                prediction.append(myutils.tdidt_predict(header, tree, row))
            accuracy = 0
            for i in range(len(prediction)):
                if prediction[i] == sample_y_test[i]:
                    accuracy += 1
            accuracy /= len(sample_y_test)
            accuracies.append([accuracy])
        # find m most accurate
        m_trees = []
        for i in range(len(accuracies)):
            accuracies[i].append(i)
        accuracies = sorted(accuracies)
        for i in range(self.M):
            m_trees.append(self.trees[accuracies[-(i + 1)][1]])
        self.trees = m_trees
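
Examples #1 and #3 both thread an F parameter through to the tree builder; inside helpers like tdidt_forest, F candidate attributes are typically chosen at each split with something like compute_random_subset. A minimal sketch (assumed, not necessarily the authors' exact code):

import random

def compute_random_subset(values, num_values):
    # shuffle a copy so the caller's list is left untouched,
    # then keep the first num_values items
    values_copy = values[:]
    random.shuffle(values_copy)
    return values_copy[:num_values]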
Example #4
    def fit(self, X_tr, y_tr):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.
        Args:
            X_tr(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_tr(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
        Notes:
            Since TDIDT is an eager learning algorithm, this method builds a decision tree model
                from the training data.
            Build a decision tree using the nested list representation described in class.
            Store the best M trees in the best_M_trees attribute.
            Use attribute indexes to construct default attribute names (e.g. "att0", "att1", ...).
        """
        X_train, X_test, y_train, y_test = myutils.random_stratified_split(X_tr, y_tr)

        trees = []
        for _ in range(self.N):
            tree = None
            self.header = ["att" + str(ii) for ii in range(len(X_train[0]))]
            for jj in range(len(X_train[0])):
                temp = []
                for ii in range(len(X_train)):
                    if X_train[ii][jj] not in temp:
                        temp.append(X_train[ii][jj])
                self.domain[self.header[jj]] = temp
            
            # my advice is to stitch together X_train and y_train
            train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]
            bootstrapped_train = myutils.compute_bootstrapped_sample(train)
            # initial call to tdidt current instances is the whole table (train)
            available_attributes = self.header.copy()
            tree = self.tdidt(bootstrapped_train, available_attributes)
            trees.append(tree)

        performances = []
        for ii, tree in enumerate(trees):
            counter = 0
            for jj, instance in enumerate(X_test):
                if self.tdidt_predict(self.header, tree, instance) == y_test[jj]:
                    counter+=1
            performances.append(counter/len(y_test))
        
        # Thank you stackoverflow
        # https://stackoverflow.com/questions/6618515/sorting-list-based-on-values-from-another-list
        # sort descending by performance so the first M trees are the best ones
        sortedtrees = [x for _, x in sorted(zip(performances, trees),
                                            key=lambda pair: pair[0],
                                            reverse=True)]

        self.best_M_trees = sortedtrees[:self.M]
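
Several of the examples call a tdidt_predict(header, tree, instance) helper. Assuming the nested-list tree representation these snippets build (["Attribute", name, ["Value", v, subtree], ...] with ["Leaf", label, count, total] at the bottom), a sketch could be:

def tdidt_predict(header, tree, instance):
    # base case: a leaf stores the predicted class label at index 1
    if tree[0] == "Leaf":
        return tree[1]
    # otherwise tree is ["Attribute", att_name, value subtrees...]
    att_index = header.index(tree[1])
    for value_subtree in tree[2:]:
        if value_subtree[1] == instance[att_index]:
            return tdidt_predict(header, value_subtree[2], instance)
    return None  # attribute value never seen in training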
Example #5
    def fit(self, X_train, y_train, X_test, y_test):
        """Fits many decision tree classifiers to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            X_test(list of list of obj): The list of testing samples, used to determine accuracy of trees
            y_test(list of obj): The target y values (parallel to X_test), used to determine accuracy of trees

        """
        # Call train_test_split to get "test" and "remainder" data (happens before this function is called)
        # Loop N times
        #   Call compute_bootstrapped_sample to get a subset of the "remainder" data
        #   Call tdidt with the bootstrapped sample to build a decision tree
        # Test performance of all trees to get the M best ones
        # Use majority voting to make predictions (predict method)
        self.X_train = X_train
        self.y_train = y_train
        N_trees = []
        for _ in range(self.N):
            new_X_train, new_y_train = myutils.compute_bootstrapped_sample(X_train, y_train)
            tree = MyDecisionTreeClassifier()
            tree.fit(new_X_train, new_y_train, is_forest=True)
            N_trees.append(tree)

        self.trees = []
        # Test tree performance
        #   for the M best trees, append them to self.trees
        accuracy_list = []
        for tree in N_trees:
            accuracy = 0
            predict_list = tree.predict(X_test)
            for i in range(len(predict_list)):
                if predict_list[i] == y_test[i]:
                    accuracy += 1
            accuracy_list.append(myutils.calculateAccuracy(accuracy, len(y_test) - accuracy))
        # np.argpartition puts the indexes of the M largest accuracies last
        M_indexes = np.argpartition(accuracy_list, -self.M)[-self.M:]
        for index in M_indexes:
            self.trees.append(N_trees[index])
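
Example #5's comments name a majority-voting predict method but do not show it. A minimal sketch of such a method, assuming self.trees holds fitted MyDecisionTreeClassifier instances:

    def predict(self, X_test):
        y_predicted = []
        for instance in X_test:
            # let each of the M kept trees vote on this instance
            votes = [tree.predict([instance])[0] for tree in self.trees]
            # majority vote; ties are broken arbitrarily
            y_predicted.append(max(set(votes), key=votes.count))
        return y_predicted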
Example #6
    def fit(self, X_train, y_train, M=7, N=20, F=2):
        """Fits a random forest classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
        """
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)

        # create random stratified test set with 2:1 ratio
        X_remainder, X_test, y_remainder, y_test = myevaluation.train_test_split(
            copy.deepcopy(X_train), copy.deepcopy(y_train))

        for i, x in enumerate(y_remainder):
            X_remainder[i].append(x)
        for i, x in enumerate(y_test):
            X_test[i].append(x)
        # generate N random decision trees using bagging
        trees = []
        for _ in range(N):
            # get the bootstrap sample; rows never drawn form the validation set
            sample = myutils.compute_bootstrapped_sample(X_remainder)
            validation_set = []
            for x in X_remainder:
                if x not in sample:
                    validation_set.append(x)
            # build the tree from the sample, considering F attributes per split
            available_attributes = myutils.get_available_attributes(sample)
            tree = myutils.tdidt_random_forest(
                sample, [x for x in range(len(sample[0]) - 1)],
                available_attributes, F)

            # print("testing the tree")
            # test against the validation set
            validation_set_x = [x[:-1] for x in validation_set]
            validation_set_y = [x[-1] for x in validation_set]
            predictions = []
            header = []
            for i in range(0, len(validation_set_x[0])):
                header.append("att" + str(i))
            for x, y in zip(validation_set_x, validation_set_y):
                prediction = myutils.tdidt_predict(header, tree, x)
                predictions.append(int(prediction == y))

            # print("accuracy:", sum(predictions)/len(predictions))
            trees.append({
                "accuracy": sum(predictions) / len(predictions),
                "tree": tree
            })

        # print("getting the best M trees")
        # get the best M of N trees
        trees = sorted(trees, key=lambda k: k["accuracy"], reverse=True)
        self.trees = trees[:M]
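
Example #6's tdidt_random_forest call takes an available_attributes list produced by a get_available_attributes helper. A sketch consistent with the stitched table it receives (an assumption — the real helper may return indexes instead):

def get_available_attributes(table):
    # every column except the rightmost (class) column is a candidate attribute
    return ["att" + str(i) for i in range(len(table[0]) - 1)]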
Example #7
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method builds a decision tree model
                from the training data.
            Build a decision tree using the nested list representation described in class.
            Store the best M trees in the trees attribute.
            Use attribute indexes to construct default attribute names (e.g. "att0", "att1", ...).
        """
        # seed the random number generator for reproducible trees
        if self.seed is not None:
            random.seed(self.seed)
            
        n_trees = []
        for _ in range(self.N):
            # loop through X_train and create the header
            header = ["att" + str(i) for i in range(len(X_train[0]))]

            # loop through the header to form the attribute domains dictionary
            attribute_domains = {}
            for i, item in enumerate(header):
                curr_col = myutils.get_column(X_train, i)
                values, counts = myutils.get_frequencies(curr_col)
                attribute_domains[item] = values

            # stitch together X_train and y_train and get the available attributes
            train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]
            available_attributes = header.copy()

            boot_train = myutils.compute_bootstrapped_sample(train)

            # rows not drawn into the bootstrap sample form the validation set
            validation_set = []
            for row in train:
                if row not in boot_train:
                    validation_set.append(row)

            # form the tree from the bootstrap sample
            tree = myutils.tdidt_forest(boot_train, available_attributes,
                                        attribute_domains, header, self.F)

            tree_dict = {"tree": tree}
            # split the class labels off of the validation rows
            y_test = []
            for row in validation_set:
                y_test.append(row.pop())

            y_predict = myutils.predict_tree(validation_set, tree)

            tree_dict["acc"] = myutils.get_accuracy(y_predict, y_test)
            n_trees.append(tree_dict)
        

        # keep the M most accurate trees
        sorted_trees = sorted(n_trees, key=lambda k: k['acc'], reverse=True)
        for i in range(self.M):
            self.trees.append(sorted_trees[i]["tree"])
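
Example #7 scores each tree with myutils.get_accuracy. The obvious sketch (assumed, given how it is called):

def get_accuracy(y_predict, y_test):
    # fraction of positions where the two parallel label lists agree
    matches = sum(1 for pred, actual in zip(y_predict, y_test) if pred == actual)
    return matches / len(y_test)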
Example #8
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

        # stitch together X_train and y_train so y_train is in right most column
        train = [
            self.X_train[i] + [self.y_train[i]]
            for i in range(0, len(self.X_train))
        ]

        trees = []
        attr_sets = []
        tree_accuracies = []
        for i in range(self.N):
            # Call bootstrap method to get different random data from data set (each instance has every column)
            bootstrapped_table = myutils.compute_bootstrapped_sample(train)
            bootstrapped_y = myutils.get_column_by_index(
                bootstrapped_table, -1)
            bootstrapped_X = myutils.remove_column(bootstrapped_table, -1)

            # call train test split to get X_train, y_train, X_test, y_test
            tree_X_train, tree_X_validation, tree_y_train, tree_y_validation = myutils.train_test_split(
                bootstrapped_X, bootstrapped_y, 1 / 3)
            # randomly select F attribute indexes and keep only those columns

            num_attributes = len(tree_X_train[0])
            attr_indices = myutils.generate_F_indices(num_attributes, self.F)
            attr_indices = sorted(attr_indices)
            attr_sets.append(attr_indices)

            subsetted_tree_X_train = myutils.attribute_subset_table(
                tree_X_train, attr_indices)

            # fit a decision tree to the attribute-subsetted training data
            decision_tree = MyDecisionTreeClassifier()
            decision_tree.fit(subsetted_tree_X_train, tree_y_train)
            trees.append(decision_tree)

            # calculate tree accuracy using the validation set, subsetted to the
            # same F attributes the tree was trained on
            subsetted_validation = myutils.attribute_subset_table(
                tree_X_validation, attr_indices)
            predicted = decision_tree.predict(subsetted_validation)
            match_count = 0
            for index, prediction in enumerate(predicted):
                if prediction == tree_y_validation[index]:
                    match_count += 1
            accuracy = match_count / len(predicted)
            tree_accuracies.append(accuracy)

        # select the M best trees based on accuracy
        # sort the accuracies and the corresponding trees/attribute sets together
        zipped_lists = zip(trees, tree_accuracies, attr_sets)
        sorted_zipped = sorted(zipped_lists, reverse=True, key=lambda x: x[1])
        sorted_trees, sorted_accuracies, sorted_attr_sets = [
            list(group) for group in zip(*sorted_zipped)
        ]
        best_m_trees = sorted_trees[:self.M]
        best_sorted_attr_sets = sorted_attr_sets[:self.M]

        self.best_m_trees = best_m_trees
        self.M_attr_sets = best_sorted_attr_sets
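
Example #8 is the only snippet that subsets attributes once per tree rather than per split. The two helpers it relies on could be sketched as follows (assumptions, matching only how they are called above):

import random

def generate_F_indices(num_attributes, F):
    # F distinct column indexes, chosen uniformly at random
    return random.sample(range(num_attributes), F)

def attribute_subset_table(table, attr_indices):
    # keep only the chosen columns of every row
    return [[row[i] for i in attr_indices] for row in table]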
Example #9
    def fit(self, X_train, y_train):
        ''' Fits the random forest model to a given training set

        Args:

            X_train(list of list of obj): The list of training instances (samples).
                    The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train).
                The shape of y_train is n_samples
        '''

        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)
        self.learners = []
        self.accuracies = []

        # generate N learners
        for i in range(self.N):

            # create the bootstrap sample
            if self.seed is not None:
                X_sample, y_sample = myutils.compute_bootstrapped_sample(self.X_train, self.y_train, self.seed)
                self.seed += 1 # increment the seed so not all the trees are the same

            else:
                X_sample, y_sample = myutils.compute_bootstrapped_sample(self.X_train, self.y_train)


            # create the validation set
            X_val = [x for x in self.X_train if x not in X_sample]
            y_idxs = [self.X_train.index(x) for x in X_val]
            y_val = [self.y_train[idx] for idx in y_idxs]

            # get only a random subset of attributes for each sample
            values = [i for i in range(len(self.X_train[0]))] # num of items in header

            if self.seed is not None:
                F_attributes = myutils.compute_random_subset(values, self.F, self.seed)
                self.seed += 1 # increment the seed so not all the trees are the same

            else:
                F_attributes = myutils.compute_random_subset(values, self.F)

            # get only those attributes from the training set
            for i in range(len(X_sample)):
                X_sample[i] = [X_sample[i][j] for j in range(len(X_sample[i])) if j in F_attributes]

            # get only those attributes from the validation set
            for i in range(len(X_val)):
                X_val[i] = [X_val[i][j] for j in range(len(X_val[i])) if j in F_attributes]


            # build a decision tree from the sample
            tree = MyDecisionTreeClassifier()
            tree.fit(X_sample, y_sample)
            self.learners.append(tree)

            # test the tree's accuracy on the validation set
            y_pred = tree.predict(X_val)
            self.accuracies.append(myutils.compute_accuracy(y_pred, y_val))

        # get only the best M learners

        # sort by accuracy in descending order, carrying each learner's index
        # along by zipping the two lists together, sorting, and unpacking
        sorted_accs, sorted_idxs = (list(tup) for tup in zip(
            *sorted(zip(self.accuracies, range(len(self.learners))), reverse=True)))

        # slice the lists to only include the M best learners
        self.learners = [self.learners[i] for i in sorted_idxs[:self.M]]
        self.accuracies = sorted_accs[:self.M]
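
For context, a hypothetical end-to-end use of a classifier like Example #9's; the constructor arguments and the predict method are assumptions, since the snippets never show them:

rf = MyRandomForestClassifier(N=20, M=7, F=2, seed=0)
rf.fit(X_train, y_train)     # keeps the M most accurate of N bagged trees
y_pred = rf.predict(X_test)  # majority vote across the kept learners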