Example #1
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """

        header = []
        predictions = []
        # default attribute names, matching the "att0", "att1", ... convention from fit
        for i in range(len(X_test[0])):
            header.append("att" + str(i))
        for instance in X_test:
            # tally each tree's vote for this instance
            tree_predictions = {}
            for tree in self.trees:
                prediction = myutils.tdidt_predict(header, tree["tree"],
                                                   instance)
                if prediction in tree_predictions:
                    tree_predictions[prediction] += 1
                else:
                    tree_predictions[prediction] = 1

            # predict the label with the most votes across the forest
            max_key = max(tree_predictions, key=tree_predictions.get)
            predictions.append(max_key)
        return predictions
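All of these examples delegate the actual tree traversal to a myutils.tdidt_predict helper that is not shown on this page. As a point of reference, here is a minimal sketch of what such a helper typically looks like, assuming the nested-list tree representation (["Attribute", ...], ["Value", ...], ["Leaf", ...]) that these course projects commonly use; the real myutils may differ:

def tdidt_predict(header, tree, instance):
    # Recursively walk the nested-list tree until a leaf is reached.
    # Assumed node shapes: ["Leaf", label, count, total] and
    # ["Attribute", "attN", ["Value", v, subtree], ...].
    if tree[0] == "Leaf":
        return tree[1]
    att_index = header.index(tree[1])
    for value_subtree in tree[2:]:
        if value_subtree[1] == instance[att_index]:
            return tdidt_predict(header, value_subtree[2], instance)
    return None  # unseen attribute value; real helpers may handle this differently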
Example #2
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        y_predicted = []
        all_predictions = []
        for test in X_test:
            temp = []
            for i, tree in enumerate(self.trees):
                # each tree was built on its own random subset of attributes, so
                # rebuild that attribute subset of the test instance before predicting
                heading = []
                test_sub_set = []
                for j in range(len(self.attribute_indexes[i])):
                    heading.append("att" + str(j))
                    test_sub_set.append(test[self.attribute_indexes[i][j]])
                temp.append(myutils.tdidt_predict(heading, tree, test_sub_set))
            all_predictions.append(temp)
        # majority vote across the trees' predictions for each test instance
        for item in all_predictions:
            y_predicted.append(myutils.forest_majority_voting(item))
        return y_predicted
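The forest_majority_voting helper is likewise not shown. A plausible minimal version, assuming it takes the list of per-tree predictions for one instance and returns the most common label:

def forest_majority_voting(predictions):
    # Count how often each label was predicted across the forest's trees
    # and return the label with the most votes (ties broken arbitrarily).
    counts = {}
    for label in predictions:
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)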
Example #3
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method builds the forest's
                decision trees from the training data.
            Build N trees over bootstrapped samples using the nested list representation
                described in class, then keep the M most accurate on a holdout split.
            Store the kept trees in the trees attribute.
            Use attribute indexes to construct default attribute names (e.g. "att0", "att1", ...).
        """
        header = ['att' + str(i) for i in range(len(X_train[0]))]
        attribute_domains = {}
        for i, val in enumerate(header):
            attribute_domains[val] = myutils.unique_index(X_train, i)

        self.X_train = X_train
        self.y_train = y_train
        # hold out roughly a third of the training data to estimate each tree's accuracy
        sample_X_train, sample_x_test, sample_y_train, sample_y_test = myevaluation.train_test_split(
            X_train, y_train, test_size=0.33, shuffle=True)
        train = [
            sample_X_train[i] + [sample_y_train[i]]
            for i in range(len(sample_X_train))
        ]

        # build N trees, each from a bootstrapped sample of the stitched training rows
        for _ in range(self.N):
            available_attributes = header.copy()
            self.trees.append(
                myutils.tdidt_forest(
                    myutils.compute_bootstrapped_sample(train),
                    available_attributes, attribute_domains, header, self.F))

        # evaluate each tree on the holdout split
        accuracies = []
        header = ['att' + str(i) for i in range(len(sample_x_test[0]))]
        for tree in self.trees:
            prediction = []
            for row in sample_x_test:
                prediction.append(myutils.tdidt_predict(header, tree, row))
            accuracy = 0
            for i in range(len(prediction)):
                if prediction[i] == sample_y_test[i]:
                    accuracy += 1
            accuracy /= len(sample_y_test)
            accuracies.append([accuracy])
        # find m most accurate
        m_trees = []
        for i in range(len(accuracies)):
            accuracies[i].append(i)
        accuracies = sorted(accuracies)
        for i in range(self.M):
            m_trees.append(self.trees[accuracies[-(i + 1)][1]])
        self.trees = m_trees
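compute_bootstrapped_sample is assumed to draw a standard bagging sample, i.e. rows sampled with replacement. A minimal sketch under that assumption:

import random

def compute_bootstrapped_sample(table):
    # Draw len(table) rows with replacement (the usual bootstrap for bagging).
    n = len(table)
    return [table[random.randrange(n)] for _ in range(n)]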
Example #4
    def fit(self, X_train, y_train, user_F, user_N, user_M, random_state=None):
        """Fits a random forest classifier to X_train and y_train.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            user_F(int): number of attributes to randomly consider at each split
            user_N(int): number of trees to generate
            user_M(int): number of most accurate trees to keep in the forest
            random_state(int): optional seed for reproducibility
        """
        if random_state is not None:
            # store seed
            self.random_state = random_state
            np.random.seed(self.random_state)
        self.X_train = X_train
        self.y_train = y_train
        self.F = user_F
        self.N = user_N
        self.M = user_M
        stratified_test, stratified_remainder = myevaluation.random_stratified_test_remainder_set(
            X_train, y_train, random_state)
        train = myutils.stitch_x_and_y_trains(X_train, y_train)
        attribute_domains = myutils.calculate_attribute_domains(
            train)  # TODO: think about if this should be X_train or "train"
        N_forest = []

        for i in range(self.N):
            bootstrapped_table = myutils.bootstrap(stratified_remainder,
                                                   random_state)
            available_attributes = myutils.get_generic_header(
                bootstrapped_table
            )  # TODO: check that this is used for only X_trains
            tree = myutils.tdidt(bootstrapped_table, available_attributes,
                                 attribute_domains, self.F)
            N_forest.append(tree)
        header = myutils.get_generic_header(stratified_remainder)
        header.append("y")
        y_predicted = []
        y_true = []
        all_accuracies = []
        # testing accuracy of N_forest trees to find the top M accuracies
        for tree in N_forest:
            y_predicted_row = []
            for item in stratified_test:
                y_predicted_row.append(
                    myutils.tdidt_predict(header, tree, item[:-1]))
            y_predicted.append(y_predicted_row)

        y_true = myutils.get_column(stratified_test, header, "y")
        for predicted_sublist in y_predicted:
            accuracy, _ = myutils.accuracy_errorrate(predicted_sublist, y_true)
            all_accuracies.append(accuracy)
        # keep the M most accurate of the N trees
        for _ in range(self.M):
            max_ind = all_accuracies.index(max(all_accuracies))
            self.forest.append(N_forest[max_ind])
            all_accuracies[max_ind] = -1
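This example relies on a few more myutils helpers whose implementations are not shown. Plausible minimal versions, assuming stitch_x_and_y_trains appends each label to its instance and accuracy_errorrate returns (accuracy, error rate):

def stitch_x_and_y_trains(X, y):
    # Append each class label to its row so the tree builder and the
    # bootstrap sampler work on one combined table.
    return [row + [label] for row, label in zip(X, y)]

def accuracy_errorrate(y_pred, y_true):
    # Fraction of matching predictions, and its complement.
    correct = sum(1 for p, t in zip(y_pred, y_true) if p == t)
    accuracy = correct / len(y_true)
    return accuracy, 1 - accuracy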
Example #5
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        header = ['att' + str(i) for i in range(len(X_test[0]))]
        res = []
        for row in X_test:
            res.append(myutils.tdidt_predict(header, self.tree, row))
        return res
Example #6
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        # attribute names follow the same "att0", "att1", ... convention used at fit time
        heading = ["att" + str(i) for i in range(len(self.X_train[0]))]
        y_predicted = []
        for test in X_test:
            y_predicted.append(myutils.tdidt_predict(heading, self.tree, test))
        return y_predicted
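For reference, this is roughly what a tree in the assumed nested-list representation looks like, along with a hand trace of the path a predict call would take for one instance (illustrative only, since the exact representation comes from the course material):

header = ["att0", "att1"]
tree = ["Attribute", "att0",
        ["Value", "yes", ["Leaf", "True", 3, 5]],
        ["Value", "no", ["Attribute", "att1",
                         ["Value", "hot", ["Leaf", "False", 1, 2]],
                         ["Value", "cold", ["Leaf", "True", 1, 2]]]]]
instance = ["no", "cold"]
# att0 == "no"   -> descend into the nested "Attribute" node on att1
# att1 == "cold" -> reach ["Leaf", "True", 1, 2], so the prediction is "True"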
Example #7
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        header = []
        predictions = []
        for i in range(0, len(X_test[0])):
            header.append("att" + str(i))
        for instance in X_test:
            prediction = myutils.tdidt_predict(header, self.tree, instance)
            predictions.append(prediction)
        return predictions
Example #8
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """

        # rebuild the same generic attribute names ("att0", "att1", ...) used at fit time
        train = myutils.stitch_x_and_y_trains(self.X_train, self.y_train)
        header = myutils.get_generic_header(train)
        y_predicted = []
        for item in X_test:
            y_predicted.append(myutils.tdidt_predict(header, self.tree, item))

        return y_predicted
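get_generic_header is assumed to produce the same default attribute names ("att0", "att1", ...) that the other examples build inline. A minimal sketch; whether the real helper skips the class column of a stitched table is an assumption:

def get_generic_header(table):
    # Default attribute names for the feature columns; the last column of a
    # stitched table is assumed to be the class label and is skipped.
    return ["att" + str(i) for i in range(len(table[0]) - 1)]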
Example #9
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        y_predicted = []
        # default attribute names matching the fit-time convention
        header = ['att' + str(i) for i in range(len(self.X_train[0]))]
        # classify each test instance with the stored tree
        for test in X_test:
            y_predicted.append(myutils.tdidt_predict(header, self.tree, test))
        return y_predicted
Example #10
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        y_predicted = []
        header = myutils.build_header(self.X_train)

        for instance in X_test:
            y_predicted.append(myutils.tdidt_predict(header, self.tree, instance))

        return y_predicted
Example #11
    def fit(self, X_train, y_train, M=7, N=20, F=2):
        """Fits a random forest classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            M(int): number of most accurate trees to keep
            N(int): number of trees to generate
            F(int): number of candidate attributes to consider at each split
        """
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)

        # split off a random holdout test set (roughly a 2:1 remainder:test ratio)
        X_remainder, X_test, y_remainder, y_test = myevaluation.train_test_split(
            copy.deepcopy(X_train), copy.deepcopy(y_train))

        # stitch the class labels onto the rows so bootstrap samples keep them
        for i, x in enumerate(y_remainder):
            X_remainder[i].append(x)
        for i, x in enumerate(y_test):
            X_test[i].append(x)
        # generate N random decision trees using bagging
        trees = []
        for i in range(N):
            # print(i)
            # print("getting sample and validation sets...")
            # get the sample and validation sets
            sample = myutils.compute_bootstrapped_sample(X_remainder)
            # rows not drawn into the bootstrap sample (out-of-bag) form the validation set
            validation_set = []
            for x in X_remainder:
                if x not in sample:
                    validation_set.append(x)
            # print("length of sample and validation sets:", len(sample), len(validation_set))
            # print("getting the tree...")
            # get the tree from the sample
            available_attributes = myutils.get_available_attributes(sample)
            tree = myutils.tdidt_random_forest(
                sample, [x for x in range(0,
                                          len(sample[0]) - 1)],
                available_attributes, F)

            # print("testing the tree")
            # test against the validation set
            validation_set_x = [x[:-1] for x in validation_set]
            validation_set_y = [x[-1] for x in validation_set]
            predictions = []
            header = []
            for j in range(len(validation_set_x[0])):
                header.append("att" + str(j))
            for x, y in zip(validation_set_x, validation_set_y):
                prediction = myutils.tdidt_predict(header, tree, x)
                predictions.append(int(prediction == y))

            # print("accuracy:", sum(predictions)/len(predictions))
            trees.append({
                "accuracy": sum(predictions) / len(predictions),
                "tree": tree
            })

        # print("getting the best M trees")
        # get the best M of N trees
        trees = sorted(trees, key=lambda k: k["accuracy"], reverse=True)
        self.trees = trees[:M]
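Finally, the random-forest twist itself, considering only F randomly chosen candidate attributes at each split inside tdidt_random_forest (and the other tree builders above), happens in a helper that is not shown. One common way to sketch that selection step:

import random

def compute_random_subset(values, num_values):
    # Randomly pick num_values candidate attributes for one split;
    # shuffling a copy and slicing is a simple way to do it.
    values_copy = values[:]
    random.shuffle(values_copy)
    return values_copy[:num_values]

At each split the tree builder would call this with the currently available attributes and F, then pick the best attribute from that subset by entropy as usual.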