Example 1
0
    def fit(self, X_train, y_train, user_F, user_N, user_M, random_state=None):
        """Fits a random forest classifier to X_train and y_train.

        Builds N candidate decision trees on bootstrapped samples of a
        stratified remainder set, scores each tree on the stratified test
        set, and keeps the M most accurate trees in self.forest.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
            user_F(int): Size of the random attribute subset considered at
                each split (passed through to tdidt)
            user_N(int): Number of candidate trees to build
            user_M(int): Number of best-scoring trees to keep (M <= N)
            random_state(int or None): Seed for reproducible sampling
        """
        if random_state is not None:
            # store the seed so repeated fits are reproducible
            self.random_state = random_state
            np.random.seed(self.random_state)
        self.X_train = X_train
        self.y_train = y_train
        self.F = user_F
        self.N = user_N
        self.M = user_M
        stratified_test, stratified_remainder = myevaluation.random_stratified_test_remainder_set(
            X_train, y_train, random_state)
        train = myutils.stitch_x_and_y_trains(X_train, y_train)
        # NOTE(review): domains are computed over the stitched X+y rows —
        # confirm the y column is intended to contribute a domain entry
        attribute_domains = myutils.calculate_attribute_domains(train)

        # build N candidate trees, each from a fresh bootstrap sample
        N_forest = []
        for _ in range(self.N):
            bootstrapped_table = myutils.bootstrap(stratified_remainder,
                                                   random_state)
            available_attributes = myutils.get_generic_header(
                bootstrapped_table)
            tree = myutils.tdidt(bootstrapped_table, available_attributes,
                                 attribute_domains, self.F)
            N_forest.append(tree)

        header = myutils.get_generic_header(stratified_remainder)
        header.append("y")
        # score every candidate tree on the held-out stratified test set
        y_true = myutils.get_column(stratified_test, header, "y")
        all_accuracies = []
        for tree in N_forest:
            y_predicted_row = [
                myutils.tdidt_predict(header, tree, item[:-1])
                for item in stratified_test
            ]
            accuracy, _ = myutils.accuracy_errorrate(y_predicted_row, y_true)
            all_accuracies.append(accuracy)

        # keep only the M most accurate trees; reset first so a second call
        # to fit() does not accumulate trees from the previous fit (bug fix)
        self.forest = []
        for _ in range(self.M):
            max_ind = all_accuracies.index(max(all_accuracies))
            self.forest.append(N_forest[max_ind])
            all_accuracies[max_ind] = -1  # exclude this tree from reselection
def random_stratified_test_remainder_set(X, y, random_state, set_size=0.33):
    """Splits X/y into a stratified test set (~1/3) and remainder (~2/3).

    Rows are shuffled in place, grouped by class label, then dealt
    round-robin into three bins so each bin has roughly the same class
    distribution. Bin 0 becomes the test set; bins 1 and 2 form the
    remainder.

    Args:
        X(list of list of obj): Feature rows (shuffled in place)
        y(list of obj): Labels parallel to X (shuffled in place)
        random_state(int or None): Seed for the shuffle
        set_size(float): Currently unused — the split ratio is hard-coded
            to 1/3 by the three bins below. Kept for interface
            compatibility.

    Returns:
        tuple(list, list): (test rows, remainder rows); each row is a
            feature row with its label appended.
    """
    if random_state is not None:
        # seed so the shuffle below is reproducible
        np.random.seed(random_state)

    randomize_in_place(X, y)

    stitched_table = myutils.stitch_x_and_y_trains(X, y)
    header = myutils.get_generic_header(stitched_table)
    header.append("y")
    # group names are not needed, only the per-class subtables
    _, group_subtables = myutils.group_by_value(stitched_table, header, "y")

    # deal rows from each class subtable round-robin across 3 bins so the
    # class proportions are preserved in every bin
    bins = [[] for _ in range(3)]
    index = 0
    for subtable in group_subtables:
        for row in subtable:
            bins[index].append(row)
            index = (index + 1) % 3
    test_set = bins[0]
    remainder = bins[1] + bins[2]
    return test_set, remainder
Example 3
0
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method builds a
                decision tree from the training data and stores it in the
                tree attribute as the nested-list representation.
            Attribute names are the defaults produced by get_generic_header
                (e.g. "att0", "att1", ...).
        """
        self.X_train = X_train
        self.y_train = y_train
        # stitch X and y so each row carries its label during tree induction
        train = myutils.stitch_x_and_y_trains(X_train, y_train)
        available_attributes = myutils.get_generic_header(X_train)
        attribute_domains = myutils.calculate_attribute_domains(X_train)
        # NOTE(review): the final argument is F (random attribute subset
        # size); None presumably means "use all attributes" — confirm
        # against myutils.tdidt
        self.tree = myutils.tdidt(train, available_attributes,
                                  attribute_domains, None)
Example 4
0
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        # rebuild the default attribute header from the stitched training
        # table, matching the names used when the tree was induced
        stitched = myutils.stitch_x_and_y_trains(self.X_train, self.y_train)
        attr_header = myutils.get_generic_header(stitched)
        return [
            myutils.tdidt_predict(attr_header, self.tree, instance)
            for instance in X_test
        ]