def fit(self, X_train, y_train, user_F, user_N, user_M, random_state=None):
    """Fits a random forest classifier to X_train and y_train.

    Splits the data with myevaluation.random_stratified_test_remainder_set()
    into a held-out test partition and a remainder, builds N candidate trees
    with myutils.tdidt() on bootstrap samples of the remainder, scores every
    candidate on the test partition and keeps the M most accurate trees in
    self.forest (best first).

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples
        user_F(obj): Stored as self.F and forwarded to myutils.tdidt() as its
            last argument (presumably the size of the random attribute subset
            tried at each split -- confirm against tdidt).
        user_N(int): Number of candidate trees to build; stored as self.N.
        user_M(int): Number of best trees to keep; stored as self.M. When M
            exceeds N, every candidate is kept exactly once.
        random_state(int): Optional seed. When given it is stored on the
            instance and used to seed numpy's global generator; it is also
            forwarded to the split and bootstrap helpers.

    Notes:
        Every call replaces self.forest, so refitting does not accumulate
        trees from an earlier fit.
    """
    if random_state is not None:
        # store seed
        self.random_state = random_state
        np.random.seed(self.random_state)

    self.X_train = X_train
    self.y_train = y_train
    self.F = user_F
    self.N = user_N
    self.M = user_M

    # Rows of both partitions carry the class label as their last element.
    stratified_test, stratified_remainder = (
        myevaluation.random_stratified_test_remainder_set(
            X_train, y_train, random_state))
    train = myutils.stitch_x_and_y_trains(X_train, y_train)
    # TODO: think about if this should be X_train or "train"
    attribute_domains = myutils.calculate_attribute_domains(train)

    candidate_trees = []
    for _ in range(self.N):
        # NOTE(review): the same random_state is handed to every bootstrap
        # call; if bootstrap() re-seeds from it, all N samples would be
        # identical -- confirm against myutils.bootstrap.
        bootstrapped_table = myutils.bootstrap(stratified_remainder,
                                               random_state)
        # TODO: check that this is used for only X_trains
        available_attributes = myutils.get_generic_header(bootstrapped_table)
        candidate_trees.append(
            myutils.tdidt(bootstrapped_table, available_attributes,
                          attribute_domains, self.F))

    header = myutils.get_generic_header(stratified_remainder)
    header.append("y")
    # The true labels do not depend on the tree, so fetch them once.
    y_true = myutils.get_column(stratified_test, header, "y")

    # testing accuracy of the candidate trees to find the top M accuracies
    all_accuracies = []
    for tree in candidate_trees:
        # item[:-1] strips the trailing class label before predicting.
        y_predicted_row = [
            myutils.tdidt_predict(header, tree, item[:-1])
            for item in stratified_test
        ]
        accuracy, _ = myutils.accuracy_errorrate(y_predicted_row, y_true)
        all_accuracies.append(accuracy)

    # sorted() stays stable with reverse=True, so trees with equal accuracy
    # keep their build order (lowest index first), exactly like repeatedly
    # taking the first maximum. Slicing caps the selection at the number of
    # candidates, so a tree is never selected twice when M > N.
    ranked = sorted(range(len(candidate_trees)),
                    key=all_accuracies.__getitem__,
                    reverse=True)
    keep = max(self.M, 0)
    self.forest = [candidate_trees[index] for index in ranked[:keep]]
def random_stratified_test_remainder_set(X, y, random_state, set_size=0.33):
    """Split X and y into a test set and a remainder set, grouped by label.

    The rows of X are stitched to their labels, grouped on the "y" column
    and dealt round-robin into three bins; the running bin position carries
    over from one label group to the next. The first bin becomes the test
    set and the other two bins, concatenated, become the remainder.

    Args:
        X(list of list of obj): The instances to split.
        y(list of obj): The labels (parallel to X).
        random_state(int): Seed for numpy's global generator; pass None to
            leave the generator untouched.
        set_size(float): Accepted for interface compatibility but not used
            by this function: the split is always one bin out of three.

    Returns:
        tuple of (list of list of obj, list of list of obj): The test rows
            and the remainder rows. Each row ends with its label.

    Notes:
        X and y are handed to randomize_in_place() before splitting
        (presumably a parallel in-place shuffle -- confirm), so the
        caller's lists may be reordered.
    """
    # does not seem random?
    if random_state is not None:
        np.random.seed(random_state)
    randomize_in_place(X, y)

    labeled_rows = myutils.stitch_x_and_y_trains(X, y)
    column_names = myutils.get_generic_header(labeled_rows)
    column_names.append("y")
    _, class_groups = myutils.group_by_value(labeled_rows, column_names, "y")

    # split data into bins
    folds = [[] for _ in range(3)]
    dealt = 0
    for class_rows in class_groups:
        for row in class_rows:
            folds[dealt % 3].append(row)
            dealt += 1

    test_rows, first_rest, second_rest = folds
    return test_rows, first_rest + second_rest
def fit(self, X_train, y_train):
    """Fits a decision tree classifier to X_train and y_train using TDIDT.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples

    Notes:
        TDIDT (top down induction of decision tree) is an eager learner, so
        the whole tree is built here and stored in the tree attribute.
        The training data is kept on the instance as X_train and y_train.
        The last argument passed to myutils.tdidt() is None.
    """
    self.X_train, self.y_train = X_train, y_train

    # Rows handed to tdidt() carry their label: X_train[i] + [y_train[i]].
    labeled_rows = myutils.stitch_x_and_y_trains(X_train, y_train)

    # NOTE(review): the header and the domains are derived from X_train here,
    # while predict() derives its header from the stitched table -- confirm
    # which table get_generic_header / calculate_attribute_domains expect.
    # TODO: check that this is used for only X_trains
    attribute_names = myutils.get_generic_header(X_train)
    # TODO: think about if this should be X_train or "train"
    domains = myutils.calculate_attribute_domains(X_train)

    self.tree = myutils.tdidt(labeled_rows, attribute_names, domains, None)
def predict(self, X_test):
    """Makes predictions for test instances in X_test.

    Args:
        X_test(list of list of obj): The list of testing samples
            The shape of X_test is (n_test_samples, n_features)

    Returns:
        y_predicted(list of obj): The predicted target y values (parallel to X_test)

    Notes:
        Reads self.X_train, self.y_train and self.tree, so fit() has to be
        called first.
    """
    # The header is rebuilt from the stored training data stitched to its labels.
    labeled_rows = myutils.stitch_x_and_y_trains(self.X_train, self.y_train)
    attribute_names = myutils.get_generic_header(labeled_rows)
    return [
        myutils.tdidt_predict(attribute_names, self.tree, instance)
        for instance in X_test
    ]