def fit(self, X_train, y_train):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        Attributes receive the default names "att0", "att1", ... taken from
        their column index. The tree returned by myutils.tdidt is stored in
        self.tree; X_train and y_train are stored on the instance only after
        the tree has been built.
    """
    # one default name per column of the first instance
    column_names = ["att" + str(col) for col in range(len(X_train[0]))]

    # attribute name -> values reported by the frequency helper
    domains = {}
    for name in column_names:
        values, _ = myutils.get_table_frequencies(X_train, column_names, name)
        domains[name] = values

    # put the label at the end of each instance so the class is the last column
    table = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

    # tdidt gets its own copy of the names, separate from the full header
    remaining = column_names.copy()
    built_tree = myutils.tdidt(table, remaining, column_names, domains)

    self.X_train = X_train
    self.y_train = y_train
    self.tree = built_tree
def fit(self, X_train, y_train, random_forest=False, F=2):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.
        random_forest(bool): When true the tree is built by
            myutils.tdidt_random (which also receives F); otherwise by
            myutils.tdidt.
        F(int): Forwarded unchanged to myutils.tdidt_random. Presumably the
            size of the random attribute subset -- confirm against myutils.

    Notes:
        The header ("att0", "att1", ...) and the attribute domains come from
        myutils helpers. The resulting tree is stored in self.tree.
    """
    self.X_train, self.y_train = X_train, y_train

    header = myutils.build_header(X_train)
    attr_domains = myutils.get_attr_domains(X_train, header)

    # put the label at the end of each instance so the class is the last column
    table = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

    # tdidt gets its own copy of the names, separate from the full header
    remaining = header.copy()
    if not random_forest:
        self.tree = myutils.tdidt(table, remaining, attr_domains, header)
    else:
        self.tree = myutils.tdidt_random(table, remaining, attr_domains,
                                         header, F)
def fit(self, X_train, y_train):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        The training table, the generic header and the attribute domains are
        all produced by myutils helpers. The tree returned by myutils.tdidt
        is stored in self.tree.
    """
    self.X_train, self.y_train = X_train, y_train

    table = myutils.stitch_x_and_y_trains(X_train, y_train)

    # TODO: check that the generic header is built from X_train columns only
    remaining = myutils.get_generic_header(X_train)

    # TODO: decide whether the domains should come from X_train or from the
    # stitched table
    domains = myutils.calculate_attribute_domains(X_train)

    # the fourth argument is deliberately passed as None here
    self.tree = myutils.tdidt(table, remaining, domains, None)
def fit(self, X_train, y_train):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        Attributes receive the default names "att0", "att1", ... taken from
        their column index. The tree returned by myutils.tdidt is stored in
        self.tree.
    """
    names = [f"att{col}" for col in range(len(X_train[0]))]

    # attribute name -> values found by myutils.unique_index for that column
    domains = {
        name: myutils.unique_index(X_train, col)
        for col, name in enumerate(names)
    }

    # put the label at the end of each instance so the class is the last column
    table = [row + [y_train[idx]] for idx, row in enumerate(X_train)]
    remaining = names.copy()

    self.X_train = X_train
    self.y_train = y_train
    self.tree = myutils.tdidt(table, remaining, domains, names)
def fit(self, X_train, y_train, user_F, user_N, user_M, random_state=None):
    """Fits a random forest classifier to X_train and y_train.

    N trees are grown on bootstrap samples of a stratified remainder set,
    each tree is scored on the held-out stratified test set, and the M most
    accurate trees are kept in self.forest.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples
        user_F: Stored as self.F and passed as the last argument of
            myutils.tdidt. Presumably the number of candidate attributes
            per split -- confirm against myutils.
        user_N(int): Number of trees to grow; stored as self.N.
        user_M(int): Number of best trees to keep; stored as self.M.
        random_state: When not None it is stored on the instance and used to
            seed numpy. It is always forwarded to the stratified split and
            bootstrap helpers.

    Notes:
        self.forest is rebuilt on every call, so refitting does not keep
        trees from an earlier fit. At most N trees are kept even when M > N,
        and no tree is selected twice. Ties in accuracy are resolved in
        favour of the tree that was grown first.
    """
    if random_state is not None:
        # store seed
        self.random_state = random_state
        np.random.seed(self.random_state)

    self.X_train = X_train
    self.y_train = y_train
    self.F = user_F
    self.N = user_N
    self.M = user_M

    stratified_test, stratified_remainder = (
        myevaluation.random_stratified_test_remainder_set(
            X_train, y_train, random_state))

    train = myutils.stitch_x_and_y_trains(X_train, y_train)
    # TODO: think about if this should be X_train or "train"
    attribute_domains = myutils.calculate_attribute_domains(train)

    candidate_trees = []
    for _ in range(self.N):
        bootstrapped_table = myutils.bootstrap(stratified_remainder,
                                               random_state)
        # TODO: check that this is used for only X_trains
        available_attributes = myutils.get_generic_header(bootstrapped_table)
        candidate_trees.append(
            myutils.tdidt(bootstrapped_table, available_attributes,
                          attribute_domains, self.F))

    header = myutils.get_generic_header(stratified_remainder)
    header.append("y")

    # the true labels do not depend on the tree, so read them only once
    y_true = myutils.get_column(stratified_test, header, "y")

    # score every candidate tree on the held-out test set
    accuracies = []
    for tree in candidate_trees:
        predictions = [
            myutils.tdidt_predict(header, tree, row[:-1])
            for row in stratified_test
        ]
        accuracy, _ = myutils.accuracy_errorrate(predictions, y_true)
        accuracies.append(accuracy)

    # A stable descending sort keeps the earliest tree first among ties.
    # Slicing caps the selection at the number of trees that exist.
    ranked = sorted(range(len(accuracies)),
                    key=lambda idx: accuracies[idx],
                    reverse=True)
    keep = max(self.M, 0)
    self.forest = [candidate_trees[idx] for idx in ranked[:keep]]
def fit(self, X_train, y_train, is_forest=False):
    """Fits a decision tree classifier to X_train and y_train using the TDIDT
    (top down induction of decision tree) algorithm.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples
        is_forest(bool): determines whether this fit was called by
            MyRandomForestClassifier; forwarded unchanged to myutils.tdidt

    Notes:
        Since TDIDT is an eager learning algorithm, this method builds a
        decision tree model from the training data and stores it in the tree
        attribute. Attribute indexes are used to construct default attribute
        names (e.g. "att0", "att1", ...). Each attribute domain holds the
        distinct values of its column in sorted order.
    """
    self.X_train = X_train
    self.y_train = y_train

    # compute a "header" ["att0", "att1", ...]
    header = ["att{}".format(col) for col in range(len(X_train[0]))]

    # compute the attribute domains dictionary
    attribute_domains = {}
    for col, name in enumerate(header):
        domain = []
        for row in X_train:
            value = row[col]
            # plain membership test, not list.index() under a bare except
            if value not in domain:
                domain.append(value)
        domain.sort()
        attribute_domains[name] = domain

    # stitch together X_train and y_train so the class label is the last column
    train = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

    # tdidt gets its own copy of the names
    available_attributes = header.copy()
    self.tree = myutils.tdidt(train, available_attributes, attribute_domains,
                              is_forest)
def fit(self, X_train, y_train):
    """Fits a decision tree classifier to X_train and y_train using the TDIDT
    (top down induction of decision tree) algorithm.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples

    Notes:
        The first element of X_train and of y_train is left out of training,
        and self.X_train / self.y_train hold the remaining rows. Slices are
        used instead of `del` so the caller's lists are not modified and
        repeated calls do not keep dropping rows.
        NOTE(review): dropping the first row treats it as a header row --
        confirm that callers really pass one.
        Attribute indexes are used to construct default attribute names
        (e.g. "att0", "att1", ...). The tree is stored in the tree attribute.
    """
    # default attribute names, taken from the width of the first row
    header = ['att' + str(col) for col in range(len(X_train[0]))]

    # skip the leading row without touching the caller's data
    self.X_train = X_train[1:]
    self.y_train = y_train[1:]

    # attribute name -> sorted distinct values seen in that column
    attribute_domains = {}
    for col, name in enumerate(header):
        seen = []
        for row in self.X_train:
            if row[col] not in seen:
                seen.append(row[col])
        attribute_domains[name] = sorted(seen)

    # put the label at the end of each instance so the class is the last column
    training_set = [
        row + [self.y_train[idx]] for idx, row in enumerate(self.X_train)
    ]

    # tdidt gets its own copy of the names
    available_attributes = header.copy()
    self.tree = myutils.tdidt(training_set, available_attributes,
                              attribute_domains, header)
def fit(self, X_train, y_train):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        Attributes receive the default names "att0", "att1", ... taken from
        their column index, and each attribute domain holds the distinct
        values of its column in sorted order. The tree returned by
        myutils.tdidt is stored in self.tree.
    """
    self.X_train, self.y_train = X_train, y_train

    n_columns = len(X_train[0])
    names = ["att" + str(col) for col in range(n_columns)]

    # attribute name -> sorted distinct values of that column
    value_sets = {}
    for col in range(n_columns):
        distinct = []
        for instance in X_train:
            cell = instance[col]
            if cell not in distinct:
                distinct.append(cell)
        distinct.sort()
        value_sets[names[col]] = distinct

    # put the label at the end of each instance so the class is the last column
    table = [row + [y_train[idx]] for idx, row in enumerate(X_train)]
    remaining = names.copy()

    self.tree = myutils.tdidt(table, remaining, names, value_sets)
def fit(self, X_train, y_train):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        Attributes receive the default names "att0", "att1", ... taken from
        their column index. The tree returned by myutils.tdidt is stored in
        self.tree, after which self.print_decision_rules() is called.
    """
    names = ["att" + str(col) for col in range(len(X_train[0]))]

    # attribute name -> values reported by the frequency helper for the column
    domains = {}
    for position, name in enumerate(names):
        column = myutils.get_column(X_train, position)
        values, _ = myutils.get_frequencies(column)
        domains[name] = values

    # put the label at the end of each instance so the class is the last column
    table = [row + [y_train[idx]] for idx, row in enumerate(X_train)]
    remaining = names.copy()

    self.tree = myutils.tdidt(table, remaining, domains, names)

    # NOTE(review): this runs on every fit -- confirm it is intended and not
    # leftover debugging output
    self.print_decision_rules()
def fit(self, X_train, y_train, allowed_attributes=None):
    """Fits a decision tree classifier to X_train and y_train using the TDIDT
    (top down induction of decision tree) algorithm.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            Its first row is treated as a header row and left out of training.
        y_train(list of obj): The target y values, used as given
        allowed_attributes(list of str): Optional attribute names to use as
            the header instead of the generated "att0", "att1", ... names

    Notes:
        self.X_train holds X_train without its first row. A slice is used
        instead of `del` so the caller's list is not modified and repeated
        calls do not keep dropping rows.
        NOTE(review): only X_train loses a row, so training rows are paired
        as X_train[i + 1] <-> y_train[i]. Confirm that y_train carries no
        header entry.
        The tree returned by myutils.tdidt is stored in the tree attribute.
    """
    if allowed_attributes is not None:
        header = allowed_attributes
    else:
        header = ['att' + str(col) for col in range(len(X_train[0]))]

    # skip the header row without touching the caller's data
    self.X_train = X_train[1:]
    self.y_train = y_train

    # tdidt gets its own copy of the names
    available_attributes = header.copy()

    attribute_domains = myutils.get_attribute_domains(self.X_train, header)

    # put the label at the end of each instance so the class is the last column
    train = [
        row + [self.y_train[idx]] for idx, row in enumerate(self.X_train)
    ]

    self.tree = myutils.tdidt(train, available_attributes, attribute_domains,
                              header)
def fit(self, X_train, y_train, attSubsetSize=None):
    """Fits a decision tree classifier to X_train and y_train using the TDIDT
    (top down induction of decision tree) algorithm.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples
        attSubsetSize(int): Forwarded to myutils.tdidt; defaults to the
            number of attributes when None. Presumably the number of
            attributes considered at each split -- confirm against myutils.

    Notes:
        The attribute count is read from the first instance, so a training
        set with a single row is accepted. Attribute indexes are used to
        construct default attribute names (e.g. "att0", "att1", ...). The
        tree returned by myutils.tdidt is stored in the tree attribute.
    """
    # use row 0: row 1 does not exist when there is only one training instance
    num_atts = len(X_train[0])
    if attSubsetSize is None:
        attSubsetSize = num_atts

    header = ["att" + str(col) for col in range(num_atts)]

    # attribute name -> values found by myutils.getUniqueIdentifiers
    att_domain = {
        att: myutils.getUniqueIdentifiers(X_train, col)
        for col, att in enumerate(header)
    }

    # put the label at the end of each instance so the class is the last column
    train = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

    # tdidt gets its own copy of the names
    available_atts = header.copy()
    self.tree = myutils.tdidt(train, available_atts, att_domain, header,
                              attSubsetSize)
def fit(self, X_train, y_train):
    """Build a decision tree from the training data with TDIDT.

    Args:
        X_train(list of list of obj): Training instances, shaped
            (n_train_samples, n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        Deep copies of X_train and y_train are stored on the instance, and
        the training table is built from a separate deep copy, so the
        caller's data is never modified. Attribute domains are keyed by the
        default names "att0", "att1", ... and list each column's distinct
        values in first-seen order. Attributes are handed to myutils.tdidt
        as column indexes. The resulting tree is stored in self.tree.
    """
    self.X_train = copy.deepcopy(X_train)
    self.y_train = copy.deepcopy(y_train)
    table = copy.deepcopy(X_train)

    # attribute name -> distinct values of that column, in first-seen order
    domains = {}
    for col in range(len(X_train[0])):
        distinct = []
        domains["att" + str(col)] = distinct
        for instance in X_train:
            if instance[col] not in distinct:
                distinct.append(instance[col])

    # append each label to its copied instance so the class is the last column
    for idx, label in enumerate(y_train):
        table[idx].append(label)

    # every column except the trailing label is an attribute
    attribute_indexes = list(range(len(table[0]) - 1))
    self.tree = myutils.tdidt(table, attribute_indexes, domains)