def fit(self, X_train, y_train):
    """Build a decision tree from the training data using TDIDT.

    Args:
        X_train(list of list of obj): Training instances, one inner list per
            sample (n_train_samples x n_features).
        y_train(list of obj): Class labels, parallel to X_train.

    Notes:
        Default attribute names "att0", "att1", ... are generated from the
        width of the first training row. The value domain of every attribute
        comes from myutils.get_table_frequencies, and the tree itself is
        produced by myutils.tdidt and stored in self.tree.
    """
    # default attribute names, one per feature column
    heading = ["att{}".format(col) for col in range(len(X_train[0]))]

    # attribute name -> values reported for that column
    att_domain = {}
    for name in heading:
        values, _counts = myutils.get_table_frequencies(
            X_train, heading, name)
        att_domain[name] = values

    # each training row gets its label appended as the last column
    train = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

    # tdidt receives its own copy so the heading list itself is not shared
    available_attributes = heading.copy()
    tree = myutils.tdidt(train, available_attributes, heading, att_domain)

    self.X_train = X_train
    self.y_train = y_train
    self.tree = tree
Exemple #2
0
    def fit(self, X_train, y_train, random_forest=False, F=2):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.
            random_forest(bool): When truthy the tree is built with
                myutils.tdidt_random instead of myutils.tdidt.
            F(int): Extra argument forwarded to myutils.tdidt_random; unused
                when random_forest is falsy.

        Notes:
            The header and attribute domains come from myutils.build_header
            and myutils.get_attr_domains. The resulting tree is stored in
            self.tree.
        """
        self.X_train = X_train
        self.y_train = y_train

        column_names = myutils.build_header(X_train)
        domains = myutils.get_attr_domains(X_train, column_names)

        # each training row gets its label appended as the last column
        table = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

        # the builder receives a copy so column_names itself is left intact
        candidates = column_names.copy()

        if not random_forest:
            self.tree = myutils.tdidt(table, candidates, domains,
                                      column_names)
            return

        self.tree = myutils.tdidt_random(table, candidates, domains,
                                         column_names, F)
Exemple #3
0
    def fit(self, X_train, y_train):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.

        Notes:
            The training inputs are kept on the instance. Row stitching,
            header generation and domain calculation are delegated to
            myutils helpers, and the tree returned by myutils.tdidt is stored
            in self.tree. The fourth positional argument handed to tdidt is
            None.
        """
        self.X_train, self.y_train = X_train, y_train

        stitched_rows = myutils.stitch_x_and_y_trains(X_train, y_train)

        # TODO: check that the generic header is only ever built from X_train
        attribute_names = myutils.get_generic_header(X_train)

        # TODO: decide whether domains should come from X_train or the
        # stitched table
        domains = myutils.calculate_attribute_domains(X_train)

        self.tree = myutils.tdidt(stitched_rows, attribute_names, domains,
                                  None)
    def fit(self, X_train, y_train):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.

        Notes:
            Default attribute names "att0", "att1", ... are generated from the
            width of the first training row; each attribute's domain is
            whatever myutils.unique_index returns for that column index. The
            tree returned by myutils.tdidt is stored in self.tree.
        """
        header = ["att{}".format(col) for col in range(len(X_train[0]))]

        # attribute name -> result of unique_index for that column
        attribute_domains = {
            name: myutils.unique_index(X_train, col)
            for col, name in enumerate(header)
        }

        # each training row gets its label appended as the last column
        train = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

        # tdidt receives its own copy so header itself is not shared
        available_attributes = header.copy()

        self.X_train = X_train
        self.y_train = y_train
        self.tree = myutils.tdidt(train, available_attributes,
                                  attribute_domains, header)
Exemple #5
0
    def fit(self, X_train, y_train, user_F, user_N, user_M, random_state=None):
        """Fit a random forest: grow N trees, keep the M most accurate ones.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.
            user_F: Stored as self.F and forwarded to myutils.tdidt.
            user_N(int): Number of candidate trees to grow (stored as self.N).
            user_M(int): Number of trees appended to self.forest (stored as
                self.M).
            random_state: When not None it is stored on the instance and used
                to seed numpy's global RNG; it is also forwarded to the
                splitting and bootstrap helpers either way.

        Notes:
            The data is split by
            myevaluation.random_stratified_test_remainder_set. Every candidate
            tree is built from a bootstrap sample of the remainder set and
            scored on the test set; the M best-scoring trees are appended to
            self.forest, which must already exist on the instance.
        """
        if random_state is not None:
            # remember the seed and make numpy's global RNG reproducible
            self.random_state = random_state
            np.random.seed(self.random_state)

        self.X_train, self.y_train = X_train, y_train
        self.F, self.N, self.M = user_F, user_N, user_M

        holdout, remainder = myevaluation.random_stratified_test_remainder_set(
            X_train, y_train, random_state)

        stitched = myutils.stitch_x_and_y_trains(X_train, y_train)
        # TODO: think about if this should be X_train or the stitched table
        domains = myutils.calculate_attribute_domains(stitched)

        # grow N candidate trees, each from its own bootstrap sample
        candidate_trees = []
        for _ in range(self.N):
            sample = myutils.bootstrap(remainder, random_state)
            # TODO: check that this is used for only X_trains
            sample_header = myutils.get_generic_header(sample)
            candidate_trees.append(
                myutils.tdidt(sample, sample_header, domains, self.F))

        header = myutils.get_generic_header(remainder)
        header.append("y")

        # one list of predictions per candidate tree; the last column of each
        # test row is stripped before prediction
        predictions = [
            [myutils.tdidt_predict(header, tree, row[:-1]) for row in holdout]
            for tree in candidate_trees
        ]

        actual = myutils.get_column(holdout, header, "y")

        scores = []
        for predicted in predictions:
            score, _ = myutils.accuracy_errorrate(predicted, actual)
            scores.append(score)

        # repeatedly take the current best tree, then mark its score as used
        for _ in range(self.M):
            best = scores.index(max(scores))
            self.forest.append(candidate_trees[best])
            scores[best] = -1
Exemple #6
0
    def fit(self, X_train, y_train, is_forest=False):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.
            is_forest(bool): Forwarded unchanged to myutils.tdidt; indicates
                whether this fit was called by MyRandomForestClassifier.

        Notes:
            Default attribute names "att0", "att1", ... are generated from the
            width of the first training row. Each attribute's domain is the
            sorted list of distinct values seen in that column. The tree
            returned by myutils.tdidt is stored in self.tree.
        """
        self.X_train = X_train
        self.y_train = y_train

        # default attribute names, one per feature column
        header = ["att{}".format(i) for i in range(len(X_train[0]))]

        # attribute name -> sorted distinct values of that column.
        # A plain membership test replaces the former bare ``except:`` around
        # list.index(), which would also have swallowed unrelated errors.
        attribute_domains = {}
        for col, name in enumerate(header):
            domain = []
            for row in X_train:
                if row[col] not in domain:
                    domain.append(row[col])
            domain.sort()
            attribute_domains[name] = domain

        # each training row gets its label appended as the last column
        train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]

        # tdidt receives its own copy so header itself is not shared
        available_attributes = header.copy()
        self.tree = myutils.tdidt(train, available_attributes,
                                  attribute_domains, is_forest)
    def fit(self, X_train, y_train):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample. The first element is skipped when training
                (presumably a header row - TODO confirm with callers).
            y_train(list of obj): Class labels, parallel to X_train; its first
                element is skipped as well.

        Notes:
            Default attribute names "att0", "att1", ... are generated from the
            width of the first row of X_train. Each attribute's domain is the
            sorted list of distinct values seen in that column. The tree
            returned by myutils.tdidt is stored in self.tree.

            The caller's lists are no longer modified: the first element is
            dropped by slicing into new lists instead of ``del`` on the shared
            objects, so calling fit() repeatedly on the same data does not
            strip one more row each time.
        """
        header = ['att' + str(i) for i in range(len(X_train[0]))]

        # keep everything but the first element, without touching the inputs
        self.X_train = X_train[1:]
        self.y_train = y_train[1:]

        # attribute name -> sorted distinct values of that column
        attribute_domains = {}
        for i, h in enumerate(header):
            distinct = []
            for row in self.X_train:
                if row[i] not in distinct:
                    distinct.append(row[i])
            attribute_domains[h] = sorted(distinct)

        # each retained row gets its label appended as the last column
        training_set = [
            self.X_train[i] + [self.y_train[i]]
            for i in range(len(self.X_train))
        ]

        # tdidt receives its own copy so header itself is not shared
        available_attributes = header.copy()
        self.tree = myutils.tdidt(training_set, available_attributes,
                                  attribute_domains, header)
Exemple #8
0
    def fit(self, X_train, y_train):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.

        Notes:
            Default attribute names "att0", "att1", ... are generated from the
            width of the first training row. Each attribute's domain is the
            sorted list of distinct values seen in that column. The tree
            returned by myutils.tdidt is stored in self.tree.
        """
        self.X_train, self.y_train = X_train, y_train

        # default attribute names, one per feature column
        headers = ["att{}".format(col) for col in range(len(X_train[0]))]

        # attribute name -> sorted distinct values of that column
        domains = {}
        for col, name in enumerate(headers):
            seen = []
            for row in X_train:
                value = row[col]
                if value not in seen:
                    seen.append(value)
            domains[name] = sorted(seen)

        # each training row gets its label appended as the last column
        train = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

        # tdidt receives its own copy so headers itself is not shared
        available_attributes = headers.copy()

        self.tree = myutils.tdidt(train, available_attributes, headers,
                                  domains)
Exemple #9
0
    def fit(self, X_train, y_train):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.

        Notes:
            Default attribute names "att0", "att1", ... are generated from the
            width of the first training row. Each attribute's domain is the
            values part of what myutils.get_frequencies reports for that
            column. The tree returned by myutils.tdidt is stored in self.tree,
            after which self.print_decision_rules() is called.
        """
        # default attribute names, one per feature column
        header = ["att{}".format(col) for col in range(len(X_train[0]))]

        # attribute name -> values reported for the matching column
        attribute_domains = {}
        for col, name in enumerate(header):
            column = myutils.get_column(X_train, col)
            values, _counts = myutils.get_frequencies(column)
            attribute_domains[name] = values

        # each training row gets its label appended as the last column
        train = [row + [y_train[idx]] for idx, row in enumerate(X_train)]

        # tdidt receives its own copy so header itself is not shared
        available_attributes = header.copy()

        self.tree = myutils.tdidt(train, available_attributes,
                                  attribute_domains, header)
        self.print_decision_rules()
Exemple #10
0
    def fit(self, X_train, y_train, allowed_attributes=None):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample. The first element is treated as a header row and
                is skipped when training.
            y_train(list of obj): Class labels; y_train[i] is paired with the
                i-th retained row of X_train (nothing is dropped from
                y_train).
            allowed_attributes(list of str): Optional attribute names to use
                as the header. When None, default names "att0", "att1", ...
                are generated from the width of the first row of X_train.

        Notes:
            Attribute domains come from myutils.get_attribute_domains and the
            tree returned by myutils.tdidt is stored in self.tree.

            The caller's X_train is no longer modified: the header row is
            dropped by slicing into a new list instead of ``del`` on the
            shared object, so calling fit() repeatedly on the same data does
            not strip one more row each time.
        """
        self.y_train = y_train

        if allowed_attributes is not None:
            header = allowed_attributes
        else:
            header = ['att' + str(i) for i in range(len(X_train[0]))]

        # keep everything but the header row, without touching the input
        self.X_train = X_train[1:]

        # tdidt receives its own copy so header itself is not shared
        available_attributes = header.copy()

        attribute_domains = myutils.get_attribute_domains(self.X_train, header)

        # each retained row gets its label appended as the last column
        train = [
            self.X_train[i] + [self.y_train[i]]
            for i in range(len(self.X_train))
        ]

        self.tree = myutils.tdidt(train, available_attributes,
                                  attribute_domains, header)
    def fit(self, X_train, y_train, attSubsetSize = None):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.
            attSubsetSize(int): Forwarded to myutils.tdidt as its last
                argument. Defaults to the number of attributes when None.

        Notes:
            Default attribute names "att0", "att1", ... are generated from the
            width of the first training row. Each attribute's domain is
            whatever myutils.getUniqueIdentifiers returns for that column
            index. The tree returned by myutils.tdidt is stored in self.tree.
        """
        # Measure the first row: indexing X_train[1] here used to raise
        # IndexError for a training set that holds a single sample.
        numAtts = len(X_train[0])
        if attSubsetSize is None:
            attSubsetSize = numAtts

        # default attribute names, one per feature column
        header = ["att" + str(i) for i in range(numAtts)]

        # attribute name -> result of getUniqueIdentifiers for that column
        attDomain = {}
        for i, att in enumerate(header):
            attDomain[att] = myutils.getUniqueIdentifiers(X_train, i)

        # each training row gets its label appended as the last column
        train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]

        # tdidt receives its own copy so header itself is not shared
        availableAtts = header.copy()

        tree = myutils.tdidt(train, availableAtts, attDomain, header, attSubsetSize)
        self.tree = tree
Exemple #12
0
    def fit(self, X_train, y_train):
        """Build a decision tree from the training data using TDIDT.

        Args:
            X_train(list of list of obj): Training instances, one inner list
                per sample (n_train_samples x n_features).
            y_train(list of obj): Class labels, parallel to X_train.

        Notes:
            Deep copies of both inputs are stored on the instance. A separate
            deep copy of X_train has each label appended to its row and is
            the table handed to myutils.tdidt, together with the list of
            attribute column indexes and a dict mapping "att0", "att1", ...
            to the distinct values of each column in first-seen order. The
            returned tree is stored in self.tree.
        """
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)

        # working table: private copy of the rows that will receive labels
        stitched = copy.deepcopy(X_train)

        # attribute name -> distinct values of that column, first-seen order
        domains = {}
        for col in range(len(X_train[0])):
            seen = []
            for row in X_train:
                value = row[col]
                if value not in seen:
                    seen.append(value)
            domains["att" + str(col)] = seen

        # append every label to the row at the same position
        for row_index, label in enumerate(y_train):
            stitched[row_index].append(label)

        # every column except the trailing label is an attribute index
        attribute_indexes = list(range(len(stitched[0]) - 1))

        self.tree = myutils.tdidt(stitched, attribute_indexes, domains)