Example #1
    def construct_DT(self, X, y):
        self.depth += 1
        # leaf node: too few samples, maximum depth reached, or labels already pure
        if len(y) < self.leafsize or self.depth > 25 or len(set(y)) == 1:
            return [-1, int(round(np.mean(y))), -1, -1]
        best_gain = -1.0
        for index in range(len(X[0])):
            if isinstance(X[0][index], Number):
                # numeric attribute: use the column mean as the candidate split value
                val = np.mean([x[index] for x in X])
                l_x, r_x, l_y, r_y = partition_classes(X, y, index, val)
                t_gain = information_gain(y, [l_y, r_y])
                if t_gain > best_gain:
                    best_gain = t_gain
                    split_val = val
                    split_attr = index
            else:
                # categorical attribute: try every distinct value as the split value
                lv = set([x[index] for x in X])
                for val in lv:
                    l_x, r_x, l_y, r_y = partition_classes(X, y, index, val)
                    t_gain = information_gain(y, [l_y, r_y])
                    if t_gain > best_gain:
                        best_gain = t_gain
                        split_val = val
                        split_attr = index

        x_l, x_r, y_l, y_r = partition_classes(X, y, split_attr, split_val)
        tl = self.construct_DT(x_l, y_l)
        rl = self.construct_DT(x_r, y_r)
        return [split_attr, split_val, tl, rl]
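All of these snippets call the same helpers from utils.py (entropy, information_gain, partition_classes) without showing them. Below is a minimal sketch of what those helpers presumably look like, inferred only from how they are called here (numeric splits send rows left on <=, categorical splits on equality); the actual utils.py may differ.

import numpy as np

def entropy(class_y):
    # Shannon entropy of a list of class labels (0 for an empty or pure list).
    if len(class_y) == 0:
        return 0.0
    _, counts = np.unique(class_y, return_counts=True)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))

def information_gain(previous_y, current_y):
    # previous_y: labels before the split; current_y: [y_left, y_right].
    n = len(previous_y)
    weighted = sum(len(part) / n * entropy(part) for part in current_y)
    return entropy(previous_y) - weighted

def partition_classes(X, y, split_attribute, split_val):
    # Numeric attribute: rows with value <= split_val go left.
    # Categorical attribute: rows equal to split_val go left.
    X_left, X_right, y_left, y_right = [], [], [], []
    for row, label in zip(X, y):
        v = row[split_attribute]
        goes_left = (v == split_val) if isinstance(v, str) else (v <= split_val)
        if goes_left:
            X_left.append(row)
            y_left.append(label)
        else:
            X_right.append(row)
            y_right.append(label)
    return X_left, X_right, y_left, y_right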
Example #2
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        if (len(y) == 0):
            self.tree['type'] = 'empty'
            return

        vals, cnt = np.unique(y, return_counts=True)
        if len(cnt) == 1:
            self.tree['type'] = 'leaf'
            self.tree['y'] = vals[0]
            return

        max_gain = 0
        max_gain_attr = 0
        max_gain_index = 0

        for i in range(len(X[0])):
            if isinstance(X[0][i], str):
                cat = list(set([x[i] for x in X]))
                for j in cat:
                    xl, xr, yl, yr = partition_classes(X, y, i, j)
                    temp_gain = information_gain(y, [yl, yr])
                    if temp_gain > max_gain:
                        max_gain = temp_gain
                        max_gain_attr = j
                        max_gain_index = i
            else:
                cat = [x[i] for x in X]
                cat_mean = sum(cat) / len(cat)
                xl, xr, yl, yr = partition_classes(X, y, i, cat_mean)
                temp_gain = information_gain(y, [yl, yr])
                if temp_gain > max_gain:
                    max_gain = temp_gain
                    max_gain_attr = cat_mean
                    max_gain_index = i

        Xl, Xr, yl, yr = partition_classes(X, y, max_gain_index, max_gain_attr)

        left_subtree = DecisionTree()
        right_subtree = DecisionTree()
        left_subtree.learn(Xl, yl)
        right_subtree.learn(Xr, yr)
        self.tree['type'] = 'node'
        self.tree['left'] = left_subtree
        self.tree['right'] = right_subtree
        self.tree['split_attr'] = max_gain_index
        self.tree['split_val'] = max_gain_attr

        pass
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)

        root = {
            'attribute_index': None,
            'value': None,
            'left': None,
            'right': None
        }

        ## Reasonable bound/cut-off based on entropy to prevent overfitting
        if entropy(y) < .2:
            return sp.stats.mode(y)[0][0]

        else:
            info_gain = 0
            split_attribute_index = 0
            split_value = X[0][split_attribute_index]

            for item in X:
                for i in range(len(item)):
                    y_left, y_right = partition_classes(X, y, i, item[i])[2:4]
                    gain = information_gain(y, [y_left, y_right])
                    ##Base attribute and split value on the combination that maximizes info gain
                    if gain > info_gain:
                        info_gain = gain
                        split_attribute_index = i
                        split_value = item[i]

            root['attribute_index'] = split_attribute_index
            root['value'] = split_value
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, split_attribute_index, split_value)
            root['left'] = self.learn(X_left, y_left)
            root['right'] = self.learn(X_right, y_right)

            self.tree.insert(0, root)

            return root
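The learn() above returns either a bare label (the entropy cut-off case) or a nested dictionary with 'attribute_index', 'value', 'left' and 'right' keys; the snippet does not show how such a tree is read back. A hypothetical traversal sketch for that layout, assuming the usual convention that numeric values go left on <= and categorical values on equality:

def classify_node(node, record):
    # Leaves were returned directly as labels rather than dicts.
    if not isinstance(node, dict):
        return node
    attr, val = node['attribute_index'], node['value']
    # Assumed branching rule: equality goes left for strings, <= for numbers.
    if isinstance(record[attr], str):
        branch = 'left' if record[attr] == val else 'right'
    else:
        branch = 'left' if record[attr] <= val else 'right'
    return classify_node(node[branch], record)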
Example #4
        def treeBuilding(X, y):

            if sum(y) == len(y) or sum(y) == 0:
                return classLabel(y)

            tree = {}
            cols = len(X[0])
            #tree['entropy'] = entropy(y)
            #tree['num_rows'] = len(X)
            #tree['num_one'] = sum(y)
            #tree['num_zero'] = len(y)-sum(y)

            info_gain_list = []
            split_val_list = []

            for idx in range(cols):
                best_val = 0
                best_gain = 0
                col = [row[idx] for row in X]
                if isinstance(col[0], str):
                    val_set = set(col)
                    for val in val_set:
                        X_left, X_right, y_left, y_right = partition_classes(X, y, idx, val)
                        if len(y_left) == 0 or len(y_right) == 0:
                            continue

                        gain = information_gain(y, [y_left, y_right])
                        if gain > best_gain:
                            best_gain = gain
                            best_val = val
                else:
                    steps = np.linspace(start=np.min(col), stop=np.max(col), num=5, endpoint=False)[1:]
                    for val in steps:
                        X_left, X_right, y_left, y_right = partition_classes(X, y, idx, val)
                        if len(y_left) == 0 or len(y_right) == 0:
                            continue
                        gain = information_gain(y, [y_left, y_right])
                        if gain > best_gain:
                            best_gain = gain
                            best_val = val

                info_gain_list.append(best_gain)
                split_val_list.append(best_val)

            best_split_col = np.argmax(info_gain_list)
            best_split_value = split_val_list[best_split_col]
            X_left, X_right, y_left, y_right = partition_classes(X, y, best_split_col, best_split_value)
            tree['split_attribute'] = best_split_col
            tree['split_value'] = best_split_value
            tree['left_child'] = treeBuilding(X_left, y_left)
            tree['right_child'] = treeBuilding(X_right, y_right)
            return tree
    def divide_tree(self, val1, val2):
        val_x = val1[0][0]
        val_y, max_info_gain = 0, 0

        for i in range(len(val1[0])):
            caller_row = [row[i] for row in val1]
            caller_row = set(caller_row)
            for j in caller_row:
                y_l, y_r = partition_classes(val1, val2, i, j)[2:4]
                info_gain = information_gain(val2, [y_l, y_r])
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    val_y, val_x = i, j
        return val_y, val_x
        def get_split(dataset2, labels2, n_features2, side):
            counter = 0
            b_index, b_value, b_score, b_groups = 999, 999, 999, None
            IG_index = -1.0
            ig = -1.0
            features = list()

            while len(features) < n_features2:
                # np.random.random_integers is deprecated; randint picks a column
                # index in [0, n_cols - 2], assuming the last column is the label
                index = np.random.randint(len(dataset2[0]) - 1)
                if index not in features:
                    features.append(index)

            for index in features:
                col = []
                for row in dataset2:
                    col.append(row[index])
                colMean2 = np.mean(col)
                colMedian2 = np.median(col)

                X_left, X_right, y_left, y_right = partition_classes(
                    dataset2, labels2, index, colMean2)
                ig_mean = information_gain(labels2, [y_left, y_right])

                X_left_med, X_right_med, y_left_med, y_right_med = partition_classes(
                    dataset2, labels2, index, colMedian2)
                ig_median = information_gain(labels2,
                                             [y_left_med, y_right_med])

                ig = max(ig_mean, ig_median)
                split_value = colMean2
                if ig_median > ig_mean:
                    X_left, X_right, y_left, y_right = X_left_med, X_right_med, y_left_med, y_right_med
                    split_value = colMedian2
                #elif ig == 0.0:

                if len(y_left) == 0:
                    y_left = None
                    X_left = None
                if len(y_right) == 0:
                    y_right = None
                    X_right = None

                if ig > IG_index:
                    IG_index = ig
                    b_index = index  # column that the split occurred on
                    b_value = split_value  # value that the split occurred at
                    b_score = ig  # information gain that caused the split
                    b_groups = [X_left, y_left, X_right, y_right]

            return {'index': b_index, 'value': b_value, 'groups': b_groups}
Example #7
    def build_tree(self, X, y):

        #numerical_cols = set([0,1,2,3,4,5,6,7])
        numerical_cols = set([1, 2, 7, 10, 13, 14,
                              15])  # indices of numeric attributes (columns)
        if y.count(0) == len(y):
            self.tree["label"] = 0
            return

        if y.count(1) == len(y):
            self.tree["label"] = 1
            return

        if len(X) == 0:
            if (y.count(0) > y.count(1)):
                self.tree["label"] = 0
                return
            else:
                self.tree["label"] = 1
                return

        #find best attribute to split on
        X_arr = np.array(X)
        for col in range(X_arr.shape[1] - 1):

            #calculate mean for numerical values
            if col in numerical_cols:
                temp = X_arr[:, col]
                #print("temp = ", temp)
                split_val = np.mean([float(a) for a in temp])
                #print("split_val = " , split_val)

            #calculate mode for categorical values
            else:
                split_val = stats.mode(X_arr[:, col])[0][0]

            #calculate infogain
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, col, split_val)
            current_y = []
            current_y.append(y_left)
            current_y.append(y_right)
            current_info_gain = information_gain(y, current_y)

            #keep max infogain arguments
            if (self.tree['info_gain'] < current_info_gain):
                self.tree['info_gain'] = current_info_gain
                self.tree['split_val'] = split_val
                self.tree['split_attr'] = col

        #print("current tree: " , self.tree)
        if len(X) > 0:
            self.tree["left"] = DecisionTree()
            self.tree["right"] = DecisionTree()

            X_left, X_right, y_left, y_right = partition_classes(
                X, y, self.tree['split_attr'], self.tree['split_val'])

            self.tree["left"].build_tree(X_left, y_left)
            self.tree["right"].build_tree(X_right, y_right)
Example #8
    def split_tree_based_on_max_info_gain(self, X, y):
        """
        Returns True if there's only one unique y left
        Else
        False

        If there is only one unique y value left,
        the decision tree has partitioned the data set
        to the lowest level of granularity where the leaf
        contains a single value

        Args:
            y: array of labels

        Returns:
            bool

        """
        split_value = X[0][0]
        row_one = X[0]
        split_column, max_info_gain = 0, 0

        for column in range(len(row_one)):
            unique_column = set([row[column] for row in X])
            for value in unique_column:
                y_left, y_right = partition_classes(X, y, column, value)[2:4]
                info_gain = information_gain(y, [y_left, y_right])
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    split_column, split_value = column, value
        return split_column, split_value
 def learnlearn(X, y):
     if entropy(y) == 0:  # all the same label -> end of tree
         return {'label': y[0]}
     best_split = {}  # split_attr,split_value,left,right
     max_IG = -1
     current_split = None
     for attribute in range(len(X[0])):  # attribute = column indices
         unique_value = np.unique([x[attribute] for x in X])
         for value in unique_value:
             X_left, X_right, y_left, y_right = partition_classes(
                 X, y, attribute, value)
             IG = information_gain(y, [y_left, y_right])
             if IG > max_IG:
                 max_IG = IG
                 current_split = [attribute, value]
     if max_IG == 0:  # just couldn't split better -> end of tree
         cnt_0_1 = np.bincount(y)
         return {'label': [1, 0][cnt_0_1[0] > cnt_0_1[1]]}
     # record and split
     best_split["split_attr"] = current_split[0]
     best_split["split_value"] = current_split[1]
     X_left, X_right, y_left, y_right = partition_classes(
         X, y, current_split[0], current_split[1])
     # next level
     best_split['left'] = learnlearn(X_left, y_left)
     best_split['right'] = learnlearn(X_right, y_right)
     return best_split
Example #10
    def lookingforIG(self, X, y):
        a = np.array(X)
        rows = a.shape[0]
        columns = a.shape[1]
        ig_max = 0
        ig_row = 0
        ig_col = 0

        # looking for ig_max

        for col in range(columns):
            split_attr = col
            for row in range(rows):
                splitvalue = X[row][col]
                aaa = partition_classes(X, y, split_attr, splitvalue)
                # X_left = aaa[0]
                # X_right = aaa[1]
                y_left = aaa[2]
                y_right = aaa[3]
                if not y_left or not y_right:
                    continue
                ig_temp = information_gain(y, [y_left, y_right])
                if ig_temp > ig_max:
                    ig_max = ig_temp
                    ig_row = row
                    ig_col = col

        return ig_col, X[ig_row][ig_col]
Example #11
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        gain_list = {}
        if entropy(y) != 0:
            for attr in range(len(X[0])):
                values = list(set([item[attr] for item in X]))
                for val in values:
                    X_left, X_right, y_left, y_right = partition_classes(
                        X, y, attr, val)
                    gain_list[(attr,
                               val)] = information_gain(y, [y_left, y_right])
            sp_attr, sp_val = max(gain_list, key=gain_list.get)
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, sp_attr, sp_val)
            self.tree['split_attribute'] = sp_attr
            self.tree['split_val'] = sp_val
            self.tree['left'] = (X_left, y_left)
            self.tree['right'] = (X_right, y_right)
        pass
Example #12
    def try_split(self, X, y):
        b_index, b_value, b_gain = -1, -1, float('-inf')

        X_arr = np.asarray(X, dtype=object)
        number_of_attr = X_arr.shape[1]
        idx = np.random.choice(range(number_of_attr),
                               int(np.floor(0.4 * number_of_attr)),
                               replace=False)
        for index in idx:
            partition_values = np.unique(X_arr[:, index], return_counts=False)
            if not type(partition_values[0]) == str:
                partition_values = np.percentile(
                    partition_values, [10 * i for i in range(1, 10, 2)])

            gain = np.array([
                information_gain(y, try_partition_classes(X, y, index, value))
                for value in partition_values
            ])
            max_gain = max(gain)
            if max_gain < self.threshold:
                continue
            else:
                if max_gain > b_gain:
                    b_index, b_value, b_gain = index, partition_values[
                        np.argmax(gain)], max_gain

        return (b_gain, b_index, b_value)
Example #13
    def make_tree(self, X, y):

        if len(set(y)) == 1:
            return y[0]

        max_info_gain = -1
        index = None
        value = None

        X_left, X_right, y_left, y_right = [], [], [], []

        for i in range(len(X[0])):

            current = []
            for row in X:
                current.append(row[i])

            for split_val in set(current):
                X_left_t, X_right_t, y_left_t, y_right_t = partition_classes(
                    X, y, i, split_val)
                info_gain = information_gain(y, [y_left_t, y_right_t])
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    index, value = i, split_val
                    X_left, X_right, y_left, y_right = X_left_t, X_right_t, y_left_t, y_right_t
        node_data = (index, value)
        return {
            node_data:
            [self.make_tree(X_left, y_left),
             self.make_tree(X_right, y_right)]
        }
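make_tree above encodes each internal node as a single-entry dictionary keyed by the (attribute index, split value) pair, with a two-element list of subtrees as its value, and returns bare labels at the leaves. A small, hypothetical lookup sketch for that structure; the branching rule is an assumption mirroring the partition convention used elsewhere, not something stated in the snippet.

def walk(tree, row):
    # Internal nodes look like {(attr_index, split_value): [left, right]};
    # anything that is not a dict is a leaf label.
    while isinstance(tree, dict):
        (idx, val), children = next(iter(tree.items()))
        if (row[idx] == val) if isinstance(row[idx], str) else (row[idx] <= val):
            tree = children[0]
        else:
            tree = children[1]
    return tree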
Example #14
 def store_tree(self, X, Y, Z):
     for i in range(0,2):
         if Y.count(i) == len(Y):
             self.tree["lbl"] = i
             return        
     if len(Z) == 0:
         self.tree["lbl"] = (Y.count(0) <= Y.count(1)) + 0
     for j in Z:
         th = np.mean(np.array(X), axis=0)[j]
         y_l, y_r = partition_classes(X, Y, j, th)[2:4]
         ig = information_gain(Y, [y_l, y_r])
         if self.tree["Split"] == -1 or ig > self.tree["IG"]:
             self.tree["Split"] = j
             self.tree["IG"] = ig
             self.tree["TH"] = th
     if len(Z) > 0:
         Z.remove(self.tree["Split"])
         self.tree["L"] = DecisionTree()
         self.tree["R"] = DecisionTree()
         Data = [[],[],[],[]]
         for i in range(len(X)):
             if X[i][self.tree["Split"]] <= self.tree["TH"]:
                 Data[0].append(X[i])
                 Data[1].append(Y[i])
             else:
                 Data[2].append(X[i])
                 Data[3].append(Y[i])
         self.tree["L"].store_tree(Data[0], Data[1], Z)
         self.tree["R"].store_tree(Data[2], Data[3], Z)
    def select_split(self, X, y):
        """
        Finds the best split attribute and value to maximize information gain.
        :param X: Numpy array of data
        :param y: Numpy array of classification
        :return: split_attr, split_val, split_data
        """
        max_info_gain = 0
        split_attr = 0
        split_val = None
        split_data = {'X_left': None, 'X_right': None,
                      'y_left': None, 'y_right': None}

        # Loop through each value and find the split that will maximize information gain.
        for row_idx in range(len(X)):
            for col_idx in range(len(X[0])):
                temp_attr = col_idx
                temp_val = X[row_idx][col_idx]
                temp_split_data = partition_classes(X, y, temp_attr, temp_val)
                temp_info_gain = information_gain(y, [temp_split_data[2], temp_split_data[3]])

                if temp_info_gain > max_info_gain:
                    max_info_gain = temp_info_gain
                    split_attr = temp_attr
                    split_val = temp_val
                    split_data['X_left'] = temp_split_data[0]
                    split_data['X_right'] = temp_split_data[1]
                    split_data['y_left'] = temp_split_data[2]
                    split_data['y_right'] = temp_split_data[3]

        return split_attr, split_val, split_data
Example #16
 def find_best_numeric(X, y, split_attribute):
     '''
     find the split value (mean or median) with largest IG for a continuous variable
     Inputs:
         X: data containing all attributes
         y: labels
     split_attribute: column index of the attribute to split on
     '''
      column = np.array(X)[:, split_attribute]
      if isinstance(column[0], np.int32) or isinstance(column[0], np.float64):
          all_values = [val for val in column]
      else:
          all_values = [ast.literal_eval(val) for val in column]
     median = np.median(all_values)
     mean = np.mean(all_values)
     best_ig = 0
     best_val = 0
     for val in (median, mean):
          y_left, y_right = partition_classes(X, y, split_attribute, val)[2:4]
         if len(y_left) * len(y_right) != 0:
             ig = information_gain(y, [y_left, y_right])
             if ig >= best_ig:
                 best_ig = ig
                 best_val = val
         else:
             continue
     return best_ig, best_val
Example #17
    def learn(self, X, y):
        #Train the decision tree (self.tree) using the sample X and labels y

        #test if y is the same
        y_sum = np.sum(y)
        if y_sum == len(y):
            self.tree['output'] = 1
            return self.tree
        elif y_sum == 0:
            self.tree['output'] = 0
            return self.tree

        #test if all rows of X are identical
        x_same = sum(1 for row in X if row == X[-1])
        if x_same == len(X):
            self.tree['output'] = int(round(y_sum / len(y)))
            return self.tree

        #compute information gain of each feature and split on best one
        split_attribute = 0
        info_gain = []
        split_vals = []
        for col in zip(*X):
            #split based on average
            split_val = np.average(col)
            split_vals.append(split_val)

            #split and compute information gain
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, split_attribute, split_val)
            current_y = []
            current_y.append(y_left)
            current_y.append(y_right)
            ig = information_gain(y, current_y)
            info_gain.append(ig)

            #increment the column number
            split_attribute += 1

        #find highest info gain and split the tree there
        max_val = max(info_gain)
        max_ind = info_gain.index(max_val)
        split_val = split_vals[max_ind]
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, max_ind, split_val)

        #store the column and value to split on for classifying and set output -1 to flag it is not a leaf node
        self.tree['split col'] = max_ind
        self.tree['split val'] = split_val
        self.tree['output'] = -1

        #right and left subtrees
        self.tree['left'] = DecisionTree().learn(X_left, y_left)
        self.tree['right'] = DecisionTree().learn(X_right, y_right)

        #return the tree
        return self.tree
Example #18
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        # starting with the 1st sf

        if len(set(y)) == 1:
            return y[0]

        best_gain = 0
        split_attribute, split_val = 0, X[0][0]
        # try every value of every attribute as a candidate split
        for i in range(len(X[0])):
            for j in range(len(X)):
                y_l, y_r = partition_classes(X, y, i, X[j][i])[2:4]
                infoGain = information_gain(y, [y_l, y_r])
                if infoGain > best_gain:
                    best_gain = infoGain
                    split_attribute = i
                    split_val = X[j][i]
        x_left, x_right, y_left, y_right = partition_classes(X, y, split_attribute, split_val)
        DecisionTree().learn(x_left, y_left)
        DecisionTree().learn(x_right, y_right)
Example #19
def find_best_split(n, k):
    x = list(np.random.choice(range(k), size=n, p=[0.8,0.05,0.1,0.05]))
    print(Counter(x))
    prior_entropy = calculate_entropy(x)

    best_ig = 0
    best_left = None
    best_right = None

    for m in range(1, len(x)):
        for left in set(itertools.combinations(x, m)):
            copy = x.copy()
            for elt in left:
                copy.remove(elt)
            right = copy

            ig = information_gain(left, right, prior_entropy)

            if ig > best_ig:
                best_ig = ig
                best_left = left
                best_right = right

    print(best_ig)
    print(best_left)
    print(best_right)
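Note that find_best_split above uses calculate_entropy(x) and an information_gain(left, right, prior_entropy) signature that differs from the utils.py-style helpers sketched near the top. A plausible sketch of those two helpers, inferred only from the call sites in this snippet:

import numpy as np
from collections import Counter

def calculate_entropy(labels):
    # Shannon entropy of an iterable of hashable labels.
    counts = np.array(list(Counter(labels).values()), dtype=float)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))

def information_gain(left, right, prior_entropy):
    # Gain of a two-way split relative to the entropy before splitting.
    n = len(left) + len(right)
    weighted = (len(left) * calculate_entropy(left) +
                len(right) * calculate_entropy(right)) / n
    return prior_entropy - weighted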
def helper(tree, X, y):
    if checkEqual(y):
        if len(y) == 0:
            tree["rst"] = 1
        else:
            tree["rst"] = y[0]
        tree["index"] = None
    else:
        size = len(X[0])
        gain, index, val = 0, 0, 0
        X_left, X_right, y_left, y_right = [], [], [], []
        for i in range(size):
            X_i = [item[i] for item in X]
            if isDiscrete(X_i):
                m = majority(X_i)
            else:
                m = np.mean(X_i)
            X_l, X_r, y_l, y_r = partition_classes(X, y, i, m)
            temp = information_gain(y, [y_l, y_r])
            if temp > gain:
                index, gain, val = i, temp, m
                X_left, X_right, y_left, y_right = X_l, X_r, y_l, y_r
        tree["index"] = index
        tree["split_val"] = val
        tree["rst"] = None
        tree["left"] = {}
        helper(tree["left"], X_left, y_left)
        tree["right"] = {}
        helper(tree["right"], X_right, y_right)
Example #21
    def chooseBestAttr(self, X, y):

        best_split_attr = -1
        best_split_value = None
        best_info_gain = -1

        for i in range(len(X[0])):
            attr_list = [dt[i] for dt in X]
            split_list = []
            if isinstance(attr_list[0], str):
                sort_attr_list = sorted(list(set(attr_list)))
                for j in range(len(sort_attr_list)):
                    split_list.append(sort_attr_list[j])
            else:
                a = np.array(attr_list)
                split_list.append(np.mean(a))

            for split_value in split_list:
                partition_res = partition_classes(X, y, i, split_value)
                y_left = partition_res[2]
                y_right = partition_res[3]
                if not y_left or not y_right:
                    continue
                new_info_gain = information_gain(y, [y_left, y_right])
                if new_info_gain > best_info_gain:
                    best_info_gain = new_info_gain
                    best_split_attr = i
                    best_split_value = split_value
        return best_split_attr, best_split_value
def recursive(X,y):
    if y.count(y[0]) == len(y):
        return y[0]
    
    MaxIG = 0
    
    for i in range(len(X[0])):        
        column = [x[i] for x in X]
        for value in set(column):            
            current_y = partition(column, y, value)
            current_IG = information_gain(y, current_y)
                
            if current_IG > MaxIG:
                MaxIG = current_IG
                split_attribute = i
                split_value = value

    if MaxIG == 0:
        a = Counter(y)
        top = a.most_common(1)
        return top[0][0]
    
    X_left, X_right, y_left, y_right = partition_classes(X, y, split_attribute, split_value)
    
    return {(split_attribute, split_value):[recursive(X_left, y_left), recursive(X_right, y_right)]}
Example #23
    def choose_attr_value(self, X, y):
        """Choose the attribute and value to split the tree that maximized the information gain criterion"""
        #https://stackoverflow.com/questions/44360162/how-to-access-a-column-in-a-list-of-lists-in-python


        max_split_attribute,max_split_val, max_information_gain = -1,-1,-1

        #The range is the number of attributes in the dataset
        for split_attribute_temp in range(len(X[0])):
            #For numbers
            if self.is_number(X[0][split_attribute_temp]):
                #print("numeric",X[0][split_attribute_temp])
                #If split_value is numerical then assign average value of the column
                column_item = [float(row[split_attribute_temp]) for row in X]
                split_val_temp = np.mean(column_item)
            #For strings
            else:
                #print("string",X[0][split_attribute_temp])
                #If split_value is string then assign the mode of the column
                #https://stackoverflow.com/questions/16330831/most-efficient-way-to-find-mode-in-numpy-array
                column_item = [row[split_attribute_temp] for row in X]
                (_, idx, counts) = np.unique(column_item, return_index=True, return_counts=True)
                index = idx[np.argmax(counts)]
                split_val_temp = column_item[index]
                
            
            X_left_, X_right_, y_left_, y_right_  = partition_classes(X,y,split_attribute_temp,split_val_temp)
            temp_information_gain = information_gain(y,[y_left_,y_right_])
            #Store the split with the best information gain
            if temp_information_gain > max_information_gain:
                max_split_attribute,max_split_val, max_information_gain = split_attribute_temp,split_val_temp,temp_information_gain
                best_X_left,best_X_right, best_y_left,best_y_right = X_left_, X_right_, y_left_, y_right_

        return max_split_attribute,max_split_val,best_X_left,best_X_right, best_y_left,best_y_right
Example #24
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree
        max_ig = 0
        max_index = 0

        if y.count(y[0]) == len(y):
            self.tree['is_leaf'] = True
            self.tree['label'] = y[0]
            return

        if self.depth > 100:
            self.tree['is_leaf'] = True
            self.tree['label'] = self.max_element(y)
            return

        for index in range(len(X[0])):
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, index,
                np.mean(X, axis=0)[index])
            ig = information_gain(y, [y_left, y_right])
            if ig > max_ig:
                max_ig = ig
                max_index = index
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, max_index,
            np.mean(X, axis=0)[max_index])

        tig = information_gain(y, [y_left, y_right])

        if tig < 0.001:
            self.tree['is_leaf'] = True
            self.tree['label'] = self.max_element(y)
            return

        if len(X_left) == len(X) or len(X_right) == len(X):
            self.tree['is_leaf'] = True
            self.tree['label'] = self.max_element(y)
            return
        else:
            self.tree['is_leaf'] = False
            self.tree['split_attr'] = max_index
            self.tree['split_val'] = np.mean(X, axis=0)[max_index]
            self.tree['left'] = DecisionTree()
            self.tree['right'] = DecisionTree()
            self.tree['left'].depth = self.depth + 1
            self.tree['right'].depth = self.depth + 1
            self.tree['left'].learn(X_left, y_left)
            self.tree['right'].learn(X_right, y_right)
            return
        def _learn(X, y):
            Y_entropy = entropy(y)
            if Y_entropy == 0:
                return [-1, y[0], None, None]

            cur_max_gain = 0
            best_attr = []
            best_index = -1

            for index in range(len(X[0])):

                if isinstance(X[0][index], str):
                    attr_X = np.unique([X[i][index] for i in range(len(X))])

                    for attr in attr_X:
                        X_left, X_right, y_left, y_right = partition_classes(
                            X, y, index, attr)
                        gain = information_gain(y, [y_left, y_right])
                        if gain > cur_max_gain:
                            cur_max_gain = gain
                            best_index = index
                            best_attr = attr
                            best_X_left, best_X_right = X_left, X_right
                            best_y_left, best_y_right = y_left, y_right
                else:
                    attr_X = np.mean([X[i][index] for i in range(len(X))])
                    X_left, X_right, y_left, y_right = partition_classes(
                        X, y, index, attr_X)
                    gain = information_gain(y, [y_left, y_right])
                    if gain > cur_max_gain:
                        cur_max_gain = gain
                        best_index = index
                        best_attr = attr_X
                        best_X_left, best_X_right = X_left, X_right
                        best_y_left, best_y_right = y_left, y_right

            if cur_max_gain <= 0:
                return [-1, np.argmax(np.bincount(y)), None, None]

            left = _learn(best_X_left, best_y_left)
            right = _learn(best_X_right, best_y_right)

            return [best_index, best_attr, left, right]
Example #26
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        #pass

        max_info_gain, max_attribute, max_value = -1, 0, 0

        X_left = []
        X_right = []
        y_left = []
        y_right = []

        # keep the depth assigned by the parent node (default to 0 at the root)
        if not hasattr(self, 'tree_depth'):
            self.tree_depth = 0
        self.id = None

        if self.tree_depth > 10 or entropy(y) <= 0:
            self.id = y[0]
            return

        for i in range(0, len(X[0])):
            values = [X[j][i] for j in range(0, len(X))]
            # choose the split value according to its average in order to reduce the running time
            if isinstance(values[0], str):
                split_avg = values[0]
            else:
                split_avg = sum(values) / len(values)

            xLeft, xRight, yLeft, yRight = partition_classes(
                X, y, i, split_avg)
            current = []
            current.append(yLeft)
            current.append(yRight)
            temp = information_gain(y, current)
            if temp > max_info_gain:
                max_attribute = i
                max_value = split_avg
                max_info_gain = temp
                X_left = xLeft
                X_right = xRight
                y_left = yLeft
                y_right = yRight

        #build tree
        self.tree['max_attribute'], self.tree[
            'max_value'] = max_attribute, max_value
        self.tree['L'], self.tree['R'] = DecisionTree(), DecisionTree()

        #grow tree
        self.tree['L'].tree_depth = self.tree_depth + 1
        self.tree['R'].tree_depth = self.tree_depth + 1
        self.tree['L'].learn(X_left, y_left)
        self.tree['R'].learn(X_right, y_right)
Example #27
 def ___calculate_gain(self, X, y, attribute, values):
     # Calculate information gain for given attribute
     splits = {}
     if isinstance(values[0], str):
         targets = np.unique(values)
         new_y = []
         for s in targets:
             xl, _, yl, _ = partition_classes(X, y, attribute, s)
             splits[s] = [[xl], [yl]]
             new_y.append(yl)
         gain = information_gain(y, new_y)
     else:
         s = np.mean(values)
         xl, xr, yl, yr = partition_classes(X, y, attribute, s)
         splits[s] = [[xl, xr], [yl, yr]]
         gain = information_gain(y, [yl, yr])
     return gain, splits
Example #28
    def learn(self, X, y):
        #Train the decision tree (self.tree) using the sample X and labels y
        #The functions in utils.py are used to train the tree

        #* Method used to implement the tree:
        #*    Each node in self.tree is in the form of a dictionary:
        #*       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #*    For example, a non-leaf node with two children can have a 'left' key and  a
        #*    'right' key.
        leny = len(set(y))
        #print(leny)
        if leny == 1:
            self.tree['label'] = y[0]
            return
        if leny == 0:
            self.tree['label'] = 0
            return

        x_left = []
        x_right = []
        y_left = []
        y_right = []

        maxInfoGain = 0
        best_split_attr = 0
        split_val = 0

        for i in range(len(X) - 4):

            for j in range(len(X[0])):
                split_val_update = X[i][j]
                x_l, x_r, y_l, y_r = partition_classes(X, y, j,
                                                       split_val_update)
                y_update = [y_l, y_r]
                infoGain = information_gain(y, y_update)
                if infoGain > maxInfoGain:

                    maxInfoGain = infoGain
                    split_val = split_val_update
                    best_split_attr = j

                    x_left = x_l
                    x_right = x_r
                    y_left = y_l
                    y_right = y_r

        self.tree['left'] = DecisionTree()
        self.tree['right'] = DecisionTree()
        self.tree['left'].learn(x_left, y_left)
        self.tree['right'].learn(x_right, y_right)
        self.tree['attribute'] = best_split_attr
        self.tree['value'] = split_val

        pass
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # iterate through each column (split attribute); for each attribute,
        # also iterate through candidate split values

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)

        best = {}
        best['IG'] = 0
        best['sf'] = 0
        best['sv'] = X[0][0]

        columns = list(zip(*X))
        for sf in range(len(X[0])):  # for all split attributes

            sf_column = list(columns[sf])
            sf_range = [min(sf_column), max(sf_column)]
            # sample 25 candidate split values from the attribute's range
            sv_list = np.random.uniform(sf_range[0], sf_range[1], 25)

            for sv in sv_list:  # for candidate split values

                X_left, X_right, y_left, y_right = partition_classes(X, y, sf, sv)
                info_gain = information_gain(y, [y_left, y_right])
                if info_gain > best['IG']:
                    best['IG'] = info_gain
                    best['sf'] = sf
                    best['sv'] = sv

        # after choosing the best split feature and the best split value,
        # partition the classes and set the .left = X_left and the .right = X_right
        X_left, X_right, y_left, y_right = partition_classes(X, y, best['sf'], best['sv'])

        # then recursively call learn on X_left and X_right
        left = addNode(X_left, y_left)
        right = addNode(X_right, y_right)

        self.tree['left'] = left
        self.tree['right'] = right
        self.tree['sv'] = best['sv']
        self.tree['sf'] = best['sf']
Example #30
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree
        #
        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)

        # If only 1 y label, set that as label
        if len(set(y)) == 1:
            self.tree['label'] = y[0]
            return
        elif len(set(y)) == 0:
            self.tree['label'] = 0
            return

        # Create variables with initial values
        max_info_gain = 0

        # Initialize variables for loop
        x_len = len(X)
        row_len = len(X[0])

        # Find maximum information gain by looping through every possible partition
        for i in range(x_len):
            for j in range(row_len):

                test_split_val = X[i][j]
                xL, xR, yL, yR = partition_classes(X, y, j, test_split_val)
                info_gain = information_gain(y, [yL, yR])

                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    split_attr = j
                    split_val = test_split_val
                    x_left, y_left, x_right, y_right = xL, yL, xR, yR

        # Create left and right trees
        self.tree['left'] = DecisionTree()
        self.tree['right'] = DecisionTree()

        # Train left and right trees
        self.tree['left'].learn(x_left, y_left)
        self.tree['right'].learn(x_right, y_right)

        # Store split attribute and split value in tree
        self.tree['split_attribute'] = split_attr
        self.tree['split_value'] = split_val
        pass
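None of the dictionary-based examples above include their classify counterpart. As a closing illustration, here is a hypothetical reader for the Example #30 layout, where leaves carry a 'label' key and internal nodes carry 'split_attribute', 'split_value' and child DecisionTree objects under 'left'/'right'; the branching rule is assumed, not taken from the snippet.

def classify_record(tree_dict, record):
    # Leaf nodes store only a 'label' key.
    if 'label' in tree_dict:
        return tree_dict['label']
    attr = tree_dict['split_attribute']
    val = tree_dict['split_value']
    # Assumed convention: categorical goes left on equality, numeric on <=.
    if isinstance(record[attr], str):
        child = tree_dict['left'] if record[attr] == val else tree_dict['right']
    else:
        child = tree_dict['left'] if record[attr] <= val else tree_dict['right']
    # Children are DecisionTree instances, so recurse into their .tree dicts.
    return classify_record(child.tree, record)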