Example no. 1
0
def tree_grow(x, y, nfeat, nmin=2, minleaf=1):
    """
    Grow a binary classification tree using gini-index impurity reduction.

    Parameters:
        x (2D array): Data matrix (rows = observations, cols = features).
        y (1D array): Binary class labels (0/1), aligned with the rows of x.
        nfeat (int): # features randomly considered at each split; a falsy
            value (0/None) means all columns of x are considered.
        nmin (int): Min # observations a node must contain to be split.
        minleaf (int): Min # observations required for a leaf node
            (enforced inside best_split).

    Returns:
        Node: root of the grown tree. Leaf nodes carry a ``prediction``
        attribute (majority class; a tie at exactly 0.5 predicts 0);
        internal nodes carry ``split_feat`` and ``split_val``.
    """
    print("GROWING CLASSIFICATION TREE")
    # Each node has a name, an array of row indices into x, and a "leaf" flag.
    root = Node('root', indices=np.arange(0, x.shape[0]), leaf=False)
    nodelist = [root]
    split_nr = 0  # used for child-node names
    while nodelist:  # while nodelist not empty
        split_nr += 1
        current_node = nodelist.pop(0)  # FIFO pop -> breadth-first growth
        if nfeat:
            # Random-forest style: draw nfeat distinct column indices.
            feat_list = random.sample(list(np.arange(0, x.shape[1])), k=nfeat)
        else:
            # Consider every column of x.
            feat_list = list(np.arange(0, x.shape[1]))

        feat, split_val = best_split(x, y, current_node, feat_list, minleaf)

        if feat is None and split_val is None:  # no admissible split found
            _make_leaf(current_node, y)
        else:  # apply the split with the highest impurity reduction
            # Record the chosen feature (column nr of x) and split value.
            current_node.split_feat = feat
            current_node.split_val = split_val
            # Left child: rows of the current node whose value in column
            # `feat` exceeds split_val.
            indices_left = current_node.indices[
                x[current_node.indices, feat] > split_val]
            left = Node(f"L{split_nr}",
                        parent=current_node,
                        indices=indices_left)
            _finish_child(left, y, nmin, nodelist)
            # Right child: rows of the current node not sent left.
            indices_right = np.setdiff1d(current_node.indices, indices_left)
            right = Node(f"R{split_nr}",
                         parent=current_node,
                         indices=indices_right)
            _finish_child(right, y, nmin, nodelist)
    print(f"TREE DONE")
    return root


def _make_leaf(node, y):
    """Mark `node` as a leaf and set its majority-class prediction."""
    node.leaf = True
    node.y = y[node.indices]
    # Majority vote over the labels; a tie (exactly 0.5) predicts class 0.
    # NOTE(review): assumes the node is non-empty — minleaf >= 1 should
    # guarantee that; confirm against best_split.
    node.prediction = 1 if sum(node.y) / len(node.y) > 0.5 else 0


def _finish_child(child, y, nmin, nodelist):
    """Finalize a freshly created child node.

    If the child is too small to split (< nmin records) or already pure
    (impurity 0), it becomes a leaf; otherwise it is queued for splitting.
    """
    if len(child.indices) < nmin or impurity(y[child.indices]) == 0:
        _make_leaf(child, y)
    else:
        child.leaf = False
        nodelist.append(child)
Example no. 2
0
def tree_grow(x, y, nfeat, nmin=2, minleaf=1):
    """
    Grow a binary classification tree using gini-index impurity reduction.

    Parameters:
        x (2D array): Data matrix (rows = observations, cols = features).
        y (1D array): Binary class labels (0/1), aligned with the rows of x.
        nfeat (int): # features randomly considered at each split; a falsy
            value (0/None) means all columns of x are considered.
        nmin (int): Min # observations a node must contain to be split.
        minleaf (int): Min # observations required for a leaf node
            (enforced inside bestsplit_of_col).

    Returns:
        Node: root of the grown tree. Leaf nodes carry a ``prediction``
        attribute (majority class; a tie at exactly 0.5 predicts 0);
        internal nodes carry ``split_feat`` and ``split_val``.
    """
    # Each node has a name, an array of row indices into x, and a "leaf" flag.
    root = Node('root',
                indices=np.arange(0, x.shape[0]),
                leaf=False,
                prediction=None,
                split_feat=None,
                split_val=None)
    nodelist = [root]
    split_nr = 0  # used for child-node names
    while nodelist:  # while nodelist not empty
        split_nr += 1
        current_node = nodelist.pop(0)  # FIFO pop -> breadth-first growth
        print(
            f"\n PROCESSING NEW NODE {current_node} on subtree: x = \n {x[current_node.indices, :]} \n y = {y[current_node.indices]}"
        )
        if nfeat:
            # Random-forest style: draw nfeat distinct column indices.
            feat_list = random.sample(list(np.arange(0, x.shape[1])), k=nfeat)
        else:
            # Consider every column of x.
            feat_list = list(np.arange(0, x.shape[1]))
        # For each candidate feature: [feature index, impurity reduction,
        # split value] of the best admissible split on that feature.
        poss_splits = []
        for f in feat_list:
            # Best gini-based split of the current node's rows on column f.
            reduction_val, split_val = bestsplit_of_col(
                x[current_node.indices, f], y[current_node.indices], minleaf)
            if reduction_val != 0:  # an admissible split exists for f
                poss_splits.append([f, reduction_val, split_val])
        if not poss_splits:  # no possible split found -> leaf
            current_node.leaf = True
            # BUG FIX: the prediction must come from the class labels
            # y[current_node.indices], not from the row indices themselves.
            node_y = y[current_node.indices]
            # Majority vote; a tie (exactly 0.5) predicts class 0.
            current_node.prediction = 1 if sum(node_y) / len(node_y) > 0.5 else 0
        else:  # choose the split with the highest impurity reduction
            # Sort by impurity reduction (2nd element), descending.
            # (lambda arg renamed so it no longer shadows parameter x)
            poss_splits.sort(key=lambda s: s[1], reverse=True)
            feat = poss_splits[0][0]
            split_val = poss_splits[0][2]
            # Record the chosen feature (column nr of x) and split value.
            current_node.split_feat = feat
            current_node.split_val = split_val
            # Left child: rows of the current node whose value in column
            # `feat` exceeds split_val.
            indices_left = current_node.indices[x[current_node.indices,
                                                  feat] > split_val]
            left = Node(f"L{split_nr}",
                        parent=current_node,
                        indices=indices_left)
            # Child too small to split, or pure (impurity 0) -> leaf.
            if (len(indices_left) < nmin) or (impurity(y[indices_left]) == 0):
                left.leaf = True
                left.y = y[indices_left]
                left.prediction = 1 if sum(left.y) / len(left.y) > 0.5 else 0
            else:  # queue the child for further splitting
                left.leaf = False
                nodelist.append(left)
            # Right child: rows of the current node not sent left.
            indices_right = np.setdiff1d(current_node.indices, indices_left)
            right = Node(f"R{split_nr}",
                         parent=current_node,
                         indices=indices_right)
            if (len(indices_right) < nmin) or (impurity(y[indices_right])
                                               == 0):  # make child leaf node
                right.leaf = True
                right.y = y[indices_right]
                right.prediction = 1 if sum(right.y) / len(right.y) > 0.5 else 0
            else:  # queue the child for further splitting
                right.leaf = False
                nodelist.append(right)
            print(
                f"Finished processing node, tree now looks as follows: \n {RenderTree(root)}"
            )
    print(f"\n TREE DONE: \n {RenderTree(root)}")
    return root