Example #1
0
    def __build_tree(self, X, y, n_features, feature_indices, depth):
        """Recursively grow a (sub)tree for the samples in ``X`` / ``y``.

        Growth stops, and a ``Leaf`` storing the mean of ``y`` is returned,
        when the node holds at most ``self.min_samples_split`` samples, when
        ``depth`` equals ``self.max_depth``, or when the chosen split would
        leave one side without samples.

        Args:
            X: Feature matrix of the samples reaching this node.
            y: Target values aligned with the rows of ``X``.
            n_features: Number of features from which the candidate split
                features of the child nodes are sampled.
            feature_indices: Candidate feature indices for this node's split.
            depth: Depth of this node, or ``None`` to disable the depth check.

        Returns:
            A ``Leaf`` when growth stops, otherwise a ``Node`` whose
            ``branch_true`` and ``branch_false`` hold the two subtrees.
        """
        node_data_set = np.column_stack((X, y))
        sample_size = len(y)

        def make_leaf():
            # Leaves carry the mean target plus the data that reached them.
            return Leaf(estimated_value=np.mean(y),
                        sample_size=sample_size,
                        leaf_data_set=node_data_set)

        depth_limit_hit = depth is not None and depth == self.max_depth
        if sample_size <= self.min_samples_split or depth_limit_hit:
            return make_leaf()

        # Find the splitting feature and its best threshold.
        best_feature_index, threshold, min_mes = find_split(
            X, y, self.criterion, feature_indices)

        # Partition the samples into the true / false branches.
        X_true, y_true, X_false, y_false = split(X, y, best_feature_index,
                                                 threshold)

        # A split that sends every sample to one side separates nothing:
        # the empty side would become a leaf whose value is the mean of an
        # empty array (NaN) and the other side would recurse on the very
        # same data. Stop here instead.
        if len(y_true) == 0 or len(y_false) == 0:
            return make_leaf()

        node = Node(feature_index=best_feature_index,
                    threshold=threshold,
                    min_mes=min_mes,
                    sample_size=sample_size,
                    node_data_set=node_data_set)

        # Keep ``None`` as ``None`` so an untracked depth does not crash on
        # ``None + 1`` in the recursive calls.
        child_depth = None if depth is None else depth + 1
        n_sub_features = int(self.max_features)

        # Draw a fresh random feature subset, then build the true branch.
        feature_indices = random.sample(range(n_features), n_sub_features)
        node.branch_true = self.__build_tree(X_true, y_true, n_features,
                                             feature_indices, child_depth)

        # Draw another random feature subset, then build the false branch.
        feature_indices = random.sample(range(n_features), n_sub_features)
        node.branch_false = self.__build_tree(X_false, y_false, n_features,
                                              feature_indices, child_depth)

        return node
Example #2
0
    def build_tree(self, X, y, feature_indices, fa_feature_index,
                   select_feature_fa, father_node, depth):
        """Recursively build a decision tree and record its leaves.

        Side effects: appends to ``self.select_feature`` and
        ``self.sample_num`` on every call, to ``self.gini_`` when the
        criterion is ``'gini'``, and to ``self.leaf_list`` for every ``Leaf``
        created. ``select_feature_fa`` is extended in place with
        ``fa_feature_index``.

        Args:
            X: Feature matrix of the samples reaching this node.
            y: Labels aligned with the rows of ``X``.
            feature_indices: Randomly selected candidate features for this
                node's split.
            fa_feature_index: Feature the parent node split on (-1 for the
                initial call).
            select_feature_fa: Split features chosen by the ancestors of
                this node.
            father_node: Parent of this node, stored as ``prior_node``.
            depth: Depth of this node in the tree.

        Returns:
            A ``Node`` for an internal node, or a ``Leaf`` when growth stops.
            With the ``'entropy'`` criterion the stop condition returns the
            bare majority label instead of a ``Leaf``.
        """
        select_feature_fa.append(fa_feature_index)
        n_features = X.shape[1]
        all_features = list(range(n_features))
        # Record the candidate features and the sample count of this node.
        self.select_feature.append(feature_indices)
        self.sample_num.append(len(y))

        node_data_set = np.column_stack((X, y))

        def pick_leaf_feature():
            """Pick a random second feature for a leaf, with its max / min."""
            # Prefer a feature not yet used on the path from the root. The
            # path also contains the root's -1 sentinel, so comparing the two
            # sets for equality is not a reliable "everything used" test;
            # fall back to all features whenever nothing unused remains
            # rather than calling randrange(0).
            unused = set(all_features) - set(select_feature_fa)
            candidates = list(unused) if unused else all_features
            chosen = candidates[random.randrange(len(candidates))]
            return chosen, np.max(X[:, chosen]), np.min(X[:, chosen])

        # Stop condition for the entropy criterion.
        if self.criterion == 'entropy':
            # Values are compared with ``==``: identity checks on numbers
            # are unreliable (a float 0.0 is never ``is 0``).
            if (depth == self.max_depth
                    or len(y) < self.min_samples_split
                    or entropy(y) == 0):
                # NOTE(review): this path returns the raw majority label,
                # not a Leaf, and does not register it in self.leaf_list.
                return mode(y)[0][0]

        # Stop condition for the gini criterion.
        if self.criterion == 'gini':
            temp_gini = gini(y)
            self.gini_.append(temp_gini)
            sample_num = len(y)
            if (depth == self.max_depth
                    or sample_num < self.min_samples_split
                    or temp_gini < self.min_impurity_split):
                # Give the leaf a second feature so it spans two features.
                (current_feature_index, current_max_value,
                 current_min_value) = pick_leaf_feature()

                leaf = Leaf(mode(y)[0][0], fa_feature_index,
                            np.max(X[:, fa_feature_index]),
                            np.min(X[:, fa_feature_index]),
                            current_feature_index, current_max_value,
                            current_min_value, select_feature_fa,
                            node_data_set, sample_num,
                            prior_node=father_node)
                self.leaf_list.append(leaf)
                return leaf

        # Best split feature, its threshold, its value range and the
        # impurity reported by find_split.
        feature_index, threshold, max_value, min_value, gini_ = find_split(
            X, y, self.criterion, feature_indices)

        # Range of the parent's split feature over this node's samples.
        fa_max_value = np.max(X[:, fa_feature_index])
        fa_min_value = np.min(X[:, fa_feature_index])

        # Partition the samples into the true / false branches.
        X_true, y_true, X_false, y_false = split(X, y, feature_index,
                                                 threshold)

        # One side received no samples: turn this node into a leaf.
        if y_true.shape[0] == 0 or y_false.shape[0] == 0:
            (current_feature_index, current_max_value,
             current_min_value) = pick_leaf_feature()

            leaf = Leaf(mode(y)[0][0], fa_feature_index,
                        np.max(X[:, fa_feature_index]),
                        np.min(X[:, fa_feature_index]),
                        current_feature_index, current_max_value,
                        current_min_value, select_feature_fa, node_data_set,
                        prior_node=father_node, sample_num=0)

            self.leaf_list.append(leaf)
            return leaf

        node = Node(feature_index=feature_index,
                    fa_feature_index=fa_feature_index,
                    threshold=threshold,
                    max_value=max_value,
                    min_value=min_value,
                    fa_max_value=fa_max_value,
                    fa_min_value=fa_min_value,
                    gini_coefficient=gini_,
                    node_data_set=node_data_set)

        n_sub_features = int(self.max_features)

        # Draw a random feature subset and build the true branch. Each child
        # gets its own copy of the path so siblings do not share history.
        feature_indices = random.sample(range(n_features), n_sub_features)
        select_feature = list(select_feature_fa)
        node.branch_true = self.build_tree(X_true, y_true, feature_indices,
                                           feature_index, select_feature,
                                           node, depth + 1)

        # Draw another random feature subset and build the false branch.
        feature_indices = random.sample(range(n_features), n_sub_features)
        select_feature = list(select_feature_fa)
        node.branch_false = self.build_tree(X_false, y_false, feature_indices,
                                            feature_index, select_feature,
                                            node, depth + 1)

        # Link this node back to its predecessor.
        node.prior_node = father_node

        return node