Code Example #1
import operator

def get_best_feature_gini(data_set, features_list, is_features_discrete):
    best_gini_index = float('inf')
    best_gini_index_index = -1
    continuous_feature_value = None
    for i in range(len(features_list)):
        # Collect the distinct values this feature takes in the data set
        feature_values_list = [sample[i] for sample in data_set]
        feature_values_list = set(feature_values_list)
        # Handle discrete features
        if is_features_discrete[i] == 1:
            gini_index = 0.0
            # Split the data set into one subset per distinct value of this feature
            for value in feature_values_list:
                sub_data_set = split_data_set_by_operate(data_set,
                                                         i,
                                                         value,
                                                         operator.eq,
                                                         delete_col=False)
                prob = float(len(sub_data_set)) / len(data_set)
                gini_index += prob * calculate_gini(sub_data_set)
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_gini_index_index = i
                # Defensive reset: the best feature is now discrete, so any
                # split value left from an earlier continuous candidate is stale
                continuous_feature_value = None
        # Handle continuous features
        else:
            continuous_best_gini_index = float('inf')
            continuous_best_feature_value = -1
            feature_values_list = sorted(feature_values_list, reverse=False)
            feature_values_mid_value_list = []
            # For a continuous feature, sort its distinct values and take the
            # midpoint of each adjacent pair of values as a candidate split point
            for j in range(len(feature_values_list) - 1):
                feature_values_mid_value_list.append(
                    (feature_values_list[j] + feature_values_list[j + 1]) / 2)
            # Evaluate every candidate midpoint
            for value in feature_values_mid_value_list:
                # Each midpoint splits the data into two parts
                sub_data_set_le = split_data_set_by_operate(data_set,
                                                            i,
                                                            value,
                                                            operator.le,
                                                            delete_col=True)
                sub_data_set_gt = split_data_set_by_operate(data_set,
                                                            i,
                                                            value,
                                                            operator.gt,
                                                            delete_col=True)
                prob = float(len(sub_data_set_le)) / len(data_set)
                gini_index = prob * calculate_gini(sub_data_set_le) + (
                    1 - prob) * calculate_gini(sub_data_set_gt)
                if gini_index < continuous_best_gini_index:
                    continuous_best_gini_index = gini_index
                    continuous_best_feature_value = value
            if continuous_best_gini_index < best_gini_index:
                best_gini_index = continuous_best_gini_index
                best_gini_index_index = i
                continuous_feature_value = continuous_best_feature_value
    return best_gini_index_index, continuous_feature_value
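
The snippet above leans on two helpers defined elsewhere in the source project. A minimal sketch of what they plausibly look like, assuming each sample is a list whose last element is the class label (an assumption, not the project's actual code):

import operator
from collections import Counter

def calculate_gini(data_set):
    # Gini(D) = 1 - sum_k p_k^2, where p_k is the fraction of samples of class k
    if len(data_set) == 0:
        return 0.0
    total = float(len(data_set))
    counts = Counter(sample[-1] for sample in data_set)
    return 1.0 - sum((count / total) ** 2 for count in counts.values())

def split_data_set_by_operate(data_set, feature_index, value, op, delete_col):
    # Keep the samples whose feature_index-th value satisfies op(sample_value, value);
    # with delete_col=True the matched column is removed from the returned rows
    sub_data_set = []
    for sample in data_set:
        if op(sample[feature_index], value):
            if delete_col:
                sub_data_set.append(sample[:feature_index] + sample[feature_index + 1:])
            else:
                sub_data_set.append(sample[:])
    return sub_data_set
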
Code Example #2
import operator

def tree_generator_with_post_pruning(input_tree, data_set_train,
                                     data_set_validate, features_list,
                                     features_dict, is_features_discrete):
    """
    Post-pruning tree generation
    :param input_tree:
    :param data_set_train:
    :param data_set_validate:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # A leaf node is stored as its class-label string
    if isinstance(input_tree, str):
        return input_tree
    feature_name = list(input_tree.keys())[0]
    sub_tree = input_tree[feature_name]
    feature_index = features_list.index(feature_name)

    for value in sub_tree:
        value_key = get_keys_for_dict(features_dict[feature_name], value)[0]

        new_features_list = features_list[:]
        new_is_features_discrete = is_features_discrete[:]
        del new_features_list[feature_index]
        del new_is_features_discrete[feature_index]

        sub_data_set_train = split_data_set_by_operate(data_set_train,
                                                       feature_index,
                                                       value_key,
                                                       operator.eq,
                                                       delete_col=True)
        sub_data_set_validate = split_data_set_by_operate(data_set_validate,
                                                          feature_index,
                                                          value_key,
                                                          operator.eq,
                                                          delete_col=True)
        input_tree[feature_name][value] = tree_generator_with_post_pruning(
            sub_tree[value], sub_data_set_train, sub_data_set_validate,
            new_features_list, features_dict, new_is_features_discrete)
    # After the subtrees have been processed, compare the validation error
    # with and without this subtree to decide whether to prune it
    error_count_by_tree = get_validation_error_count_by_tree(
        input_tree, data_set_validate, features_list, features_dict,
        is_features_discrete)
    error_count_by_major_class = get_validation_error_count_by_major_class(
        get_most_common_class(data_set_train), data_set_validate)
    if error_count_by_tree <= error_count_by_major_class:
        return input_tree
    return get_most_common_class(data_set_train)
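
Two more of the helpers this post-pruning routine calls, again as hedged sketches rather than the project's definitions (get_validation_error_count_by_tree is omitted because it needs the full tree-classification routine):

from collections import Counter

def get_most_common_class(data_set):
    # Majority class among the samples' last-column labels
    return Counter(sample[-1] for sample in data_set).most_common(1)[0][0]

def get_validation_error_count_by_major_class(major_class, data_set_validate):
    # Number of validation samples misclassified by always predicting major_class
    return sum(1 for sample in data_set_validate if sample[-1] != major_class)
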
Code Example #3
import operator

def tree_generate_without_pruning(data_set_train, features_list, features_dict,
                                  is_features_discrete):
    """
    Tree generation without pruning
    :param data_set_train:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # Copy the lists so the caller's copies are not mutated
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]

    # Check whether all samples already belong to the same class
    class_list = [sample[-1] for sample in data_set_train]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]

    # Stop if features_list is empty or every sample takes the same value on
    # every remaining feature
    if len(features_list) == 0 or is_all_sample_same(data_set_train):
        return get_most_common_class(data_set_train)

    # Pick the best splitting feature from the candidate set
    best_feature_index, best_continuous_feature_value = get_best_feature_gini(
        data_set_train, features_list, is_features_discrete)
    best_feature_name = features_list[best_feature_index]

    if is_features_discrete[best_feature_index] == 1:
        # The feature is discrete: remove it from the candidate feature lists
        del features_list[best_feature_index]
        del is_features_discrete[best_feature_index]
        tree = {best_feature_name: {}}
        feature_values_list = features_dict[best_feature_name].keys()

        for feature_value in feature_values_list:
            sub_data_set_train = split_data_set_by_operate(data_set_train,
                                                           best_feature_index,
                                                           feature_value,
                                                           operator.eq,
                                                           delete_col=True)
            feature_value_name = features_dict[best_feature_name][
                feature_value]
            # If the subset for this value is empty, mark the branch as a leaf
            # labelled with the majority class of data_set_train
            if len(sub_data_set_train) == 0:
                tree[best_feature_name][
                    feature_value_name] = get_most_common_class(data_set_train)
            # Otherwise recurse on the non-empty subset
            else:
                tree[best_feature_name][feature_value_name] = \
                    tree_generate_without_pruning(sub_data_set_train, features_list, features_dict, is_features_discrete)
    else:
        # A continuous feature is not removed from the data: unlike a discrete
        # feature, it may be used again to split descendant nodes
        key = '%s<=%.3f' % (best_feature_name,
                            best_continuous_feature_value)
        tree = {key: {}}
        sub_data_set_le = split_data_set_by_operate(
            data_set_train,
            best_feature_index,
            best_continuous_feature_value,
            operator.le,
            delete_col=False)
        sub_data_set_gt = split_data_set_by_operate(
            data_set_train,
            best_feature_index,
            best_continuous_feature_value,
            operator.gt,
            delete_col=False)
        # '是' ("yes") is the <= branch; '否' ("no") is the > branch
        tree[key]['是'] = tree_generate_without_pruning(sub_data_set_le,
                                                       features_list,
                                                       features_dict,
                                                       is_features_discrete)
        tree[key]['否'] = tree_generate_without_pruning(sub_data_set_gt,
                                                       features_list,
                                                       features_dict,
                                                       is_features_discrete)
    return tree
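
A hypothetical call showing the expected input shapes; the feature names, value encodings, and data below are made up for illustration:

features_list = ['color', 'density']
is_features_discrete = [1, 0]  # 1 = discrete, 0 = continuous
features_dict = {'color': {0: 'green', 1: 'black'}}  # raw value -> display name
data_set_train = [
    [0, 0.697, 'good'],
    [1, 0.774, 'good'],
    [1, 0.245, 'bad'],
    [0, 0.343, 'bad'],
]
tree = tree_generate_without_pruning(data_set_train, features_list,
                                     features_dict, is_features_discrete)
# With these toy rows the density split at the 0.520 midpoint separates the
# classes perfectly, giving {'density<=0.520': {'是': 'bad', '否': 'good'}}
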
Code Example #4
import operator

def tree_generate_with_pre_pruning(data_set_train, data_set_validate,
                                   features_list, features_dict,
                                   is_features_discrete):
    """
    Pre-pruning tree generation
    :param data_set_train:
    :param data_set_validate:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # Copy the lists so the caller's copies are not mutated
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]

    # Check whether all samples already belong to the same class
    class_list = [sample[-1] for sample in data_set_train]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]

    # Stop if features_list is empty or every sample takes the same value on
    # every remaining feature
    if len(features_list) == 0 or is_all_sample_same(data_set_train):
        return get_most_common_class(data_set_train)

    # Pick the best splitting feature from the candidate set
    best_feature_index, best_continuous_feature_value = get_best_feature_gini(
        data_set_train, features_list, is_features_discrete)
    best_feature_name = features_list[best_feature_index]

    accuracy_rate_before_pruning = get_validation_error_before_pruning(
        data_set_train, data_set_validate)
    accuracy_rate_after_pruning = get_validation_error_after_pruning(
        data_set_train, data_set_validate, best_feature_index,
        is_features_discrete[best_feature_index],
        best_continuous_feature_value)
    # If splitting does not improve validation accuracy, do not split further
    if accuracy_rate_before_pruning >= accuracy_rate_after_pruning:
        most_common_class = get_most_common_class(data_set_train)
        return most_common_class
    # Splitting improves validation accuracy, so keep splitting
    else:
        if is_features_discrete[best_feature_index] == 1:
            del features_list[best_feature_index]
            del is_features_discrete[best_feature_index]
            tree = {best_feature_name: {}}
            feature_values_list = [
                sample[best_feature_index] for sample in data_set_train
            ]
            feature_values_list = set(feature_values_list)

            for feature_value in feature_values_list:
                sub_data_set_train = split_data_set_by_operate(
                    data_set_train,
                    best_feature_index,
                    feature_value,
                    operator.eq,
                    delete_col=True)
                sub_data_set_validate = split_data_set_by_operate(
                    data_set_validate,
                    best_feature_index,
                    feature_value,
                    operator.eq,
                    delete_col=True)
                feature_value_name = features_dict[best_feature_name][
                    feature_value]
                tree[best_feature_name][feature_value_name] = \
                    tree_generate_with_pre_pruning(sub_data_set_train, sub_data_set_validate,
                                                   features_list, features_dict, is_features_discrete)
        else:
            key = '%s<=%.3f' % (best_feature_name,
                                best_continuous_feature_value)
            tree = {key: {}}
            # Split the training and validation sets
            sub_data_set_train_le = split_data_set_by_operate(
                data_set_train,
                best_feature_index,
                best_continuous_feature_value,
                operator.le,
                delete_col=False)
            sub_data_set_validate_le = split_data_set_by_operate(
                data_set_validate,
                best_feature_index,
                best_continuous_feature_value,
                operator.le,
                delete_col=False)
            sub_data_set_train_gt = split_data_set_by_operate(
                data_set_train,
                best_feature_index,
                best_continuous_feature_value,
                operator.gt,
                delete_col=False)
            sub_data_set_validate_gt = split_data_set_by_operate(
                data_set_validate,
                best_feature_index,
                best_continuous_feature_value,
                operator.gt,
                delete_col=False)
            # Generate the child nodes
            tree[key]['是'] = tree_generate_with_pre_pruning(
                sub_data_set_train_le, sub_data_set_validate_le, features_list,
                features_dict, is_features_discrete)
            tree[key]['否'] = tree_generate_with_pre_pruning(
                sub_data_set_train_gt, sub_data_set_validate_gt, features_list,
                features_dict, is_features_discrete)
        return tree
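
The snippet treats the two validation helpers as returning accuracy rates (despite the "error" in their names). Minimal sketches consistent with that reading, reusing the helper sketches from the earlier examples; these are assumptions, not the project's definitions:

import operator

def get_validation_error_before_pruning(data_set_train, data_set_validate):
    # Validation accuracy when the node stays a leaf that predicts the
    # training set's majority class
    major_class = get_most_common_class(data_set_train)
    correct = sum(1 for s in data_set_validate if s[-1] == major_class)
    return float(correct) / len(data_set_validate)

def get_validation_error_after_pruning(data_set_train, data_set_validate,
                                       feature_index, is_discrete,
                                       continuous_value):
    # Validation accuracy after a one-level split, labelling every branch
    # with that branch's training majority class
    if is_discrete == 1:
        branches = [(operator.eq, v)
                    for v in set(s[feature_index] for s in data_set_train)]
    else:
        branches = [(operator.le, continuous_value),
                    (operator.gt, continuous_value)]
    correct = 0
    for op, value in branches:
        branch_train = split_data_set_by_operate(
            data_set_train, feature_index, value, op, delete_col=False)
        branch_validate = split_data_set_by_operate(
            data_set_validate, feature_index, value, op, delete_col=False)
        if branch_train:
            branch_class = get_most_common_class(branch_train)
            correct += sum(1 for s in branch_validate if s[-1] == branch_class)
    return float(correct) / len(data_set_validate)
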
Code Example #5

import operator
import random

def tree_generate_with_random_feature_selection(data_set, features_list,
                                                features_dict,
                                                is_features_discrete):
    """
    Generate a decision tree by selecting the splitting feature at random
    :param data_set:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # Copy the lists so the caller's copies are not mutated
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]
    # Check whether all samples already belong to the same class
    samples_class = [sample[-1] for sample in data_set]
    if samples_class.count(samples_class[0]) == len(samples_class):
        return samples_class[0]
    if len(features_list) == 0 or is_all_sample_same(data_set):
        return get_most_common_class(data_set)
    # Pick the splitting feature uniformly at random instead of by gini or gain
    best_feature_index = random.randint(0, len(features_list) - 1)
    best_feature_name = features_list[best_feature_index]

    if is_features_discrete[best_feature_index] == 1:
        # The feature is discrete: remove it from the candidate feature lists
        del features_list[best_feature_index]
        del is_features_discrete[best_feature_index]
        tree = {best_feature_name: {}}
        feature_value_set = features_dict[best_feature_name].keys()

        for feature_value in feature_value_set:
            # The lists are copied at the top of each recursive call; without
            # that copy, each descent would delete an entry from the shared
            # list and break the sibling branches visited after recursion returns

            sub_data_set_train = split_data_set_by_operate(data_set,
                                                           best_feature_index,
                                                           feature_value,
                                                           operator.eq,
                                                           delete_col=True)
            feature_value_name = features_dict[best_feature_name][
                feature_value]
            # If the subset for this value is empty, mark the branch as a leaf
            # labelled with the majority class of data_set
            if len(sub_data_set_train) == 0:
                tree[best_feature_name][
                    feature_value_name] = get_most_common_class(data_set)
            # Otherwise recurse on the non-empty subset
            else:
                tree[best_feature_name][feature_value_name] = \
                    tree_generate_with_random_feature_selection(sub_data_set_train, features_list, features_dict,
                                                                is_features_discrete)
    else:
        # The feature is continuous
        feature_values_mid_value_list = []
        feature_value_set = [sample[best_feature_index] for sample in data_set]
        feature_value_set = set(feature_value_set)
        feature_value_set = sorted(feature_value_set, reverse=False)
        for i in range(len(feature_value_set) - 1):
            feature_values_mid_value_list.append(
                (feature_value_set[i] + feature_value_set[i + 1]) / 2)
        # Randomly pick one midpoint as the split value (assumes the feature
        # takes at least two distinct values, so the midpoint list is non-empty)
        best_continuous_feature_value = random.choice(
            feature_values_mid_value_list)
        key = '%s<=%.3f' % (best_feature_name,
                            best_continuous_feature_value)
        tree = {key: {}}
        sub_data_set_le = split_data_set_by_operate(
            data_set,
            best_feature_index,
            best_continuous_feature_value,
            operator.le,
            delete_col=False)
        sub_data_set_gt = split_data_set_by_operate(
            data_set,
            best_feature_index,
            best_continuous_feature_value,
            operator.gt,
            delete_col=False)
        tree[key]['是'] = tree_generate_with_random_feature_selection(
            sub_data_set_le, features_list, features_dict,
            is_features_discrete)
        tree[key]['否'] = tree_generate_with_random_feature_selection(
            sub_data_set_gt, features_list, features_dict,
            is_features_discrete)
    return tree
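
Random feature selection like this is typically used to grow the members of a random-forest-style ensemble. A hypothetical wrapper, with the bootstrap step as an illustrative assumption (it also assumes every continuous feature keeps at least two distinct values in each bootstrap sample):

import random

def build_random_tree_ensemble(data_set, features_list, features_dict,
                               is_features_discrete, n_trees=10):
    # Grow one randomized tree per bootstrap sample of the training data
    trees = []
    for _ in range(n_trees):
        bootstrap = [random.choice(data_set) for _ in range(len(data_set))]
        trees.append(tree_generate_with_random_feature_selection(
            bootstrap, features_list, features_dict, is_features_discrete))
    return trees
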
Code Example #6
import operator
from math import log

def get_best_feature(data_set, features, is_features_discrete, ID3_or_C45):
    """
    Find the best splitting feature among features
    :param data_set:
    :param features:
    :param is_features_discrete:
    :param ID3_or_C45: 0 selects ID3 (information gain), 1 selects C4.5 (gain ratio)
    :return:
    """
    feature_num = len(features)
    base_entropy = calculate_information_entropy(data_set)
    gain_list = []
    gain_ratio_list = []
    continuous_feature_value_list = []
    epsilon = 1e-5
    for i in range(feature_num):
        # Handle discrete features: the feature's value set already defines
        # the partition, so no extra split search is needed
        if is_features_discrete[i] == 1:
            feature_values_list = [sample[i] for sample in data_set]
            feature_values_list = set(feature_values_list)
            entropy = 0.0
            iv = 0.0
            for feature_value in feature_values_list:
                sub_data_set = split_data_set_by_operate(data_set,
                                                         i,
                                                         feature_value,
                                                         operator.eq,
                                                         delete_col=True)
                prob = float(len(sub_data_set)) / len(data_set)
                entropy += prob * calculate_information_entropy(sub_data_set)
                iv += prob * log(prob, 2)
            gain = base_entropy - entropy
            gain_ratio = gain / (-iv + epsilon)
            gain_list.append(gain)
            gain_ratio_list.append(gain_ratio)
            continuous_feature_value_list.append(None)
        # Handle continuous features: sort the distinct values, take the
        # midpoint of each adjacent pair as a candidate split point, and keep
        # the split with the largest information gain
        else:
            continuous_best_gain = 0.0
            continuous_best_feature_value = -1
            continuous_best_gain_ratio = 0.0

            feature_values_list = [sample[i] for sample in data_set]
            feature_values_list = set(feature_values_list)
            feature_values_list = sorted(feature_values_list, reverse=False)
            feature_values_mid_value_list = []
            for j in range(len(feature_values_list) - 1):
                feature_values_mid_value_list.append(
                    (feature_values_list[j] + feature_values_list[j + 1]) / 2)
            for feature_value in feature_values_mid_value_list:
                sub_data_set_le = split_data_set_by_operate(data_set,
                                                            i,
                                                            feature_value,
                                                            operator.le,
                                                            delete_col=True)
                sub_data_set_gt = split_data_set_by_operate(data_set,
                                                            i,
                                                            feature_value,
                                                            operator.gt,
                                                            delete_col=True)
                prob = float(len(sub_data_set_le)) / len(data_set)
                entropy = prob * calculate_information_entropy(sub_data_set_le) + \
                          (1-prob) * calculate_information_entropy(sub_data_set_gt)
                gain = base_entropy - entropy
                iv = prob * log(prob, 2) + (1 - prob) * log(1 - prob, 2)
                # epsilon keeps the division consistent with the discrete branch
                gain_ratio = gain / (-iv + epsilon)
                if gain > continuous_best_gain:
                    continuous_best_gain = gain
                    continuous_best_feature_value = feature_value
                    continuous_best_gain_ratio = gain_ratio
            gain_list.append(continuous_best_gain)
            gain_ratio_list.append(continuous_best_gain_ratio)
            continuous_feature_value_list.append(continuous_best_feature_value)
    # ID3: pick the feature with the highest information gain
    if ID3_or_C45 == 0:
        max_gain_index = gain_list.index(max(gain_list))
        return max_gain_index, continuous_feature_value_list[max_gain_index]
    # C4.5: first keep candidates whose information gain is at least average,
    # then choose the one with the highest gain ratio among them
    elif ID3_or_C45 == 1:
        average_gain = sum(gain_list) / len(gain_list)
        max_gain_ratio = 0
        max_gain_ratio_index = -1
        for i in range(len(gain_list)):
            # >= keeps a valid index even when every gain equals the average
            # (with a strict >, max_gain_ratio_index could stay -1)
            if gain_list[i] >= average_gain and gain_ratio_list[
                    i] > max_gain_ratio:
                max_gain_ratio = gain_ratio_list[i]
                max_gain_ratio_index = i
        return max_gain_ratio_index, continuous_feature_value_list[
            max_gain_ratio_index]
    else:
        raise NameError('ID3_or_C45 should be 0 or 1')
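
calculate_information_entropy is defined elsewhere in the source project; a minimal sketch consistent with how it is used above, assuming the class label is each sample's last element (split_data_set_by_operate was sketched after Code Example #1):

from math import log
from collections import Counter

def calculate_information_entropy(data_set):
    # Ent(D) = -sum_k p_k * log2(p_k) over the class distribution of data_set
    if len(data_set) == 0:
        return 0.0
    total = float(len(data_set))
    counts = Counter(sample[-1] for sample in data_set)
    return -sum((c / total) * log(c / total, 2) for c in counts.values())
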
Code Example #7
import operator

def tree_generate(data_set, features_list, features_dict, is_features_discrete,
                  ID3_or_C45):
    # Copy the lists so the caller's copies are not mutated: tree_generate
    # deletes entries from features_list as it recurses, so passing the same
    # list into every call would corrupt the sibling branches on the way back
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]

    # Check whether all samples already belong to the same class
    class_list = [sample[-1] for sample in data_set]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]

    # Stop if features_list is empty or every sample takes the same value on
    # every remaining feature
    if len(features_list) == 0 or is_all_sample_same(data_set):
        return get_most_common_class(data_set)

    # Pick the best splitting feature from the candidate set
    best_feature_index, best_continuous_feature_value = get_best_feature(
        data_set, features_list, is_features_discrete, ID3_or_C45)
    best_feature_name = features_list[best_feature_index]
    # If the feature is discrete, remove it from the candidate feature lists
    if is_features_discrete[best_feature_index] == 1:
        tree = {best_feature_name: {}}

        del is_features_discrete[best_feature_index]
        del features_list[best_feature_index]

        feature_values_list = features_dict[best_feature_name].keys()
        for feature_value in feature_values_list:
            # Recurse to grow the subtree for this value
            sub_data_set = split_data_set_by_operate(data_set,
                                                     best_feature_index,
                                                     feature_value,
                                                     operator.eq,
                                                     delete_col=True)
            feature_value_name = features_dict[best_feature_name][
                feature_value]
            # If the subset for this value is empty, mark the branch as a leaf
            # labelled with the majority class of data_set
            if len(sub_data_set) == 0:
                tree[best_feature_name][
                    feature_value_name] = get_most_common_class(data_set)
            # Otherwise recurse on the non-empty subset
            else:
                tree[best_feature_name][feature_value_name] = \
                    tree_generate(sub_data_set, features_list, features_dict, is_features_discrete, ID3_or_C45)
    # A continuous feature is not removed from the data: unlike a discrete
    # feature, it may be used again to split descendant nodes
    else:
        key = '%s<=%.3f' % (best_feature_name,
                            best_continuous_feature_value)
        tree = {key: {}}
        tree[key]['是'] = tree_generate(
            split_data_set_by_operate(data_set,
                                      best_feature_index,
                                      best_continuous_feature_value,
                                      operator.le,
                                      delete_col=False), features_list,
            features_dict, is_features_discrete, ID3_or_C45)
        tree[key]['否'] = tree_generate(
            split_data_set_by_operate(data_set,
                                      best_feature_index,
                                      best_continuous_feature_value,
                                      operator.gt,
                                      delete_col=False), features_list,
            features_dict, is_features_discrete, ID3_or_C45)
    return tree
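
A hypothetical call showing how the ID3_or_C45 switch is used, reusing the toy inputs from the example after Code Example #3:

tree_id3 = tree_generate(data_set_train, features_list, features_dict,
                         is_features_discrete, ID3_or_C45=0)  # ID3: highest gain
tree_c45 = tree_generate(data_set_train, features_list, features_dict,
                         is_features_discrete, ID3_or_C45=1)  # C4.5: gain ratio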