import operator
import random
from math import log

# NOTE: helper functions used below (split_data_set_by_operate, calculate_gini,
# calculate_information_entropy, get_most_common_class, is_all_sample_same,
# get_keys_for_dict, get_validation_error_count_by_tree,
# get_validation_error_count_by_major_class, get_validation_error_before_pruning,
# get_validation_error_after_pruning) are assumed to be defined elsewhere in
# this module.


def get_best_feature_gini(data_set, features_list, is_features_discrete):
    """Pick the feature with the smallest Gini index (CART-style splitting)."""
    best_gini_index = 100
    best_gini_index_index = -1
    continuous_feature_value = None
    for i in range(len(features_list)):
        feature_values_list = set([sample[i] for sample in data_set])
        # Discrete feature: split the data set into one subset per feature value.
        if is_features_discrete[i] == 1:
            gini_index = 0.0
            for value in feature_values_list:
                sub_data_set = split_data_set_by_operate(data_set, i, value,
                                                         operator.eq,
                                                         delete_col=False)
                prob = float(len(sub_data_set)) / len(data_set)
                gini_index += prob * calculate_gini(sub_data_set)
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_gini_index_index = i
                # The current best feature is discrete, so clear any threshold
                # left over from a previously best continuous feature.
                continuous_feature_value = None
        # Continuous feature: sort the distinct values and use the midpoint of
        # every pair of adjacent values as a candidate split point.
        else:
            continuous_best_gini_index = 100
            continuous_best_feature_value = -1
            feature_values_list = sorted(feature_values_list)
            feature_values_mid_value_list = []
            for j in range(len(feature_values_list) - 1):
                feature_values_mid_value_list.append(
                    (feature_values_list[j] + feature_values_list[j + 1]) / 2)
            # Each candidate midpoint splits the data into two parts (<= and >).
            for value in feature_values_mid_value_list:
                sub_data_set_le = split_data_set_by_operate(data_set, i, value,
                                                            operator.le,
                                                            delete_col=True)
                sub_data_set_gt = split_data_set_by_operate(data_set, i, value,
                                                            operator.gt,
                                                            delete_col=True)
                prob = float(len(sub_data_set_le)) / len(data_set)
                gini_index = prob * calculate_gini(sub_data_set_le) + \
                    (1 - prob) * calculate_gini(sub_data_set_gt)
                if gini_index < continuous_best_gini_index:
                    continuous_best_gini_index = gini_index
                    continuous_best_feature_value = value
            if continuous_best_gini_index < best_gini_index:
                best_gini_index = continuous_best_gini_index
                best_gini_index_index = i
                continuous_feature_value = continuous_best_feature_value
    return best_gini_index_index, continuous_feature_value

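
# Worked example (added for clarity; the helper below is hypothetical and is
# not called anywhere in this module). It shows the quantity that
# get_best_feature_gini minimises: the Gini value of each subset,
# Gini(D) = 1 - sum_k p_k^2, weighted by the subset sizes |D_v| / |D|.
def _demo_gini_of_split(label_groups):
    """Illustrative only: weighted Gini index of a split, given the class
    labels of each subset."""
    total = sum(len(group) for group in label_groups)
    weighted = 0.0
    for group in label_groups:
        counts = {}
        for label in group:
            counts[label] = counts.get(label, 0) + 1
        gini = 1.0 - sum((c / len(group)) ** 2 for c in counts.values())
        weighted += (len(group) / total) * gini
    return weighted


# A perfectly pure split scores 0.0, a fully mixed two-class split scores 0.5:
#   _demo_gini_of_split([['yes', 'yes'], ['no', 'no']])  -> 0.0
#   _demo_gini_of_split([['yes', 'no'], ['yes', 'no']])  -> 0.5
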
def tree_generator_with_post_pruning(input_tree, data_set_train,
                                     data_set_validate, features_list,
                                     features_dict, is_features_discrete):
    """
    Post-pruning: walk a fully grown tree bottom-up and replace a subtree with
    a leaf (the majority class of its training samples) whenever the subtree
    does not beat that leaf on the validation set.
    :param input_tree:
    :param data_set_train:
    :param data_set_validate:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # A leaf node is stored as a plain class-label string; nothing to prune.
    if isinstance(input_tree, str):
        return input_tree
    feature_name = list(input_tree.keys())[0]
    sub_tree = input_tree[feature_name]
    feature_index = features_list.index(feature_name)
    for value in sub_tree:
        value_key = get_keys_for_dict(features_dict[feature_name], value)[0]
        new_features_list = features_list[:]
        new_is_features_discrete = is_features_discrete[:]
        del new_features_list[feature_index]
        del new_is_features_discrete[feature_index]
        sub_data_set_train = split_data_set_by_operate(data_set_train,
                                                       feature_index,
                                                       value_key,
                                                       operator.eq,
                                                       delete_col=True)
        sub_data_set_validate = split_data_set_by_operate(data_set_validate,
                                                          feature_index,
                                                          value_key,
                                                          operator.eq,
                                                          delete_col=True)
        # Recursively prune the children first (bottom-up pruning).
        input_tree[feature_name][value] = tree_generator_with_post_pruning(
            sub_tree[value], sub_data_set_train, sub_data_set_validate,
            new_features_list, features_dict, new_is_features_discrete)
    # Back at this node, compare the validation error of keeping the subtree
    # with the error of collapsing it into the majority class of the training
    # samples that reach it.
    error_count_by_tree = get_validation_error_count_by_tree(
        input_tree, data_set_validate, features_list, features_dict,
        is_features_discrete)
    error_count_by_major_class = get_validation_error_count_by_major_class(
        get_most_common_class(data_set_train), data_set_validate)
    # Keep the subtree if it is no worse; otherwise prune it to a leaf.
    if error_count_by_tree <= error_count_by_major_class:
        return input_tree
    return get_most_common_class(data_set_train)

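
# Usage sketch (added; build_post_pruned_tree is a hypothetical convenience
# wrapper, not part of the original module). It illustrates the call pattern
# tree_generator_with_post_pruning expects: unlike the other generators, it
# takes an already grown tree as its first argument, so the tree is built on
# the training split first and then pruned against a held-out validation split.
def build_post_pruned_tree(data_set_train, data_set_validate, features_list,
                           features_dict, is_features_discrete):
    """Illustrative wrapper: grow a full tree, then post-prune it."""
    full_tree = tree_generate_without_pruning(data_set_train, features_list,
                                              features_dict,
                                              is_features_discrete)
    return tree_generator_with_post_pruning(full_tree, data_set_train,
                                            data_set_validate, features_list,
                                            features_dict,
                                            is_features_discrete)
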
def tree_generate_without_pruning(data_set_train, features_list, features_dict,
                                  is_features_discrete):
    """
    Tree generation without pruning.
    :param data_set_train:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # Copy the mutable arguments so deletions below do not leak to the caller.
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]
    # If every sample already belongs to the same class, return that class.
    class_list = [sample[-1] for sample in data_set_train]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # If no features are left, or all samples take the same value on every
    # remaining feature, return the majority class.
    if len(features_list) == 0 or is_all_sample_same(data_set_train):
        return get_most_common_class(data_set_train)
    # Pick the best splitting feature from A (smallest Gini index).
    best_feature_index, best_continuous_feature_value = get_best_feature_gini(
        data_set_train, features_list, is_features_discrete)
    best_feature_name = features_list[best_feature_index]
    if is_features_discrete[best_feature_index] == 1:
        # Discrete feature: it is used up at this node, so remove it from the
        # feature lists before recursing.
        del features_list[best_feature_index]
        del is_features_discrete[best_feature_index]
        tree = {best_feature_name: {}}
        feature_values_list = features_dict[best_feature_name].keys()
        for feature_value in feature_values_list:
            sub_data_set_train = split_data_set_by_operate(data_set_train,
                                                           best_feature_index,
                                                           feature_value,
                                                           operator.eq,
                                                           delete_col=True)
            feature_value_name = features_dict[best_feature_name][feature_value]
            # Empty subset: mark the branch as a leaf labelled with the
            # majority class of the parent data set.
            if len(sub_data_set_train) == 0:
                tree[best_feature_name][feature_value_name] = \
                    get_most_common_class(data_set_train)
            # Non-empty subset: keep recursing.
            else:
                tree[best_feature_name][feature_value_name] = \
                    tree_generate_without_pruning(sub_data_set_train,
                                                  features_list,
                                                  features_dict,
                                                  is_features_discrete)
    else:
        # Continuous feature: unlike a discrete one, it stays available as a
        # splitting feature for descendant nodes, so it is not removed.
        key = best_feature_name + '<=' + ('%.3f' % best_continuous_feature_value)
        tree = {key: {}}
        sub_data_set_le = split_data_set_by_operate(
            data_set_train, best_feature_index, best_continuous_feature_value,
            operator.le, delete_col=False)
        sub_data_set_gt = split_data_set_by_operate(
            data_set_train, best_feature_index, best_continuous_feature_value,
            operator.gt, delete_col=False)
        # '是' / '否' ("yes" / "no") branches for value <= threshold and > threshold.
        tree[key]['是'] = tree_generate_without_pruning(sub_data_set_le,
                                                        features_list,
                                                        features_dict,
                                                        is_features_discrete)
        tree[key]['否'] = tree_generate_without_pruning(sub_data_set_gt,
                                                        features_list,
                                                        features_dict,
                                                        is_features_discrete)
    return tree

def tree_generate_with_pre_pruning(data_set_train, data_set_validate,
                                   features_list, features_dict,
                                   is_features_discrete):
    """
    Tree generation with pre-pruning.
    :param data_set_train:
    :param data_set_validate:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # Copy the mutable arguments so deletions below do not leak to the caller.
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]
    # If every sample already belongs to the same class, return that class.
    class_list = [sample[-1] for sample in data_set_train]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # If no features are left, or all samples take the same value on every
    # remaining feature, return the majority class.
    if len(features_list) == 0 or is_all_sample_same(data_set_train):
        return get_most_common_class(data_set_train)
    # Pick the best splitting feature from A (smallest Gini index).
    best_feature_index, best_continuous_feature_value = get_best_feature_gini(
        data_set_train, features_list, is_features_discrete)
    best_feature_name = features_list[best_feature_index]
    # Pre-pruning: compare the validation accuracy of keeping this node as a
    # majority-class leaf with the accuracy obtained after performing the split.
    accuracy_rate_before_pruning = get_validation_error_before_pruning(
        data_set_train, data_set_validate)
    accuracy_rate_after_pruning = get_validation_error_after_pruning(
        data_set_train, data_set_validate, best_feature_index,
        is_features_discrete[best_feature_index],
        best_continuous_feature_value)
    # Splitting does not improve the validation accuracy: stop and return the
    # majority class as a leaf.
    if accuracy_rate_before_pruning >= accuracy_rate_after_pruning:
        most_common_class = get_most_common_class(data_set_train)
        return most_common_class
    # Splitting improves the validation accuracy: keep growing the tree.
    else:
        if is_features_discrete[best_feature_index] == 1:
            del features_list[best_feature_index]
            del is_features_discrete[best_feature_index]
            tree = {best_feature_name: {}}
            feature_values_list = set(
                [sample[best_feature_index] for sample in data_set_train])
            for feature_value in feature_values_list:
                sub_data_set_train = split_data_set_by_operate(
                    data_set_train, best_feature_index, feature_value,
                    operator.eq, delete_col=True)
                sub_data_set_validate = split_data_set_by_operate(
                    data_set_validate, best_feature_index, feature_value,
                    operator.eq, delete_col=True)
                feature_value_name = features_dict[best_feature_name][
                    feature_value]
                tree[best_feature_name][feature_value_name] = \
                    tree_generate_with_pre_pruning(sub_data_set_train,
                                                   sub_data_set_validate,
                                                   features_list,
                                                   features_dict,
                                                   is_features_discrete)
        else:
            key = best_feature_name + '<=' + (
                '%.3f' % best_continuous_feature_value)
            tree = {key: {}}
            # Split both the training and the validation sets at the threshold.
            sub_data_set_train_le = split_data_set_by_operate(
                data_set_train, best_feature_index,
                best_continuous_feature_value, operator.le, delete_col=False)
            sub_data_set_validate_le = split_data_set_by_operate(
                data_set_validate, best_feature_index,
                best_continuous_feature_value, operator.le, delete_col=False)
            sub_data_set_train_gt = split_data_set_by_operate(
                data_set_train, best_feature_index,
                best_continuous_feature_value, operator.gt, delete_col=False)
            sub_data_set_validate_gt = split_data_set_by_operate(
                data_set_validate, best_feature_index,
                best_continuous_feature_value, operator.gt, delete_col=False)
            # Generate the '是' / '否' ("yes" / "no") child nodes.
            tree[key]['是'] = tree_generate_with_pre_pruning(
                sub_data_set_train_le, sub_data_set_validate_le, features_list,
                features_dict, is_features_discrete)
            tree[key]['否'] = tree_generate_with_pre_pruning(
                sub_data_set_train_gt, sub_data_set_validate_gt, features_list,
                features_dict, is_features_discrete)
        return tree

def tree_generate_with_random_feature_selection(data_set, features_list,
                                                features_dict,
                                                is_features_discrete):
    """
    Generate a decision tree by picking the splitting feature at random
    instead of by an impurity criterion.
    :param data_set:
    :param features_list:
    :param features_dict:
    :param is_features_discrete:
    :return:
    """
    # Copy the mutable arguments so deletions below do not leak to the caller.
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]
    # If every sample already belongs to the same class, return that class.
    samples_class = [sample[-1] for sample in data_set]
    if samples_class.count(samples_class[0]) == len(samples_class):
        return samples_class[0]
    if len(features_list) == 0 or is_all_sample_same(data_set):
        return get_most_common_class(data_set)
    # Choose the splitting feature uniformly at random.
    best_feature_index = random.randint(0, len(features_list) - 1)
    best_feature_name = features_list[best_feature_index]
    if is_features_discrete[best_feature_index] == 1:
        # Discrete feature: it is used up at this node, so remove it from the
        # feature lists before recursing. Without the copies made above, each
        # level of recursion would delete an entry from the caller's list and
        # the sibling branches visited afterwards would see a corrupted list.
        del features_list[best_feature_index]
        del is_features_discrete[best_feature_index]
        tree = {best_feature_name: {}}
        feature_value_set = features_dict[best_feature_name].keys()
        for feature_value in feature_value_set:
            sub_data_set_train = split_data_set_by_operate(data_set,
                                                           best_feature_index,
                                                           feature_value,
                                                           operator.eq,
                                                           delete_col=True)
            feature_value_name = features_dict[best_feature_name][feature_value]
            # Empty subset: mark the branch as a leaf labelled with the
            # majority class of the parent data set.
            if len(sub_data_set_train) == 0:
                tree[best_feature_name][feature_value_name] = \
                    get_most_common_class(data_set)
            # Non-empty subset: keep recursing.
            else:
                tree[best_feature_name][feature_value_name] = \
                    tree_generate_with_random_feature_selection(
                        sub_data_set_train, features_list, features_dict,
                        is_features_discrete)
    else:
        # Continuous feature: sort the distinct values, compute the midpoints
        # of adjacent values, and pick one midpoint at random as the threshold.
        feature_values_mid_value_list = []
        feature_value_set = set(
            [sample[best_feature_index] for sample in data_set])
        feature_value_set = sorted(feature_value_set)
        for i in range(len(feature_value_set) - 1):
            feature_values_mid_value_list.append(
                (feature_value_set[i] + feature_value_set[i + 1]) / 2)
        best_continuous_feature_value = random.choice(
            feature_values_mid_value_list)
        key = best_feature_name + '<=' + ('%.3f' % best_continuous_feature_value)
        tree = {key: {}}
        sub_data_set_le = split_data_set_by_operate(
            data_set, best_feature_index, best_continuous_feature_value,
            operator.le, delete_col=False)
        sub_data_set_gt = split_data_set_by_operate(
            data_set, best_feature_index, best_continuous_feature_value,
            operator.gt, delete_col=False)
        tree[key]['是'] = tree_generate_with_random_feature_selection(
            sub_data_set_le, features_list, features_dict,
            is_features_discrete)
        tree[key]['否'] = tree_generate_with_random_feature_selection(
            sub_data_set_gt, features_list, features_dict,
            is_features_discrete)
    return tree

def get_best_feature(data_set, features, is_features_discrete, ID3_or_C45):
    """
    Find the best splitting feature among `features`.
    :param data_set:
    :param features:
    :param is_features_discrete:
    :param ID3_or_C45: 0 for ID3 (information gain), 1 for C4.5 (gain ratio)
    :return:
    """
    feature_num = len(features)
    base_entropy = calculate_information_entropy(data_set)
    gain_list = []
    gain_ratio_list = []
    continuous_feature_value_list = []
    epsilon = 1e-5
    for i in range(feature_num):
        # Discrete feature: its values already define the subsets, so no split
        # point needs to be searched.
        if is_features_discrete[i] == 1:
            feature_values_list = set([sample[i] for sample in data_set])
            entropy = 0.0
            iv = 0.0
            for feature_value in feature_values_list:
                sub_data_set = split_data_set_by_operate(data_set, i,
                                                         feature_value,
                                                         operator.eq,
                                                         delete_col=True)
                prob = float(len(sub_data_set)) / len(data_set)
                entropy += prob * calculate_information_entropy(sub_data_set)
                iv += prob * log(prob, 2)
            gain = base_entropy - entropy
            # iv accumulated above is negative; epsilon guards against a zero
            # intrinsic value when the feature takes a single value.
            gain_ratio = gain / (-iv + epsilon)
            gain_list.append(gain)
            gain_ratio_list.append(gain_ratio)
            continuous_feature_value_list.append(None)
        # Continuous feature: sort the distinct values, take the midpoint of
        # every pair of adjacent values as a candidate split point, and keep
        # the split point with the largest information gain.
        else:
            continuous_best_gain = 0.0
            continuous_best_feature_value = -1
            continuous_best_gain_ratio = 0.0
            feature_values_list = sorted(set([sample[i] for sample in data_set]))
            feature_values_mid_value_list = []
            for j in range(len(feature_values_list) - 1):
                feature_values_mid_value_list.append(
                    (feature_values_list[j] + feature_values_list[j + 1]) / 2)
            for feature_value in feature_values_mid_value_list:
                sub_data_set_le = split_data_set_by_operate(data_set, i,
                                                            feature_value,
                                                            operator.le,
                                                            delete_col=True)
                sub_data_set_gt = split_data_set_by_operate(data_set, i,
                                                            feature_value,
                                                            operator.gt,
                                                            delete_col=True)
                prob = float(len(sub_data_set_le)) / len(data_set)
                entropy = prob * calculate_information_entropy(sub_data_set_le) + \
                    (1 - prob) * calculate_information_entropy(sub_data_set_gt)
                gain = base_entropy - entropy
                iv = prob * log(prob, 2) + (1 - prob) * log(1 - prob, 2)
                gain_ratio = gain / (-iv)
                if gain > continuous_best_gain:
                    continuous_best_gain = gain
                    continuous_best_feature_value = feature_value
                    continuous_best_gain_ratio = gain_ratio
            gain_list.append(continuous_best_gain)
            gain_ratio_list.append(continuous_best_gain_ratio)
            continuous_feature_value_list.append(continuous_best_feature_value)
    # ID3: simply take the feature with the highest information gain.
    if ID3_or_C45 == 0:
        max_gain_index = gain_list.index(max(gain_list))
        return max_gain_index, continuous_feature_value_list[max_gain_index]
    # C4.5: among the candidate features whose information gain is above
    # average, pick the one with the highest gain ratio.
    elif ID3_or_C45 == 1:
        average_gain = sum(gain_list) / len(gain_list)
        max_gain_ratio = 0
        max_gain_ratio_index = -1
        for i in range(len(gain_list)):
            if gain_list[i] > average_gain and gain_ratio_list[i] > max_gain_ratio:
                max_gain_ratio = gain_ratio_list[i]
                max_gain_ratio_index = i
        # If no feature is strictly above the average gain (e.g. all gains are
        # equal), fall back to the feature with the highest information gain.
        if max_gain_ratio_index == -1:
            max_gain_ratio_index = gain_list.index(max(gain_list))
        return max_gain_ratio_index, continuous_feature_value_list[
            max_gain_ratio_index]
    else:
        raise NameError('ID3_or_C45 should be 0 or 1')

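
# Worked example (added for clarity; _demo_information_entropy is hypothetical
# and is not used by the module). It spells out the quantities combined in
# get_best_feature: H(D) = -sum_k p_k * log2(p_k),
# gain = H(D) - sum_v (|D_v|/|D|) * H(D_v), and gain_ratio = gain / IV with
# IV = -sum_v (|D_v|/|D|) * log2(|D_v|/|D|).
def _demo_information_entropy(labels):
    """Illustrative only: Shannon entropy of a list of class labels."""
    total = len(labels)
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    return -sum((c / total) * log(c / total, 2) for c in counts.values())


# For labels ['yes', 'yes', 'no', 'no'] split perfectly into ['yes', 'yes'] and
# ['no', 'no']: H(D) = 1.0 and the weighted child entropy is 0.0, so gain = 1.0;
# IV of the 50/50 split is also 1.0, giving gain_ratio = 1.0.
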
def tree_generate(data_set, features_list, features_dict, is_features_discrete,
                  ID3_or_C45):
    # Copy the mutable arguments: this function deletes entries from
    # features_list as it recurses, and if every call shared the same list the
    # sibling branches visited after a recursion returned would see a
    # corrupted feature list.
    features_list = features_list[:]
    is_features_discrete = is_features_discrete[:]
    # If every sample already belongs to the same class, return that class.
    class_list = [sample[-1] for sample in data_set]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # If no features are left, or all samples take the same value on every
    # remaining feature, return the majority class.
    if len(features_list) == 0 or is_all_sample_same(data_set):
        return get_most_common_class(data_set)
    # Pick the best splitting feature from A (information gain or gain ratio).
    best_feature_index, best_continuous_feature_value = get_best_feature(
        data_set, features_list, is_features_discrete, ID3_or_C45)
    best_feature_name = features_list[best_feature_index]
    # Discrete feature: it is used up at this node, so remove it before recursing.
    if is_features_discrete[best_feature_index] == 1:
        tree = {best_feature_name: {}}
        del is_features_discrete[best_feature_index]
        del features_list[best_feature_index]
        feature_values_list = features_dict[best_feature_name].keys()
        for feature_value in feature_values_list:
            # Keep growing the tree below this branch.
            sub_data_set = split_data_set_by_operate(data_set,
                                                     best_feature_index,
                                                     feature_value,
                                                     operator.eq,
                                                     delete_col=True)
            feature_value_name = features_dict[best_feature_name][feature_value]
            # Empty subset: mark the branch as a leaf labelled with the
            # majority class of the parent data set.
            if len(sub_data_set) == 0:
                tree[best_feature_name][feature_value_name] = \
                    get_most_common_class(data_set)
            # Non-empty subset: keep recursing.
            else:
                tree[best_feature_name][feature_value_name] = \
                    tree_generate(sub_data_set, features_list, features_dict,
                                  is_features_discrete, ID3_or_C45)
    # Continuous feature: unlike a discrete one, it stays available as a
    # splitting feature for descendant nodes, so it is not removed.
    else:
        key = best_feature_name + '<=' + ('%.3f' % best_continuous_feature_value)
        tree = {key: {}}
        tree[key]['是'] = tree_generate(
            split_data_set_by_operate(data_set, best_feature_index,
                                      best_continuous_feature_value,
                                      operator.le, delete_col=False),
            features_list, features_dict, is_features_discrete, ID3_or_C45)
        tree[key]['否'] = tree_generate(
            split_data_set_by_operate(data_set, best_feature_index,
                                      best_continuous_feature_value,
                                      operator.gt, delete_col=False),
            features_list, features_dict, is_features_discrete, ID3_or_C45)
    return tree

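
# Minimal end-to-end sketch (added; the toy data below is hypothetical and only
# illustrates the data layout these functions appear to expect): each sample is
# a list of encoded feature values with the class label in the last position,
# features_dict maps a feature name to {encoded_value: display_name}, and
# is_features_discrete flags each feature with 1 (discrete) or 0 (continuous).
# It assumes the helper functions referenced above are available in this module.
if __name__ == '__main__':
    demo_features_list = ['color', 'density']
    demo_is_features_discrete = [1, 0]
    demo_features_dict = {'color': {0: 'green', 1: 'black'}}
    demo_data_set = [
        [0, 0.46, 'good'],
        [0, 0.36, 'good'],
        [1, 0.26, 'bad'],
        [1, 0.64, 'bad'],
    ]
    # Grow an ID3 tree (ID3_or_C45=0 selects the plain information-gain rule).
    demo_tree = tree_generate(demo_data_set, demo_features_list,
                              demo_features_dict, demo_is_features_discrete,
                              ID3_or_C45=0)
    print(demo_tree)
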