Python data_describe Examples

Programming Language: Python

Namespace/Package Name: dsct_tools

Method/Function: data_describe

Examples at hotexamples.com: 2

Python data_describe - 2 examples found. These are the top-rated real-world Python examples of dsct_tools.data_describe extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: Dependency_dsct.py Project: zhangyifei1/Discretization

def Chi_Discretization(data, var_name, var_name_target, max_interval=6, binning_method = 'chi2', feature_type = 0):
    """
    Chi-square based discretization of one variable.

    :param data: DataFrame, raw input data
    :param var_name: str, name of the variable to discretize
    :param var_name_target: str, name of the label variable (y)
    :param max_interval: int, maximum number of bins
    :param binning_method: str, binning method: 'chiMerge' or 'chi2'
    :param feature_type: type of the variable to bin
        (0 / falsy: continuous, 1 / truthy: discrete)
    :return: grouping information (group). For a continuous variable this is
        a list of cut points whose first and last entries are pushed slightly
        outwards; for a discrete variable it is the sorted list of bin
        members as produced by the binning helper.
    :raises SystemExit: if ``binning_method`` is not recognised
    """

    # 1. Initialisation: treat every distinct value as its own bin and collect
    #    the per-value label distribution (done by data_describe).
    print("分箱初始化开始：")
    count, _ = data_describe(data, var_name, var_name_target, feature_type)
    print("分箱初始化完成！！！")

    # 2. Chi-square binning
    if binning_method == 'chiMerge':
        group = Chi_Merge1(count, max_interval)
    elif binning_method == 'chi2':
        group = Chi2(count, max_interval)
    else:
        # Raise SystemExit directly rather than calling the interactive
        # ``exit`` helper, which only exists when the ``site`` module is loaded.
        raise SystemExit('无法识别分箱方法')

    # Post-processing: order the members inside each bin (continuous only),
    # then order the bins themselves.
    if not feature_type:
        group = [sorted(ele) for ele in group]
    group.sort()

    if len(group) > max_interval:
        print("warning: 分箱后，%s的箱体个数（%s）与您输入的分箱数量（%s）不符，这是由分组间的相似性太低导致的。如对分箱效果不满意，请更换其他分箱方法" % (
        var_name, len(group), max_interval))

    # 3. Shape the result according to feature_type
    #    (continuous: list of cut points; discrete: list of bin members).
    if not feature_type:
        cut_points = [ele[-1] for ele in group]
        if len(group[0]) != 1:
            # The first bin spans several values: its smallest value becomes
            # the leading cut point.
            cut_points.insert(0, group[0][0])
        group = cut_points

        margin = 0.001

        # Push the first cut point down so the minimum value is included.
        # A relative margin must grow the magnitude of a negative value
        # (multiplying -10 by 0.999 would move it up, not down), and zero
        # needs an absolute margin.
        first = group[0]
        if first == 0:
            group[0] = first - margin
        elif first > 0:
            group[0] = first * (1 - margin)
        else:
            group[0] = first * (1 + margin)

        # Push the last cut point up so the maximum value is included; same
        # sign handling, mirrored.
        last = group[-1]
        if last == 0:
            group[-1] = last + margin
        elif last > 0:
            group[-1] = last * (1 + margin)
        else:
            group[-1] = last * (1 - margin)
    return group

Example #2

Show file

File: Entropy_dsct.py Project: zhangyifei1/Discretization

def Entropy_Discretization(data,
                           var_name,
                           var_name_target,
                           max_interval=6,
                           binning_method='entropy',
                           feature_type=0):
    """
    Entropy based discretization of one variable.

    :param data: DataFrame, raw input data
    :param var_name: str, name of the variable to discretize
    :param var_name_target: str, name of the target (label) variable,
        forwarded to data_describe
    :param max_interval: int, maximum number of bins
    :param binning_method: str, binning method: 'entropy' or 'bestKS'
    :param feature_type: type of the variable to bin
        (0 / falsy: continuous, 1 / truthy: discrete)
    :return: grouping information (group). For a continuous variable this is
        a list of cut points whose first and last entries are pushed slightly
        outwards; for a discrete variable it is the sorted list of bin
        members as produced by the binning helper.
    :raises SystemExit: if ``binning_method`` is not recognised
    """

    # 1. Initialisation: treat every distinct value as its own bin and collect
    #    the per-value label distribution (done by data_describe).
    print("分箱初始化开始：")
    count, _ = data_describe(data, var_name, var_name_target,
                             feature_type)
    print("分箱初始化完成！！！")

    # 2. Decision-tree style binning
    if binning_method in ['entropy', 'bestKS']:
        group = BestKS_dsct(count, max_interval, binning_method)
    else:
        # Raise SystemExit directly rather than calling the interactive
        # ``exit`` helper, which only exists when the ``site`` module is loaded.
        raise SystemExit('无法识别分箱方法')
    group.sort()

    # 3. Shape the result according to feature_type
    #    (continuous: list of cut points; discrete: list of bin members).
    if not feature_type:
        cut_points = [ele[-1] for ele in group]
        if len(group[0]) != 1:
            # The first bin spans several values: its smallest value becomes
            # the leading cut point.
            cut_points.insert(0, group[0][0])
        group = cut_points

        margin = 0.001

        # Push the first cut point down so the minimum value is included.
        # A relative margin must grow the magnitude of a negative value
        # (multiplying -10 by 0.999 would move it up, not down), and zero
        # needs an absolute margin.
        first = group[0]
        if first == 0:
            group[0] = first - margin
        elif first > 0:
            group[0] = first * (1 - margin)
        else:
            group[0] = first * (1 + margin)

        # Push the last cut point up so the maximum value is included; same
        # sign handling, mirrored.
        last = group[-1]
        if last == 0:
            group[-1] = last + margin
        elif last > 0:
            group[-1] = last * (1 + margin)
        else:
            group[-1] = last * (1 - margin)
    return group