def gen_stats_value_ftr51(stats_name, size='400d'):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 , 7d, 15d, 30d, 45d
    :return:
    """

    feature_name = '{}_ftr51_by_{}'.format(stats_name, size)
    # 0 read the data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data],
                                axis=0,
                                ignore_index=True)
    # 1 compute the per-person statistic
    print('1 computing stats value of ftr51 by {}'.format(size))
    ftr51_stats_value_df = train_test_data[[
        'PERSONID', 'CREATETIME', 'FTR51'
    ]].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_by_size(
            df_person, stats_name, size)).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df,
                                        on=['PERSONID'],
                                        how='left')
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51("{}", "{}")'.format(
        stats_name, size)
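
# `compute_stats_value_FTR51_by_size` is defined elsewhere in this project.
# A minimal sketch of the expected contract (hypothetical, assuming FTR51
# holds comma-separated drug codes, `size` is a pandas offset like '30d',
# and pd/np are imported as elsewhere in this file):
def compute_stats_value_FTR51_by_size_sketch(df_person, stats_name, size):
    s = df_person.set_index(pd.to_datetime(df_person['CREATETIME']))['FTR51']
    # number of drug codes per time bin
    counts = s.resample(size).apply(lambda x: sum(len(v.split(',')) for v in x))
    return getattr(counts, stats_name)()  # e.g. stats_name = 'mean'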
def gen_stats_cost_by_non_zero_group(cost, stats_name='mean_mean', size='7d', recompute=False):
    """
    对诊疗次数的统计, 窗口可以是月或全局, 颗粒度天单位
    :param cost: str, 项目名称
    :param stats_name: str, 统计名    :param df_person:
    :param size: str, 下采样时间间隔, 类似'xd',粒度为x天, 或 '1t'粒度为每次
    :param recompute: bool,是否重新计算该特征
    :return:
    """
    # 1 feature name
    # valid stats_name values:
    # ['len_max', 'len_max_ratio', 'len_mean', 'len_std', 'len_count',
    #  'sum_max', 'sum_max_ratio', 'sum_mean', 'sum_std',
    #  'mean_max', 'mean_std', 'mean_mean']
    feature_name = '{}_{}_by_non_zero_group_{}'.format(stats_name, cost, size)
    if IsAbsense(feature_name) | recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read the data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
        train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
        # 2.3 compute the stats df
        stats_df = train_test_data[['PERSONID', 'CREATETIME', cost]].groupby('PERSONID').apply(
            lambda df_person: stats_cost_by_non_zero_group(df_person, cost, stats_name, size)).to_frame(feature_name).reset_index()
        # 2.4 merge
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'], how='left')
        # 2.5 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_stats_cost_by_non_zero_group("{}", "{}", "{}")'.format(cost, stats_name, size)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_stats_cost_by_non_zero_group("{}", "{}", "{}")'.format(cost, stats_name, size)
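
# `stats_cost_by_non_zero_group` is defined elsewhere. A hypothetical sketch,
# assuming it resamples one person's cost at granularity `size`, splits the
# series into runs of consecutive non-zero bins, and reduces a per-run
# statistic named like 'mean_mean' (mean over runs of the per-run means):
def stats_cost_by_non_zero_group_sketch(df_person, cost, stats_name, size):
    s = (df_person.set_index(pd.to_datetime(df_person['CREATETIME']))[cost]
                  .resample(size).sum())
    run_id = (s == 0).cumsum()[s > 0]  # consecutive non-zero bins share an id
    inner, outer = stats_name.split('_')[:2]
    per_run = getattr(s[s > 0].groupby(run_id),
                      'count' if inner == 'len' else inner)()
    return getattr(per_run, outer)() if len(per_run) else np.nan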
def gen_cat_vector_from_ftr51(kinds):
    """
    为train_test_data
    :param kinds: str, A, D, AB
    :return:
    """
    print('compute {} vector for train_test_data'.format(kinds))
    matrix_name = '{}_vector_from_ftr51'.format(kinds)
    # 0 read the data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_data = pd.concat([train_data, test_data],
                                axis=0,
                                ignore_index=True)
    # build a per-record category count dict
    train_test_data['cat_count_dict'] = train_test_data['FTR51'].map(
        lambda ftr51s: compute_cat_count_dict_from_ftr51s(ftr51s, kinds))
    # DictVectorizer aligns the dict keys across records
    v = DictVectorizer()
    # build the count matrix
    cat_sparse_matrix = v.fit_transform(
        train_test_data['cat_count_dict'].values)
    sparse.save_npz(get_path() + 'Data/Feature/{}.npz'.format(matrix_name),
                    cat_sparse_matrix)

    return matrix_name, 'gen_cat_vector_from_ftr51("{}")'.format(kinds)
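
# `compute_cat_count_dict_from_ftr51s` is defined elsewhere. A hypothetical
# sketch, assuming `ftr51s` is a comma-separated string of codes such as
# 'A12,B1,B1,E3' and `kinds` selects codes by their leading letter:
def compute_cat_count_dict_from_ftr51s_sketch(ftr51s, kinds):
    from collections import Counter
    return dict(Counter(c for c in ftr51s.split(',') if c and c[0] in kinds))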
def gen_stats_count(stats_name,
                    month='global',
                    size='1d',
                    non_zero=True,
                    recompute=False):
    """
    对诊疗次数的统计, 窗口可以是月或全局, 颗粒度天单位
    :param stats_name: str, 统计名
    :param size: str, 下采样时间间隔, 类似'xd',粒度为x天, 或 '1t'粒度为每次
    :param month: str, 需要统计的时间窗口
    :param non_zero: bool, 只统计非0的时间颗粒
    :param recompute: bool,是否重新计算该特征
    :return:
    """
    # 1
    feature_name = '{}_count_in_{}_by_{}_{}'.format(stats_name, month, size,
                                                    non_zero)

    if IsAbsense(feature_name) | recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read the data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id],
                                  axis=0,
                                  ignore_index=True)
        # 2.2 select the data to compute over
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        train_test_data = pd.concat([train_data, test_data],
                                    axis=0,
                                    ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 compute the count df
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', 'count'
        ]].groupby('PERSONID').apply(lambda df_person: stats_count_by_size(
            df_person, stats_name, size, non_zero)).to_frame(
                feature_name).reset_index()
        # 2.4 merge
        train_test_id = train_test_id.merge(stats_df,
                                            on=['PERSONID'],
                                            how='left')
        count_stats_fillna_by_stats_name(train_test_id, feature_name,
                                         stats_name)
        # 2.5 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_stats_count("{}", "{}", "{}", {})'.format(
            stats_name, month, size, non_zero)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_stats_count("{}", "{}", "{}", {})'.format(
            stats_name, month, size, non_zero)
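
# `stats_count_by_size` is defined elsewhere. A hypothetical sketch, assuming
# '1t' means one bin per visit and any other `size` is a pandas offset:
def stats_count_by_size_sketch(df_person, stats_name, size, non_zero):
    s = df_person.set_index(pd.to_datetime(df_person['CREATETIME']))['count']
    if size != '1t':
        s = s.resample(size).sum()
    if non_zero:
        s = s[s > 0]
    return getattr(s, stats_name)() if len(s) else np.nan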
def run(config):
    """
    :param config: dict, 配置字典
    :return:
    """
    # 1 根据配置合并特征
    Xtrain, Ytrain, Xtest = CombineFeature(config['feature_names'])

    # ------------------------ dump feature matrices with PERSONID for inspection
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    Xtrain['PERSONID'] = train_id['PERSONID']
    Ytrain['PERSONID'] = train_id['PERSONID']
    Xtest['PERSONID'] = test_id['PERSONID']
    Xtrain.to_csv('Xtrain_xiao.csv', index=False)
    Ytrain.to_csv('Ytrain_xiao.csv', index=False)
    Xtest.to_csv('Xtest_xiao.csv', index=False)

    Xtrain.drop(['PERSONID'], axis=1, inplace=True)
    Ytrain.drop(['PERSONID'], axis=1, inplace=True)
    Xtest.drop(['PERSONID'], axis=1, inplace=True)
    # ------------------------

    # 2 initialize the model according to the config
    model = InitModel(Xtrain, Ytrain, Xtest, config)
    # 3 offline validation
    model.offline_validate()
    # 4 online prediction
    model.online_predict()
    # 5 save the experiment result
    if config['save_experiment_result']:
        model.save_experiment_result()
    # 6 return the offline validation score and show the prediction result

    # save the trained boosters
    for i, booster in enumerate(model.booster_offline_list):
        booster.save_model('xgb{}.m'.format(i))

    # reload the saved models and average their predictions
    feature_names = list(Xtest.columns)
    xgb_test = xgb.DMatrix(Xtest[feature_names].values,
                           feature_names=feature_names)

    submission_list = []
    for i, best_iter in enumerate([161, 292, 160, 246, 269]):
        load_model = xgb.Booster(model_file='xgb{}.m'.format(i))
        submission_list.append(
            load_model.predict(xgb_test, ntree_limit=best_iter))
    submission = np.mean(submission_list, axis=0)
    print(np.sum(np.abs(model.submission_online - submission)))

    return model.mean_score_offline, model.submission_online, model.fold_results
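
# A minimal, hypothetical config for run(); only 'feature_names' and
# 'save_experiment_result' are read directly above, any model settings are
# consumed by InitModel():
example_config = {
    'feature_names': ['mean_count_in_global_by_1d_True'],  # hypothetical list
    'save_experiment_result': False,
}
# mean_score_offline, submission_online, fold_results = run(example_config)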
def gen_stats_vector_from_cat_vector(stats_name, size, kinds):
    """
    为了train_test_id
    :param stats_name: str, 统计名字
    :param size: str, 时间粒度
    :param kinds: str, 类别变量种类
    :return:
    """

    # 0 load the cat matrix of train_test_data
    print('gen_stats_vector_from_cat_vector("{}", "{}", "{}")'.format(
        stats_name, size, kinds))
    input_matrix_name = '{}_vector_from_ftr51'.format(kinds)
    input_sparse_matrix = sparse.load_npz(
        get_path() +
        'Data/Feature/{}.npz'.format(input_matrix_name)).toarray()
    print('The shape of matrix is ( {}, {}) '.format(
        input_sparse_matrix.shape[0], input_sparse_matrix.shape[1]))
    # 1 read the base data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data],
                                axis=0,
                                ignore_index=True)
    # 2 build a pd.DataFrame for grouped statistics
    input_sparse_df = pd.DataFrame(data=input_sparse_matrix)
    print('2')
    del input_sparse_matrix
    gc.collect()
    input_sparse_df['PERSONID'] = train_test_data['PERSONID']
    input_sparse_df['CREATETIME'] = train_test_data['CREATETIME']

    # 3 compute the per-person stats dicts
    output_stats_df = input_sparse_df.groupby('PERSONID').apply(
        lambda df_person: compute_stats_dict_from_cat_matrix(
            df_person, stats_name, size)).to_frame('stats_dict').reset_index()
    print('3')
    train_test_id = train_test_id.merge(output_stats_df,
                                        on=['PERSONID'],
                                        how='left')
    # 4 vectorize the stats dicts into a sparse matrix and save
    v = DictVectorizer()
    stats_sparse_matrix = v.fit_transform(train_test_id['stats_dict'].values)
    print('4')
    stats_matrix_name = '{}_{}_vector_by_{}'.format(stats_name, kinds, size)
    sparse.save_npz(
        get_path() + 'Data/Feature/{}.npz'.format(stats_matrix_name),
        stats_sparse_matrix)
    return stats_matrix_name, 'gen_stats_vector_from_cat_vector("{}", "{}", "{}")'.format(
        stats_name, size, kinds)
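
# `compute_stats_dict_from_cat_matrix` is defined elsewhere. A hypothetical
# sketch: resample one person's per-category counts at granularity `size`,
# reduce every column with the named statistic, and return a dict that
# DictVectorizer can consume:
def compute_stats_dict_from_cat_matrix_sketch(df_person, stats_name, size):
    cols = df_person.columns.drop(['PERSONID', 'CREATETIME'])
    binned = (df_person.set_index(pd.to_datetime(df_person['CREATETIME']))[cols]
                       .resample(size).sum())
    return getattr(binned, stats_name)().to_dict()  # e.g. stats_name = 'max'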
def gen_rolling_stats_count(size, stats_name='sumratio2max', recompute=False):
    """
    对诊疗次数进行滑窗统计, 窗口可以是月或全局, 颗粒度天单位
    :param stats_name: str, 统计方法
    :param size: str, 下采样时间间隔, 类似'xd',粒度为x天,
    :param recompute: bool,是否重新计算该特征
    :return:
    """
    # 1
    feature_name = 'rolling_{}_count_{}'.format(stats_name, size)

    if IsAbsense(feature_name) | recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read the data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id],
                                  axis=0,
                                  ignore_index=True)
        train_test_data = pd.concat([train_data, test_data],
                                    axis=0,
                                    ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 compute the count df
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', 'count'
        ]].groupby('PERSONID').apply(lambda df_person: rolling_stats_count(
            df_person, stats_name, size)).to_frame(feature_name).reset_index()
        # 2.4 merge
        train_test_id = train_test_id.merge(stats_df,
                                            on=['PERSONID'],
                                            how='left')
        # 2.5 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_rolling_stats_count("{}", "{}")'.format(
            size, stats_name)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_rolling_stats_count("{}", "{}")'.format(
            size, stats_name)
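
# `rolling_stats_count` is defined elsewhere. A hypothetical sketch for
# stats_name='sumratio2max', read here as the largest rolling-window visit
# sum divided by the person's total visit count:
def rolling_stats_count_sketch(df_person, stats_name, size):
    s = (df_person.set_index(pd.to_datetime(df_person['CREATETIME']))['count']
                  .resample('1d').sum())
    rolled = s.rolling(size).sum()
    if stats_name == 'sumratio2max':
        return rolled.max() / s.sum() if s.sum() else np.nan
    return getattr(rolled, stats_name)()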
def gen_stats_vector_ftr51(stats_name, size='7d', non_zero=False):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, 统计是否非0
    :return:
    """
    assert stats_name in [
        'sum', 'sum_ratio', 'max', 'max_ratio', 'mean', 'std'
    ]
    # non_zero only makes sense for 'mean' and 'std'
    mask = (stats_name in ['sum', 'sum_ratio', 'max', 'max_ratio']) & non_zero
    assert not mask

    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)
    # 0 read the data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data],
                                axis=0,
                                ignore_index=True)
    # 1 compute the per-person stats dicts
    print('1 computing stats dict by {}'.format(size))
    ftr51_stats_dict_df = train_test_data[[
        'PERSONID', 'CREATETIME', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: stats_FTR51_by_size(
        df_person, stats_name, size, non_zero)).to_frame(
            'stats_dict').reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_dict_df,
                                        on=['PERSONID'],
                                        how='left')
    v = DictVectorizer()
    # 2 vectorize the stats dicts
    print('2 computing stats vector')
    ftr51_stats_sparse_matrix = v.fit_transform(
        train_test_id['stats_dict'].values)
    joblib.dump(v, 'v_{}_{}.m'.format(stats_name, size))
    sparse.save_npz(get_path() + 'Data/Feature/{}.npz'.format(matrix_name),
                    ftr51_stats_sparse_matrix)

    return matrix_name, 'gen_stats_vector_ftr51("{}", "{}", {})'.format(
        stats_name, size, non_zero)
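
# Usage sketch: the saved matrix can be loaded back with scipy.sparse, the
# same way gen_stats_vector_from_cat_vector() does above:
# matrix_name, cmd = gen_stats_vector_ftr51('sum', size='7d')
# X_ftr51 = sparse.load_npz(get_path() + 'Data/Feature/{}.npz'.format(matrix_name))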
def gen_(month, recompute=False):
    """
    Unfinished template for a month-windowed feature generator: feature_name
    is left empty and the actual feature computation is missing.
    :param month: str, time window to compute over
    :param recompute: bool, whether to recompute the feature
    """
    # 1
    feature_name = ''


    if IsAbsense(feature_name) | recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read the data
        train_id, test_id, train_data, test_data, Ytrain = ReadData(Ytrain=True)
        train_id['LABEL'] = Ytrain['LABEL'].values
        train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
        # 2.2 select the data to compute over
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        # fill 0 for persons with no records in this month
        train_id[feature_name] = train_id[feature_name].fillna(0)
        test_id[feature_name] = test_id[feature_name].fillna(0)
        # save the feature
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_("{}")'.format(month)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_("{}")'.format(month)
def gen_stats_value_ftr51_in_month(month='month3', stats_name='count_ratio_range'):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 , 7d, 15d, 30d, 45d
    :return:
    """
    # ['nunique', 'nunique_ratio', 'len', 'count_std', 'count_max', 'count_range', 'count_ratio_std', 'count_ratio_max', 'count_ratio_range']
    feature_name = '{}_ftr51_in_{}'.format(stats_name, month)
    # 0 read the data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_data, test_data = SelectDataByMonth(train_data, test_data, month)
    train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # 1 compute the per-person statistic
    print('1 computing stats value of ftr51 in {}'.format(month))
    ftr51_stats_value_df = train_test_data[['PERSONID', 'CREATETIME', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_in_month(df_person, stats_name)).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df, on=['PERSONID'], how='left')
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51_in_month("{}", "{}")'.format(month, stats_name)
def gen_fraud_ratio_feature(kinds='B'):
    """
    :param kinds: str, 目标编码的 字符, 可以是 ABCDE 或其组合
    :return:
    """
    # 0 read the data
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person counts
    df_cat_person_count = train_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person fraud counts
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- subtle bug:
    # a non-fraud person has no fraud records at all, so repair the
    # fraud dict to hold an explicit 0 for every category the person used
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(lambda x: repair_fraud_dict_person(x), axis=1)
    # ---------------------------------------- end subtle bug fix
    # 3 global counts
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global fraud counts
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(
        ftr51s_all_fraud, kinds)
    fraud_dict_all = {
        key: fraud_dict_all.get(key, 0)
        for key in count_dict_all.keys()
    }
    # 5 broadcast the global dicts to every row
    train_id['count_dict_all'] = [
        count_dict_all for _ in range(train_id.shape[0])
    ]
    train_id['fraud_dict_all'] = [
        fraud_dict_all for _ in range(train_id.shape[0])
    ]
    # 6 oob dict
    train_id['count_dict_oob'] = train_id[[
        'count_dict_all', 'count_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']),
        axis=1)

    train_id['fraud_dict_oob'] = train_id[[
        'fraud_dict_all', 'fraud_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']),
        axis=1)
    # 7 per-category fraud ratio dict
    # train
    train_id['cat_fraud_ratio_dict_oob'] = train_id[[
        'count_dict_oob', 'fraud_dict_oob'
    ]].apply(lambda s: division_dict(s['count_dict_oob'], s['fraud_dict_oob']),
             axis=1)
    # test
    cat_fraud_ratio_dict_all = division_dict(count_dict_all, fraud_dict_all)
    test_id['cat_fraud_ratio_dict_oob'] = [
        cat_fraud_ratio_dict_all for _ in range(test_id.shape[0])
    ]
    count_dict_person_test = test_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test,
                            on=['PERSONID'],
                            how='left')
    # use .get instead of setdefault so the shared global dict is not mutated row by row
    test_id['cat_fraud_ratio_dict_oob'] = test_id.apply(lambda x: {
        key: x['cat_fraud_ratio_dict_oob'].get(key, 0)
        for key in x['count_dict_person'].keys()
    },
                                                        axis=1)

    # build per-person features from the per-category fraud ratios
    # 8 max_fraud_ratio feature
    train_id['max_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())
    test_id['max_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())

    # 9 sum_fraud_ratio feature
    train_id['sum_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())
    test_id['sum_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())

    # 10 mean_fraud_ratio feature
    train_id['mean_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())
    test_id['mean_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())

    # 11 save the features and check their distributions
    for feat in ['max_fraud_ratio', 'sum_fraud_ratio', 'mean_fraud_ratio']:
        SaveFeature(train_id, test_id, feat)
        IsDifferentDistribution(feat)
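
# `subtract_dict` and `division_dict` are defined elsewhere. Hypothetical
# sketches matching how they are used above: out-of-bag counts are the global
# counts minus the person's own, and the fraud ratio is fraud / count per key:
def subtract_dict_sketch(d_all, d_person):
    return {k: d_all.get(k, 0) - d_person.get(k, 0) for k in d_all}

def division_dict_sketch(count_dict, fraud_dict):
    return {k: fraud_dict.get(k, 0) / count_dict[k]
            for k in count_dict if count_dict[k] > 0}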
import matplotlib.pyplot as plt
import numpy as np
import sys
from sklearn import svm
from utils.ReadData import *
from utils.Calculations import *

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Format: python GaussianSVM.py TrainData TestData")
        exit(0)

    # Read Data
    TrainX, TrainY = ReadData(sys.argv[1])
    TestX, TestY = ReadData(sys.argv[2])

    # Transform Class(Digit)
    TrainY = TransformData(TrainY, 0)
    TestY = TransformData(TestY, 0)

    # Start Training~~
    logC = [-2, -1, 0, 1, 2]
    TrackDis = []
    for i in logC:
        C = 10**i
        GaussianSVM = svm.SVC(kernel="rbf", gamma=80, C=C)
        GaussianSVM.fit(TrainX, TrainY)
        W = TrainY[GaussianSVM.support_] * GaussianSVM.dual_coef_[0]
        TrackDis.append(cal_Distance(W))

    # Plot the graph
    plt.plot(logC, TrackDis)
    plt.xlabel("log10(C)")
    plt.ylabel("Distance")
    plt.show()
import sys

from sklearn.linear_model import Ridge
from utils.Calculation import *

def RidgeRegression(X, Y, Lambda = 1):
	W = CalPseudoinverse(X, Lambda) @ X.T @ Y
	return W


if __name__ == "__main__":
	if len(sys.argv) != 3:
		print("Format: python RidgeRegression.py DataPath lambda")
		exit(0)

	Lambda = float(sys.argv[2])

	# Read Data
	X, Y = ReadData(sys.argv[1])

	# Cut train and test data
	TrainX, TestX = X[:400], X[400:]
	TrainY, TestY = Y[:400], Y[400:]

	# Run RidgeRegression
	W = RidgeRegression(TrainX, TrainY, Lambda)

	Clf = Ridge(alpha = Lambda)
	Clf.fit(TrainX, TrainY)

	# Calculate error
	E_in = CalError(W, TrainX, TrainY)
	E_out = CalError(W, TestX, TestY)
	print("My E_in:", E_in)
def gen_fraud_ratio_feature(kinds='E', stats_name='fraud_ratio_mean_weight'):
    """
    计算一个人所有的cat, 计算cat oob 的count, fraud, 例如某欺诈用户如果B1一次记录出现两次,则B1 fraud +2, count +2,
    利用count, fraud 计算统计值
    :param kinds: str, 目标编码的 字符, 可以是 ABCDE 或其组合
    :return:
    """
    feature_name = '{}_{}'.format(stats_name, kinds)
    print('computing feature {}'.format(feature_name))
    # 0 read the data
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person counts
    df_cat_person_count = train_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person fraud counts
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- subtle bug:
    # a non-fraud person has no fraud records at all, so repair the
    # fraud dict to hold an explicit 0 for every category the person used
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(lambda x: repair_fraud_dict_person(x), axis=1)
    # ---------------------------------------- end subtle bug fix
    # 3 global counts
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global fraud counts
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(
        ftr51s_all_fraud, kinds)
    fraud_dict_all = {
        key: fraud_dict_all.get(key, 0)
        for key in count_dict_all.keys()
    }
    # 5 broadcast the global dicts to every row
    train_id['count_dict_all'] = [
        count_dict_all for _ in range(train_id.shape[0])
    ]
    train_id['fraud_dict_all'] = [
        fraud_dict_all for _ in range(train_id.shape[0])
    ]

    # 6 oob dict
    train_id['count_dict_oob'] = train_id[[
        'count_dict_all', 'count_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']),
        axis=1)

    train_id['fraud_dict_oob'] = train_id[[
        'fraud_dict_all', 'fraud_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']),
        axis=1)

    count_dict_person_test = test_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test,
                            on=['PERSONID'],
                            how='left')
    test_id['fraud_dict_oob'] = [
        fraud_dict_all for _ in range(test_id.shape[0])
    ]
    test_id['count_dict_oob'] = [
        count_dict_all for _ in range(test_id.shape[0])
    ]

    # use .get instead of setdefault so the shared global dicts are not mutated row by row
    test_id['count_dict_oob'] = test_id.apply(lambda x: {
        key: x['count_dict_oob'].get(key, 0)
        for key in x['count_dict_person'].keys()
    },
                                              axis=1)
    test_id['fraud_dict_oob'] = test_id.apply(lambda x: {
        key: x['fraud_dict_oob'].get(key, 0)
        for key in x['count_dict_person'].keys()
    },
                                              axis=1)

    # compute the statistic feature from the oob dicts

    train_id[feature_name] = train_id.apply(
        lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    test_id[feature_name] = test_id.apply(
        lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    SaveFeature(train_id, test_id, feature_name)
    IsDifferentDistribution(feature_name)
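
# `stats_by_oob_dict` is defined elsewhere. A hypothetical sketch for
# stats_name='fraud_ratio_mean_weight': out-of-bag fraud ratios per category,
# averaged with the person's own usage counts as weights:
def stats_by_oob_dict_sketch(s, stats_name):
    ratios, weights = [], []
    for key, w in s['count_dict_person'].items():
        cnt = s['count_dict_oob'].get(key, 0)
        ratios.append(s['fraud_dict_oob'].get(key, 0) / cnt if cnt else 0.0)
        weights.append(w)
    if stats_name == 'fraud_ratio_mean_weight' and sum(weights):
        return np.average(ratios, weights=weights)
    return np.nan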
import sys

# Reconstructed head of the truncated snippet; ReadData, CART, TreePrediction
# and CalculateError are assumed to come from a utils module as in the
# neighboring examples.
def PrintTree(root):
	if root is None:
		return

	print(root.feature, root.theta, root.isleaf, root.label)

	PrintTree(root.left)
	PrintTree(root.right)


if __name__ == "__main__":
	if len(sys.argv) != 3:
		print("Format: python CARTDecisionTree.py TrainData TestData")
		exit(0)

	# Read in data
	TrainFile, TestFile = sys.argv[1:3]
	TrainData = ReadData(TrainFile)
	TestData = ReadData(TestFile)

	# Train CART decision tree
	CARTDTree = CART(TrainData)

	# Predict train data
	Prediction = TreePrediction(CARTDTree, TrainData)
	Ein = CalculateError(TrainData, Prediction)
	print("Ein:", Ein)

	# Predict test data
	Prediction = TreePrediction(CARTDTree, TestData)
	Eout = CalculateError(TestData, Prediction)
	print("Eout:", Eout)
import os
import numpy as np
import pandas as pd
import settings

from utils import ReadData, Plot, Enhance, RunLengthEncoder, ImageSegment
from keras.callbacks import EarlyStopping, ModelCheckpoint
from skimage.transform import resize
from skimage.util import random_noise
import sys

model_name = 'model-dsbowl-2018.h5'

settings.init()

data = ReadData()
data.train_data()
data.test_data()

# get train and test data
X_train = data.X_train
Y_train = data.Y_train
X_test = data.X_test
test_sizes = data.test_sizes

# enhance train data
enhanced = Enhance(X_train, Y_train)
enhanced.enhance()

X_train = enhanced.X_out
Y_train = enhanced.Y_out