def gen_stats_cost_by_non_zero_group(cost, stats_name='mean_mean', size='7d', recompute=False):
    """
    Statistics of a cost column over non-zero time groups, at day granularity.
    :param cost: str, cost column name
    :param stats_name: str, name of the statistic
    :param size: str, resampling interval: 'xd' for x-day bins, or '1t' for per-visit
    :param recompute: bool, whether to recompute the feature
    :return: (feature_name, call string that reproduces this feature)
    """
    # supported stats_name values:
    # ['len_max', 'len_max_ratio', 'len_mean', 'len_std', 'len_count',
    #  'sum_max', 'sum_max_ratio', 'sum_mean', 'sum_std',
    #  'mean_max', 'mean_std', 'mean_mean']
    # 1 feature name
    feature_name = '{}_{}_by_non_zero_group_{}'.format(stats_name, cost, size)
    if IsAbsense(feature_name) or recompute:
        # 2 compute the feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
        train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
        # 2.3 compute the per-person stats frame
        stats_df = train_test_data[['PERSONID', 'CREATETIME', cost]].groupby('PERSONID').apply(
            lambda df_person: stats_cost_by_non_zero_group(df_person, cost, stats_name, size)
        ).to_frame(feature_name).reset_index()
        # 2.4 merge back onto the id frame
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'], how='left')
        # 2.5 save the feature: the first 15000 rows are train, the rest are test
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    return feature_name, 'gen_stats_cost_by_non_zero_group("{}", "{}", "{}")'.format(cost, stats_name, size)
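# A minimal usage sketch (an illustration, not part of the original pipeline):
# each generator returns (feature_name, call_string), which suggests the pairs
# are collected into a replayable log of how every feature was produced. The
# cost column names below are hypothetical placeholders.
def demo_collect_feature_log():
    feature_log = []
    for cost in ['FTR1', 'FTR2']:  # hypothetical cost columns
        name, call = gen_stats_cost_by_non_zero_group(cost, stats_name='mean_mean', size='7d')
        feature_log.append({'feature': name, 'call': call})
    return feature_log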
def gen_stats_value_ftr51(stats_name, size='400d'):
    """
    :param stats_name: str, name of the statistic computed over drug counts
    :param size: str, time granularity of the statistic: 7d, 15d, 30d, 45d
    :return: (feature_name, call string that reproduces this feature)
    """
    feature_name = '{}_ftr51_by_{}'.format(stats_name, size)
    # 0 read data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # 1 compute the per-person stats
    print('1 computing stats value of ftr51 by {}'.format(size))
    ftr51_stats_value_df = train_test_data[['PERSONID', 'CREATETIME', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_by_size(df_person, stats_name, size)
    ).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df, on=['PERSONID'], how='left')
    # 2 save the feature
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51("{}", "{}")'.format(stats_name, size)
def gen_time_feature(stats_name, version='v1', kind='train', agg='mean'):
    """
    :param stats_name: str, name of the time statistic
    :param version: str, data version
    :param kind: str, 'train' or 'test'
    :param agg: str, aggregation applied on top of the statistic
    :return: list of saved feature names
    """
    # 0 values to aggregate
    values = ['SBP', 'DBP', 'HEART_RATE_TIMES', 'GLU', 'HEIGHT', 'WEIGHT', 'BMI']
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 compute a stats dict per label row, then vectorize the dicts into a matrix
    labels['stats_dict'] = labels.apply(
        lambda label: compute_time_feature_dict(filter_hist_data(label, followup), values, stats_name, agg),
        axis=1)
    v = DictVectorizer()
    stats_matrix = v.fit_transform(labels['stats_dict'].values).toarray()
    value_names = v.get_feature_names()
    feature_names = ['{}_{}_{}'.format(value_name, stats_name, agg) for value_name in value_names]
    stats_df = pd.DataFrame(data=stats_matrix, columns=feature_names)
    labels = pd.concat([labels, stats_df], axis=1)
    # 3 save the features
    for feat in feature_names:
        SaveFeature(labels, feat, version, kind)
    return feature_names
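# Standalone sketch of the DictVectorizer step above: a list of per-row stats
# dicts becomes a dense matrix whose columns are the union of all keys, with
# missing keys filled as 0 -- which is why every label row ends up with the
# same feature columns regardless of which vitals were observed.
from sklearn.feature_extraction import DictVectorizer

def demo_dict_vectorizer():
    dicts = [{'SBP': 120.0, 'DBP': 80.0}, {'SBP': 135.0, 'GLU': 5.4}]
    v = DictVectorizer()
    matrix = v.fit_transform(dicts).toarray()  # shape (2, 3), zeros where a key is absent
    return v.get_feature_names(), matrix  # get_feature_names_out() on newer scikit-learn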
def gen_stats_count(stats_name, month='global', size='1d', non_zero=True, recompute=False):
    """
    Statistics of visit counts; the window can be a month or global, at day granularity.
    :param stats_name: str, name of the statistic
    :param size: str, resampling interval: 'xd' for x-day bins, or '1t' for per-visit
    :param month: str, time window to restrict the statistic to
    :param non_zero: bool, only count non-empty time bins
    :param recompute: bool, whether to recompute the feature
    :return: (feature_name, call string that reproduces this feature)
    """
    # 1 feature name
    feature_name = '{}_count_in_{}_by_{}_{}'.format(stats_name, month, size, non_zero)
    if IsAbsense(feature_name) or recompute:
        # 2 compute the feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
        # 2.2 restrict the data to the requested window
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 compute the per-person count stats
        stats_df = train_test_data[['PERSONID', 'CREATETIME', 'count']].groupby('PERSONID').apply(
            lambda df_person: stats_count_by_size(df_person, stats_name, size, non_zero)
        ).to_frame(feature_name).reset_index()
        # 2.4 merge back onto the id frame, filling NAs per statistic
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'], how='left')
        count_stats_fillna_by_stats_name(train_test_id, feature_name, stats_name)
        # 2.5 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    return feature_name, 'gen_stats_count("{}", "{}", "{}", {})'.format(stats_name, month, size, non_zero)
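# A hedged sketch of what stats_count_by_size presumably does (the real helper
# is defined elsewhere; this is an assumption for illustration): resample one
# person's records into `size` bins on CREATETIME, count records per bin,
# optionally drop empty bins, then apply the named statistic.
def demo_stats_count_by_size(df_person, stats_name='mean', size='7d', non_zero=True):
    # assumes CREATETIME is already datetime64
    counts = df_person.set_index('CREATETIME')['count'].resample(size).sum()
    if non_zero:
        counts = counts[counts > 0]
    return getattr(counts, stats_name)()  # e.g. counts.mean(), counts.max(), counts.std()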
def gen_base_feature2(version, kind='train'):
    """Age at first follow-up, time from hypertension confirmation to the first follow-up,
    and time from the first follow-up to the TimePoint."""
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 age (in days) at the first follow-up
    labels = labels.merge(followup.groupby('ID')['DATE_OF_BIRTH'].max().reset_index(), on='ID', how='left')
    first_followup_time_df = followup.groupby('ID')['FOLLOWUP_DATE'].min().to_frame('first_followup_time').reset_index()
    labels = labels.merge(first_followup_time_df, on='ID', how='left')
    labels['first_followup_age'] = (labels['first_followup_time'] - labels['DATE_OF_BIRTH']).dt.days
    SaveFeature(labels, 'first_followup_age', version, kind)
    # 3 days from hypertension confirmation to the first follow-up
    labels = labels.merge(followup.groupby('ID')['CONFIRM_DATE'].max().reset_index(), on='ID', how='left')
    labels['first_followup_time_diff_confirm_time'] = (labels['first_followup_time'] - labels['CONFIRM_DATE']).dt.days
    SaveFeature(labels, 'first_followup_time_diff_confirm_time', version, kind)
    # 4 days from the first follow-up to the current TimePoint
    labels['TimePoint_diff_first_followup_time'] = (labels['TimePoint'] - labels['first_followup_time']).dt.days
    SaveFeature(labels, 'TimePoint_diff_first_followup_time', version, kind)
def gen_base_feature(version, kind='train'):
    """
    Compute sex, age, and hypertension-confirmation timing features.
    :param version: str, data version
    :param kind: str, 'train' or 'test'
    :return:
    """
    # 0 read historical data
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    # 1 sex
    labels = labels.merge(followup.groupby('ID')['SEX_CODE'].max().to_frame('SEX_CODE').reset_index(),
                          on=['ID'], how='left')
    SaveFeature(labels, 'SEX_CODE', version, kind)
    # 2 age at the TimePoint
    labels['age'] = labels.apply(lambda label: compute_age(filter_hist_data(label, followup), label), axis=1)
    SaveFeature(labels, 'age', version, kind)
    # 3 age when hypertension was confirmed
    labels = labels.merge(
        followup.groupby('ID').apply(
            lambda df_person: (df_person['CONFIRM_DATE'].max() - df_person['DATE_OF_BIRTH'].max()).days
        ).to_frame('confirm_age').reset_index(),
        on=['ID'], how='left')
    SaveFeature(labels, 'confirm_age', version, kind)
    # 4 time between confirmation and the TimePoint
    labels['time_diff_confirm_2TimePoint'] = labels['age'] - labels['confirm_age']
    SaveFeature(labels, 'time_diff_confirm_2TimePoint', version, kind)
def gen_decomposition_stats_vector_from_cat_vector(stats_name, kinds, size='30d',
                                                   decomp_method='lda', n_components=20):
    """
    :param stats_name: str, name of the statistic computed over drug counts
    :param kinds: str, category prefix ('A'-'E' or a combination) the vector was built from
    :param size: str, time granularity of the statistic: 1d, 4d, 7d, 15d, 30d, 45d
    :param decomp_method: str, decomposition method
    :param n_components: int, dimensionality after decomposition
    :return: (feature column names, call string that reproduces these features)
    """
    assert decomp_method in ['svd', 'nmf', 'lda']
    stats_matrix_name = '{}_{}_vector_by_{}'.format(stats_name, kinds, size)
    # 0 load the precomputed sparse stats matrix
    stats_sparse_matrix = sparse.load_npz(get_path() + 'Data/Feature/{}.npz'.format(stats_matrix_name)).toarray()
    # 1 decompose
    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components, n_iter=50, random_state=42)
        stats_matrix_decomp = svd.fit_transform(stats_sparse_matrix)
    elif decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components, init='random', random_state=0, max_iter=200)
        stats_matrix_decomp = nmf.fit_transform(stats_sparse_matrix)
    elif decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components, max_iter=50, learning_method='online',
                                        learning_offset=50., random_state=0, n_jobs=-1)
        stats_matrix_decomp = lda.fit_transform(stats_sparse_matrix)
    # 2 wrap the components into named feature columns
    n = stats_matrix_decomp.shape[1]
    columns = ['{}_{}_{}_vector_by_{}_{}_{}'.format(decomp_method, stats_name, kinds, size, n_components, j)
               for j in range(n)]
    stats_df = pd.DataFrame(data=stats_matrix_decomp, columns=columns)
    # 3 split back into train/test and save
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)
    return columns, 'gen_decomposition_stats_vector_from_cat_vector("{}", "{}", "{}", "{}", {})'.format(
        stats_name, kinds, size, decomp_method, n_components)
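# Standalone sketch of the decomposition step, runnable on fake data: the same
# (n_persons x n_categories) count matrix can be reduced by any of the three
# methods. LDA expects raw counts and NMF non-negative values, while SVD
# accepts anything; only shapes are shown here.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def demo_decompose(n_components=20):
    X = np.random.RandomState(0).poisson(1.0, size=(100, 300)).astype(float)  # fake count matrix
    svd = TruncatedSVD(n_components=n_components, n_iter=50, random_state=42)
    Z = svd.fit_transform(X)
    return Z.shape  # (100, n_components)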
def gen_rolling_stats_count(size, stats_name='sumratio2max', recompute=False):
    """
    Rolling-window statistics of visit counts, at day granularity.
    :param stats_name: str, statistic method
    :param size: str, rolling window width: 'xd' for x days
    :param recompute: bool, whether to recompute the feature
    :return: (feature_name, call string that reproduces this feature)
    """
    # 1 feature name
    feature_name = 'rolling_{}_count_{}'.format(stats_name, size)
    if IsAbsense(feature_name) or recompute:
        # 2 compute the feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
        train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
        train_test_data['count'] = 1
        # 2.2 compute the per-person rolling stats
        stats_df = train_test_data[['PERSONID', 'CREATETIME', 'count']].groupby('PERSONID').apply(
            lambda df_person: rolling_stats_count(df_person, stats_name, size)
        ).to_frame(feature_name).reset_index()
        # 2.3 merge back onto the id frame
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'], how='left')
        # 2.4 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    return feature_name, 'gen_rolling_stats_count("{}", "{}")'.format(size, stats_name)
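# Hedged sketch of the rolling step (rolling_stats_count itself is not shown,
# so this is a guess): resample one person's visits to daily counts, slide a
# `size`-wide time window, and reduce the rolling sums to a scalar.
# 'sumratio2max' plausibly reads as the busiest window's share of all visits.
def demo_rolling_stats_count(df_person, size='7d'):
    daily = df_person.set_index('CREATETIME')['count'].resample('1d').sum()
    rolling_sum = daily.rolling(size).sum()
    return rolling_sum.max() / max(daily.sum(), 1)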
def gen_action_time_feature(stats_name, version='v2', kind='train'):
    """
    :param stats_name: str, name of the action-time statistic
    :param version: str, data version
    :param kind: str, 'train' or 'test'
    :return: the saved feature name
    """
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 compute the feature per label row
    labels[stats_name] = labels.apply(
        lambda label: compute_action_time_feature(filter_hist_data(label, followup), stats_name),
        axis=1)
    SaveFeature(labels, stats_name, version, kind)
    return stats_name
def gen_missing_ratio(value, version='v1', kind='train'):
    """
    :param value: str, name of the measured value whose missing ratio is computed
    :param version: str, data version
    :param kind: str, 'train' or 'test'
    :return:
    """
    # 0 feature name
    feature_name = '{}_missing_ratio'.format(value)
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 compute the feature per label row
    labels[feature_name] = labels.apply(
        lambda label: compute_missing_ratio(filter_hist_data(label, followup), value),
        axis=1)
    # 3 save the feature
    SaveFeature(labels, feature_name, version, kind)
    return
def gen_(month, recompute=False):
    """Unfinished scaffold for a month-window feature generator: the feature name,
    the statistic itself, and the replay call string still need to be filled in."""
    # 1 feature name (TODO: fill in)
    feature_name = ''
    if IsAbsense(feature_name) or recompute:
        # 2 compute the feature
        print('compute {}'.format(feature_name))
        # 2.1 read data, with labels attached to the train records
        train_id, test_id, train_data, test_data, Ytrain = ReadData(Ytrain=True)
        train_id['LABEL'] = Ytrain['LABEL'].values
        train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
        # 2.2 restrict the data to the requested window
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        # TODO: compute the feature here
        # persons absent in this month get 0
        train_id[feature_name] = train_id[feature_name].fillna(0)
        test_id[feature_name] = test_id[feature_name].fillna(0)
        # save the feature
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    # the original returned a call string copy-pasted from another generator,
    # referencing undefined feature_list/agg_name; return this function's own call
    return feature_name, 'gen_("{}", {})'.format(month, recompute)
def gen_isolationforest():
    """Unsupervised anomaly score from an IsolationForest fit on the combined train+test features."""
    feature_name = 'iso_forest_score'
    # 1 prepare the data: reuse the feature set recorded in experiment log 43
    log = ReadExperimentLog(43)
    config = log['config']
    Xtrain, Ytrain, Xtest = CombineFeature(config['feature_names'])
    train_test_feature = pd.concat([Xtrain, Xtest], axis=0, ignore_index=True)
    # 2 fit the forest and score every sample
    clf = IsolationForest(n_estimators=500, random_state=42)
    clf.fit(train_test_feature[config['feature_names']].values)
    train_test_feature[feature_name] = clf.decision_function(
        train_test_feature[config['feature_names']].values)
    # 3 split back into train/test and save
    Xtrain[feature_name] = train_test_feature[feature_name][:15000].values
    Xtest[feature_name] = train_test_feature[feature_name][15000:].values
    SaveFeature(Xtrain, Xtest, feature_name)
    IsDifferentDistribution(feature_name)
    return
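# decision_function semantics, checked on a toy example: higher scores mean
# more normal, negative scores mean more anomalous, so iso_forest_score is
# low for suspicious persons.
import numpy as np
from sklearn.ensemble import IsolationForest

def demo_isolation_forest():
    rng = np.random.RandomState(42)
    X = np.vstack([rng.normal(0, 1, (100, 2)), [[8.0, 8.0]]])  # one obvious outlier
    clf = IsolationForest(n_estimators=100, random_state=42).fit(X)
    scores = clf.decision_function(X)
    assert scores[-1] < scores[:-1].mean()  # the outlier scores lowest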
def gen_stats_value_ftr51_in_month(month='month3', stats_name='count_ratio_range'):
    """
    :param month: str, month window to restrict the statistic to
    :param stats_name: str, name of the statistic computed over drug counts
    :return: (feature_name, call string that reproduces this feature)
    """
    # supported stats_name values:
    # ['nunique', 'nunique_ratio', 'len', 'count_std', 'count_max', 'count_range',
    #  'count_ratio_std', 'count_ratio_max', 'count_ratio_range']
    feature_name = '{}_ftr51_in_{}'.format(stats_name, month)
    # 0 read data and restrict it to the requested window
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_data, test_data = SelectDataByMonth(train_data, test_data, month)
    train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # 1 compute the per-person stats
    print('1 computing stats value of ftr51 in {}'.format(month))
    ftr51_stats_value_df = train_test_data[['PERSONID', 'CREATETIME', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_in_month(df_person, stats_name)
    ).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df, on=['PERSONID'], how='left')
    # 2 save the feature
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51_in_month("{}", "{}")'.format(month, stats_name)
def fill_na(value, train_version='v2', test_version='v2_1'):
    """
    :param value: str, the measured value to predict and fill
    :param train_version: str, feature version used to train the regression model
    :param test_version: str, test version whose missing values are filled
    :return:
    """
    # 1 different test subsets use different training features for prediction
    if test_version == 'v2_1':
        feature_names = ['SEX_CODE', 'age', 'confirm_age', 'time_diff_confirm_2TimePoint']
    elif test_version in ['v2_2', 'v2_3', 'v2_4']:
        feature_names = ['SEX_CODE', 'age', 'confirm_age', 'time_diff_confirm_2TimePoint',
                         'first_followup_age', 'first_followup_time_diff_confirm_time',
                         'TimePoint_diff_first_followup_time']
    else:
        assert False, 'unknown test_version: {}'.format(test_version)
    # 2 read the regression target
    Ytrain = pd.read_pickle(get_path_feature() + '{}_{}_{}.pkl'.format(train_version, value, 'train'))
    # 3.1 read and merge the train and test feature frames
    Xtrain = pd.read_pickle(get_path_feature() + '{}_{}_{}.pkl'.format(train_version, feature_names[0], 'train'))
    print('the shape of Xtrain is', Xtrain.shape)
    Xtest = pd.read_pickle(get_path_feature() + '{}_{}_{}.pkl'.format(test_version, feature_names[0], 'test'))
    print('the shape of Xtest is', Xtest.shape)
    for feat in feature_names[1:]:
        train_feature = pd.read_pickle(get_path_feature() + '{}_{}_{}.pkl'.format(train_version, feat, 'train'))
        print('the shape of train feature is', train_feature.shape)
        Xtrain = Xtrain.merge(train_feature, on=['ID', 'TimePoint', 'version'], how='left')
        print('the shape of Xtrain is', Xtrain.shape)
        test_feature = pd.read_pickle(get_path_feature() + '{}_{}_{}.pkl'.format(test_version, feat, 'test'))
        print('the shape of test feature is', test_feature.shape)
        Xtest = Xtest.merge(test_feature, on=['ID', 'TimePoint', 'version'], how='left')
        print('the shape of Xtest is', Xtest.shape)
    # 3.2 drop samples whose target carries a missing-value sentinel
    mask = (Ytrain[value] != -9999) & (Ytrain[value] != -99999)
    Xtrain = Xtrain[mask].reset_index(drop=True)
    Ytrain = Ytrain[mask].reset_index(drop=True)
    # 4 train the regressor and predict the test values
    clf = xgb.XGBRegressor(max_depth=3, learning_rate=0.03, n_estimators=200,
                           silent=True, objective='reg:linear')
    clf.fit(Xtrain[feature_names].values, Ytrain[value].values, eval_metric='rmse')
    y_pred = clf.predict(Xtest[feature_names].values)
    # 5 if a target file already exists, fill only its NAs; otherwise use the predictions outright
    if os.path.exists(get_path_feature() + '{}_{}_{}.pkl'.format(test_version, value, 'test')):
        Ytest = pd.read_pickle(get_path_feature() + '{}_{}_{}.pkl'.format(test_version, value, 'test'))
        print('the shape of Ytest is', Ytest.shape)
        for i in range(Ytest.shape[0]):
            if pd.isna(Ytest[value][i]):
                Ytest.loc[i, value] = y_pred[i]
        Xtest = Xtest.merge(Ytest, on=['ID', 'TimePoint', 'version'], how='left')
    else:
        Xtest[value] = y_pred
    # 6 done predicting; save the filled feature
    SaveFeature(feat_df=Xtest, feature_name=value, version=test_version, kind='test')
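# Hedged usage sketch: fill_na is presumably invoked once per vital sign so
# that every regression target of gen_time_feature gets its test-set gaps
# imputed; the loop below and its value names are an assumption based on the
# `values` list in gen_time_feature.
def demo_fill_all_vitals(test_version='v2_1'):
    for value in ['SBP', 'DBP', 'GLU', 'BMI']:
        fill_na(value, train_version='v2', test_version=test_version)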
def gen_fraud_ratio_feature(kinds='B'):
    """
    :param kinds: str, category prefix to target-encode: 'A'-'E' or a combination
    :return:
    """
    # 0 read data and attach labels to the train records
    train_id, test_id, train_data, test_data, Ytrain = ReadData(Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person category counts
    df_cat_person_count = train_data[['PERSONID', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: ftr51s2cat_count_dict(df_person, kinds)
    ).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person fraud counts
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][['PERSONID', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: ftr51s2cat_count_dict(df_person, kinds)
    ).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- a deep bug:
    # non-fraud persons have no personal fraud records, so their fraud dict
    # must be repaired to all zeros instead of staying NaN
    train_id['fraud_dict_person'] = train_id[['count_dict_person', 'fraud_dict_person']].apply(
        lambda x: repair_fraud_dict_person(x), axis=1)
    # ----------------------------------------
    # 3 global counts
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global fraud counts, padded with zeros for categories never seen in fraud
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all_fraud, kinds)
    fraud_dict_all = {key: fraud_dict_all.get(key, 0) for key in count_dict_all.keys()}
    # 5 broadcast the global dicts onto every row
    train_id['count_dict_all'] = [count_dict_all for _ in range(train_id.shape[0])]
    train_id['fraud_dict_all'] = [fraud_dict_all for _ in range(train_id.shape[0])]
    # 6 out-of-bag dicts: remove each person's own records from the global counts
    train_id['count_dict_oob'] = train_id[['count_dict_all', 'count_dict_person']].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']), axis=1)
    train_id['fraud_dict_oob'] = train_id[['fraud_dict_all', 'fraud_dict_person']].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']), axis=1)
    # 7 per-category fraud-ratio dicts
    # train: out-of-bag ratios
    train_id['cat_fraud_ratio_dict_oob'] = train_id[['count_dict_oob', 'fraud_dict_oob']].apply(
        lambda s: division_dict(s['count_dict_oob'], s['fraud_dict_oob']), axis=1)
    # test: global ratios, restricted to the categories each person actually has
    cat_fraud_ratio_dict_all = division_dict(count_dict_all, fraud_dict_all)
    test_id['cat_fraud_ratio_dict_oob'] = [cat_fraud_ratio_dict_all for _ in range(test_id.shape[0])]
    count_dict_person_test = test_data[['PERSONID', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: ftr51s2cat_count_dict(df_person, kinds)
    ).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test, on=['PERSONID'], how='left')
    test_id['cat_fraud_ratio_dict_oob'] = test_id.apply(
        lambda x: {key: x['cat_fraud_ratio_dict_oob'].get(key, 0)
                   for key in x['count_dict_person'].keys()},
        axis=1)
    # use the per-category fraud ratios to build person-level features
    # 8 max_fraud_ratio
    train_id['max_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())
    test_id['max_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())
    # 9 sum_fraud_ratio
    train_id['sum_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())
    test_id['sum_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())
    # 10 mean_fraud_ratio
    train_id['mean_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())
    test_id['mean_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())
    # 11 save the features and check their train/test distributions
    for feat in ['max_fraud_ratio', 'sum_fraud_ratio', 'mean_fraud_ratio']:
        SaveFeature(train_id, test_id, feat)
        IsDifferentDistribution(feat)
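# Why the out-of-bag (oob) dicts matter, in miniature: subtracting a person's
# own counts from the global counts before taking fraud ratios keeps that
# person's own label out of their feature, avoiding target leakage. The dict
# comprehensions below are simplified stand-ins for subtract_dict/division_dict.
def demo_oob_fraud_ratio():
    count_all = {'B1': 10, 'B2': 5}
    fraud_all = {'B1': 4, 'B2': 0}
    count_person, fraud_person = {'B1': 2}, {'B1': 2}  # a fraudulent person
    count_oob = {k: count_all[k] - count_person.get(k, 0) for k in count_all}
    fraud_oob = {k: fraud_all[k] - fraud_person.get(k, 0) for k in fraud_all}
    # -> {'B1': 0.25, 'B2': 0.0}; without the subtraction B1 would leak as 0.4
    return {k: fraud_oob[k] / max(count_oob[k], 1) for k in count_oob}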
def gen_decomposition_stats_vector_ftr51(stats_name, size='7d', non_zero=False,
                                         decomp_method='lda', n_components=5):
    """
    :param stats_name: str, name of the statistic computed over drug counts
    :param size: str, time granularity of the statistic: 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, whether the statistic was restricted to non-zero bins
    :param decomp_method: str, decomposition method
    :param n_components: int, dimensionality after decomposition
    :return: (feature column names, call string that reproduces these features)
    """
    assert decomp_method in ['svd', 'nmf', 'lda']
    # sum/max style statistics are not defined on non-zero-only bins
    assert not ((stats_name in ['sum', 'max', 'sum_ratio', 'max_ratio']) and non_zero)
    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)
    # 0 load the precomputed sparse stats matrix
    ftr51_stats_sparse_matrix = sparse.load_npz(get_path() + 'Data/Feature/{}.npz'.format(matrix_name)).toarray()
    # 1 decompose
    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components, n_iter=50, random_state=42)
        ftr51_stats_matrix_decomp = svd.fit_transform(ftr51_stats_sparse_matrix)
    elif decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components, init='random', random_state=0, max_iter=200)
        ftr51_stats_matrix_decomp = nmf.fit_transform(ftr51_stats_sparse_matrix)
    elif decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components, max_iter=50, learning_method='online',
                                        learning_offset=50., random_state=0, n_jobs=1)
        ftr51_stats_matrix_decomp = lda.fit_transform(ftr51_stats_sparse_matrix)
        joblib.dump(lda, 'lda_{}_{}.m'.format(stats_name, size))
    # 2 wrap the components into named feature columns
    columns = ['{}_{}_vector_by_{}_{}_{}_{}'.format(decomp_method, stats_name, size, non_zero, n_components, j)
               for j in range(ftr51_stats_matrix_decomp.shape[1])]
    stats_df = pd.DataFrame(data=ftr51_stats_matrix_decomp, columns=columns)
    # 3 split back into train/test and save
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)
    return columns, 'gen_decomposition_stats_vector_ftr51("{}", "{}", {}, "{}", {})'.format(
        stats_name, size, non_zero, decomp_method, n_components)
def gen_fraud_ratio_feature(kinds='E', stats_name='fraud_ratio_mean_weight'):
    """
    For every category a person has, accumulate out-of-bag (oob) count and fraud totals,
    then reduce them with the named statistic. E.g. if one record of a fraudulent user
    contains B1 twice, B1 gets fraud +2 and count +2.
    NOTE: this definition shadows the three-feature gen_fraud_ratio_feature above.
    :param kinds: str, category prefix to target-encode: 'A'-'E' or a combination
    :param stats_name: str, statistic computed from the oob count/fraud dicts
    :return:
    """
    feature_name = '{}_{}'.format(stats_name, kinds)
    print('computing feature {}'.format(feature_name))
    # 0 read data and attach labels to the train records
    train_id, test_id, train_data, test_data, Ytrain = ReadData(Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person category counts
    df_cat_person_count = train_data[['PERSONID', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: ftr51s2cat_count_dict(df_person, kinds)
    ).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person fraud counts
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][['PERSONID', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: ftr51s2cat_count_dict(df_person, kinds)
    ).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- a deep bug:
    # non-fraud persons have no personal fraud records, so their fraud dict
    # must be repaired to all zeros instead of staying NaN
    train_id['fraud_dict_person'] = train_id[['count_dict_person', 'fraud_dict_person']].apply(
        lambda x: repair_fraud_dict_person(x), axis=1)
    # ----------------------------------------
    # 3 global counts
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global fraud counts, padded with zeros for categories never seen in fraud
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all_fraud, kinds)
    fraud_dict_all = {key: fraud_dict_all.get(key, 0) for key in count_dict_all.keys()}
    # 5 broadcast the global dicts onto every row
    train_id['count_dict_all'] = [count_dict_all for _ in range(train_id.shape[0])]
    train_id['fraud_dict_all'] = [fraud_dict_all for _ in range(train_id.shape[0])]
    # 6 out-of-bag dicts: remove each person's own records from the global counts
    train_id['count_dict_oob'] = train_id[['count_dict_all', 'count_dict_person']].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']), axis=1)
    train_id['fraud_dict_oob'] = train_id[['fraud_dict_all', 'fraud_dict_person']].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']), axis=1)
    # 7 for the test set the global dicts serve as oob, restricted to each person's categories
    count_dict_person_test = test_data[['PERSONID', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: ftr51s2cat_count_dict(df_person, kinds)
    ).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test, on=['PERSONID'], how='left')
    test_id['fraud_dict_oob'] = [fraud_dict_all for _ in range(test_id.shape[0])]
    test_id['count_dict_oob'] = [count_dict_all for _ in range(test_id.shape[0])]
    test_id['count_dict_oob'] = test_id.apply(
        lambda x: {key: x['count_dict_oob'].get(key, 0) for key in x['count_dict_person'].keys()},
        axis=1)
    test_id['fraud_dict_oob'] = test_id.apply(
        lambda x: {key: x['fraud_dict_oob'].get(key, 0) for key in x['count_dict_person'].keys()},
        axis=1)
    # 8 reduce the oob dicts to the final statistic, save, and check the distribution
    train_id[feature_name] = train_id.apply(lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    test_id[feature_name] = test_id.apply(lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    SaveFeature(train_id, test_id, feature_name)
    IsDifferentDistribution(feature_name)