def gen_stats_value_ftr51(stats_name, size='400d'):
    """Compute one per-person scalar statistic over FTR51 (drug counts).

    :param stats_name: str, name of the statistic applied to the drug counts
    :param size: str, time granularity of the statistic, e.g. 7d, 15d, 30d, 45d
    :return: tuple of (feature_name, command string that reproduces this call)
    """
    feature_name = '{}_ftr51_by_{}'.format(stats_name, size)
    # 0 load raw data and stack train + test so the statistic is computed once
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    all_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # 1 per-person statistic of FTR51
    print('1 computing stats value of ftr51 by {}'.format(size))
    grouped = all_data[['PERSONID', 'CREATETIME', 'FTR51']].groupby('PERSONID')
    stats_series = grouped.apply(
        lambda person_df: compute_stats_value_FTR51_by_size(
            person_df, stats_name, size))
    stats_df = stats_series.to_frame(feature_name).reset_index()
    all_id = all_id.merge(stats_df, on=['PERSONID'], how='left')
    # 2 split back into train/test (first 15000 rows are train) and persist
    train_id[feature_name] = all_id[feature_name][:15000].values
    test_id[feature_name] = all_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51("{}", "{}")'.format(
        stats_name, size)
def gen_stats_cost_by_non_zero_group(cost, stats_name='mean_mean', size='7d', recompute=False):
    """Per-person statistic of a cost column over non-zero time groups.

    :param cost: str, name of the cost/item column to aggregate
    :param stats_name: str, statistic name, one of e.g.
        ['len_max', 'len_max_ratio', 'len_mean', 'len_std', 'len_count',
         'sum_max', 'sum_max_ratio', 'sum_mean', 'sum_std',
         'mean_max', 'mean_std', 'mean_mean']
    :param size: str, resample interval like 'xd' (x days) or '1t' (per visit)
    :param recompute: bool, recompute the feature even if it is cached
    :return: tuple of (feature_name, command string that reproduces this call)
    """
    # 1 feature name and the repro command.
    # FIX: the repro string was duplicated in both branches; build it once.
    feature_name = '{}_{}_by_non_zero_group_{}'.format(stats_name, cost, size)
    repro = 'gen_stats_cost_by_non_zero_group("{}", "{}", "{}")'.format(
        cost, stats_name, size)
    # FIX: use boolean `or` instead of bitwise `|` (same result on bools,
    # clearer intent; `recompute` is a side-effect-free flag).
    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0,
                                  ignore_index=True)
        train_test_data = pd.concat([train_data, test_data], axis=0,
                                    ignore_index=True)
        # 2.3 per-person statistic
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', cost
        ]].groupby('PERSONID').apply(
            lambda df_person: stats_cost_by_non_zero_group(
                df_person, cost, stats_name, size)).to_frame(
                    feature_name).reset_index()
        # 2.4 merge back onto the id frame
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'],
                                            how='left')
        # 2.5 split back into train/test and persist
        # NOTE(review): 15000 is the hard-coded train-set size — TODO confirm.
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    return feature_name, repro
def gen_cat_vector_from_ftr51(kinds):
    """Build and save a sparse count matrix of category `kinds` for all records.

    :param kinds: str, category prefix(es) to count, e.g. A, D, AB
    :return: tuple of (matrix_name, command string that reproduces this call)
    """
    print('compute {} vector for train_test_data'.format(kinds))
    matrix_name = '{}_vector_from_ftr51'.format(kinds)
    # 0 load raw records and stack train + test
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    full_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # one count dict per record
    full_data['cat_count_dict'] = full_data['FTR51'].map(
        lambda ftr51s: compute_cat_count_dict_from_ftr51s(ftr51s, kinds))
    # align all dicts into a single sparse matrix
    vectorizer = DictVectorizer()
    count_matrix = vectorizer.fit_transform(full_data['cat_count_dict'].values)
    sparse.save_npz(get_path() + 'Data/Feature/{}.npz'.format(matrix_name),
                    count_matrix)
    return matrix_name, 'gen_cat_vector_from_ftr51("{}")'.format(kinds)
def gen_stats_count(stats_name, month='global', size='1d', non_zero=True, recompute=False):
    """Per-person statistic of visit counts within a month window.

    :param stats_name: str, statistic name
    :param month: str, time window to restrict the records to ('global' = all)
    :param size: str, resample interval like 'xd' (x days) or '1t' (per visit)
    :param non_zero: bool, only aggregate time buckets with non-zero counts
    :param recompute: bool, recompute the feature even if it is cached
    :return: tuple of (feature_name, command string that reproduces this call)
    """
    # 1 feature name and the repro command.
    # FIX: the repro string was duplicated in both branches; build it once.
    feature_name = '{}_count_in_{}_by_{}_{}'.format(stats_name, month, size,
                                                    non_zero)
    repro = 'gen_stats_count("{}", "{}", "{}", {})'.format(
        stats_name, month, size, non_zero)
    # FIX: boolean `or` instead of bitwise `|` (same result, clearer intent).
    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0,
                                  ignore_index=True)
        # 2.2 restrict records to the requested month window
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        train_test_data = pd.concat([train_data, test_data], axis=0,
                                    ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 per-person count statistic
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', 'count'
        ]].groupby('PERSONID').apply(lambda df_person: stats_count_by_size(
            df_person, stats_name, size, non_zero)).to_frame(
                feature_name).reset_index()
        # 2.4 merge back onto the id frame; fill NaN per statistic semantics
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'],
                                            how='left')
        count_stats_fillna_by_stats_name(train_test_id, feature_name,
                                         stats_name)
        # 2.5 split back into train/test and persist
        # NOTE(review): 15000 is the hard-coded train-set size — TODO confirm.
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    return feature_name, repro
def run(config): """ :param config: dict, 配置字典 :return: """ # 1 根据配置合并特征 Xtrain, Ytrain, Xtest = CombineFeature(config['feature_names']) # ------------------------ train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True) Xtrain['PERSONID'] = train_id['PERSONID'] Ytrain['PERSONID'] = train_id['PERSONID'] Xtest['PERSONID'] = test_id['PERSONID'] Xtrain.to_csv('Xtrain_xiao.csv', index=False) Ytrain.to_csv('Ytrain_xiao.csv', index=False) Xtest.to_csv('Xtest_xiao.csv', index=False) Xtrain.drop(['PERSONID'], axis=1, inplace=True) Ytrain.drop(['PERSONID'], axis=1, inplace=True) Xtest.drop(['PERSONID'], axis=1, inplace=True) # ------------------------ # 2 根据配置初始化模型 model = InitModel(Xtrain, Ytrain, Xtest, config) # 3 线下验证 model.offline_validate() # 4 线上预测 model.online_predict() # 保存实验结果 if config['save_experiment_result']: model.save_experiment_result() # 6 返回线下验证分数以及显示预测结果 # 保存模型 for i, booster in enumerate(model.booster_offline_list): booster.save_model('xgb{}.m'.format(i)) # 连接模型预测 feature_names = list(Xtest.columns) xgb_test = xgb.DMatrix(Xtest[feature_names].values, feature_names=feature_names) submission_list = [] for i, best_iter in enumerate([161, 292, 160, 246, 269]): load_model = xgb.Booster(model_file='xgb{}.m'.format(i)) submission_list.append( load_model.predict(xgb_test, ntree_limit=best_iter)) submission = np.mean(submission_list, axis=0) print(np.sum(np.abs(model.submission_online - submission))) return model.mean_score_offline, model.submission_online, model.fold_results
def gen_stats_vector_from_cat_vector(stats_name, size, kinds):
    """Aggregate a saved per-record category matrix into per-person stats vectors.

    :param stats_name: str, name of the statistic to apply
    :param size: str, time granularity of the statistic
    :param kinds: str, category kind whose matrix was saved previously
    :return: tuple of (stats_matrix_name, command string reproducing this call)
    """
    # 0 load the per-record category matrix produced for train_test_data
    print('gen_stats_vector_from_cat_vector("{}", "{}", "{}")'.format(
        stats_name, size, kinds))
    input_matrix_name = '{}_vector_from_ftr51'.format(kinds)
    dense_matrix = sparse.load_npz(
        get_path() +
        'Data/Feature/{}.npz'.format(input_matrix_name)).toarray()
    print('The shape of matrix is ( {}, {}) '.format(dense_matrix.shape[0],
                                                     dense_matrix.shape[1]))
    # 1 load the base data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    all_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # 2 wrap the matrix in a DataFrame so it can be grouped per person
    matrix_df = pd.DataFrame(data=dense_matrix)
    print('2')
    del dense_matrix  # free the dense copy before the groupby
    gc.collect()
    matrix_df['PERSONID'] = all_data['PERSONID']
    matrix_df['CREATETIME'] = all_data['CREATETIME']
    # 3 per-person statistics, one dict per person
    stats_df = matrix_df.groupby('PERSONID').apply(
        lambda person_df: compute_stats_dict_from_cat_matrix(
            person_df, stats_name, size)).to_frame('stats_dict').reset_index()
    print(3)
    all_id = all_id.merge(stats_df, on=['PERSONID'], how='left')
    # 4 vectorize the dicts into one sparse matrix and save it
    vectorizer = DictVectorizer()
    stats_matrix = vectorizer.fit_transform(all_id['stats_dict'].values)
    print(4)
    stats_matrix_name = '{}_{}_vector_by_{}'.format(stats_name, kinds, size)
    sparse.save_npz(
        get_path() + 'Data/Feature/{}.npz'.format(stats_matrix_name),
        stats_matrix)
    return stats_matrix_name, 'gen_stats_vector_from_cat_vector("{}", "{}", "{}")'.format(
        stats_name, size, kinds)
def gen_rolling_stats_count(size, stats_name='sumratio2max', recompute=False):
    """Rolling-window statistic of per-person visit counts.

    :param size: str, rolling window like 'xd' (x days)
    :param stats_name: str, statistic name
    :param recompute: bool, recompute the feature even if it is cached
    :return: tuple of (feature_name, command string that reproduces this call)
    """
    # 1 feature name and the repro command.
    # FIX: the repro string was duplicated in both branches; build it once.
    feature_name = 'rolling_{}_count_{}'.format(stats_name, size)
    repro = 'gen_rolling_stats_count("{}", "{}")'.format(size, stats_name)
    # FIX: boolean `or` instead of bitwise `|` (same result, clearer intent).
    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0,
                                  ignore_index=True)
        train_test_data = pd.concat([train_data, test_data], axis=0,
                                    ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 per-person rolling count statistic
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', 'count'
        ]].groupby('PERSONID').apply(lambda df_person: rolling_stats_count(
            df_person, stats_name, size)).to_frame(feature_name).reset_index()
        # 2.4 merge back onto the id frame
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'],
                                            how='left')
        # 2.5 split back into train/test and persist
        # NOTE(review): 15000 is the hard-coded train-set size — TODO confirm.
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
    else:
        print('The Feature has already been computed \n')
    return feature_name, repro
def gen_stats_vector_ftr51(stats_name, size='7d', non_zero=False):
    """Build and save a per-person sparse statistic vector over FTR51.

    :param stats_name: str, statistic applied to the drug counts; one of
        ['sum', 'sum_ratio', 'max', 'max_ratio', 'mean', 'std']
    :param size: str, time granularity: 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, restrict the statistic to non-zero buckets;
        not allowed together with the sum/max family
    :return: tuple of (matrix_name, command string that reproduces this call)
    """
    assert stats_name in [
        'sum', 'sum_ratio', 'max', 'max_ratio', 'mean', 'std'
    ]
    # sum/max style statistics are incompatible with the non_zero flag.
    # FIX: use boolean `and` instead of bitwise `&` (same result on bools).
    invalid = (stats_name in ['sum', 'sum_ratio', 'max', 'max_ratio']) and non_zero
    assert not invalid
    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)
    # 0 read data and stack train + test
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data], axis=0,
                                ignore_index=True)
    # 1 per-person statistic dicts
    print('1 computing stats dict {}'.format(size))
    ftr51_stats_dict_df = train_test_data[[
        'PERSONID', 'CREATETIME', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: stats_FTR51_by_size(
        df_person, stats_name, size, non_zero)).to_frame(
            'stats_dict').reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_dict_df, on=['PERSONID'],
                                        how='left')
    v = DictVectorizer()
    # 2 vectorize the dicts into one sparse matrix
    # FIX: the literal had no '{}' placeholder, so '.format(size)' was a
    # no-op; drop the dead call (printed output is unchanged).
    print('2 computing stats vector')
    ftr51_stats_sparse_matrix = v.fit_transform(
        train_test_id['stats_dict'].values)
    # persist the fitted vectorizer so the column order can be reproduced
    joblib.dump(v, 'v_{}_{}.m'.format(stats_name, size))
    sparse.save_npz(get_path() + 'Data/Feature/{}.npz'.format(matrix_name),
                    ftr51_stats_sparse_matrix)
    return matrix_name, 'gen_stats_vector_ftr51("{}", "{}", {})'.format(
        stats_name, size, non_zero)
def gen_(month, recompute=False):
    # NOTE(review): this looks like an unfinished feature template:
    #   * feature_name is the empty string, so the cache check, the fillna
    #     columns and SaveFeature all operate on a feature named '';
    #   * train_id[feature_name] is filled before it is ever created —
    #     presumably the actual statistic computation is missing;
    #   * the returned repro string references `feature_list` and `agg_name`,
    #     which are not defined anywhere in this function → NameError at
    #     runtime. Confirm intent before using or deleting this function.
    # 1 feature name (placeholder)
    feature_name = ''
    if IsAbsense(feature_name) | recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data (with labels)
        train_id, test_id, train_data, test_data, Ytrain = ReadData(Ytrain=True)
        train_id['LABEL'] = Ytrain['LABEL'].values
        train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
        # 2.2 restrict records to the requested month window
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        # persons absent from this month get 0
        train_id[feature_name] = train_id[feature_name].fillna(0)
        test_id[feature_name] = test_id[feature_name].fillna(0)
        # persist the feature
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_stats_woe_OfPerson_by_columns({}, "{}", {})'.format(feature_list, agg_name, False)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_stats_woe_OfPerson_by_columns({}, "{}", {})'.format(feature_list, agg_name, False)
def gen_stats_value_ftr51_in_month(month='month3', stats_name='count_ratio_range'):
    """Compute one per-person scalar statistic over FTR51 within a month window.

    :param month: str, time window to restrict the records to, e.g. 'month3'
    :param stats_name: str, statistic applied to the drug counts; one of
        ['nunique', 'nunique_ratio', 'len', 'count_std', 'count_max',
         'count_range', 'count_ratio_std', 'count_ratio_max',
         'count_ratio_range']
    :return: tuple of (feature_name, command string that reproduces this call)
    """
    feature_name = '{}_ftr51_in_{}'.format(stats_name, month)
    # 0 read data, restrict to the month window, stack train + test
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_data, test_data = SelectDataByMonth(train_data, test_data, month)
    train_test_data = pd.concat([train_data, test_data], axis=0,
                                ignore_index=True)
    # 1 per-person statistic of FTR51
    print('1 computing stats value of ftr51 in {}'.format(month))
    ftr51_stats_value_df = train_test_data[[
        'PERSONID', 'CREATETIME', 'FTR51'
    ]].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_in_month(
            df_person, stats_name)).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df, on=['PERSONID'],
                                        how='left')
    # 2 split back into train/test (first 15000 rows are train) and persist
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    # FIX: the repro string previously named gen_stats_value_ftr51 with the
    # arguments in the wrong order; it now reproduces THIS call.
    return feature_name, 'gen_stats_value_ftr51_in_month("{}", "{}")'.format(
        month, stats_name)
def gen_fraud_ratio_feature(kinds='B'):
    """Generate per-person fraud-ratio features from out-of-bag category counts.

    :param kinds: str, target-encoded category character(s); any of ABCDE or
        a combination thereof
    :return: None (features are persisted via SaveFeature)
    """
    # 0 read data with labels; temporarily attach LABEL to train_data rows
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person category counts
    df_cat_person_count = train_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person category counts over fraud rows only
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- subtle bug guard
    # Non-fraud persons have no fraud rows, so the merge above leaves their
    # fraud_dict_person missing; repair replaces it with an all-zero dict.
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(lambda x: repair_fraud_dict_person(x), axis=1)
    # --------------------------------------- end subtle bug guard
    # 3 global category counts over all train rows
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global category counts over fraud rows, padded with 0 so it has the
    # same key set as count_dict_all
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(
        ftr51s_all_fraud, kinds)
    fraud_dict_all = {
        key: fraud_dict_all.setdefault(key, 0)
        for key in count_dict_all.keys()
    }
    # 5 broadcast the global dicts onto every train row
    # NOTE(review): every row shares the SAME dict object here — fine as long
    # as downstream code never mutates them in place; see setdefault below.
    train_id['count_dict_all'] = [
        count_dict_all for _ in range(train_id.shape[0])
    ]
    train_id['fraud_dict_all'] = [
        fraud_dict_all for _ in range(train_id.shape[0])
    ]
    # 6 out-of-bag dicts: global counts minus this person's own counts
    train_id['count_dict_oob'] = train_id[[
        'count_dict_all', 'count_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']),
        axis=1)
    train_id['fraud_dict_oob'] = train_id[[
        'fraud_dict_all',
        'fraud_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']),
        axis=1)
    # 7 per-category fraud ratio dicts (fraud count / total count)
    # train: out-of-bag ratios
    train_id['cat_fraud_ratio_dict_oob'] = train_id[[
        'count_dict_oob', 'fraud_dict_oob'
    ]].apply(lambda s: division_dict(s['count_dict_oob'], s['fraud_dict_oob']),
             axis=1)
    # test: global ratios, restricted to the categories the person actually has
    cat_fraud_ratio_dict_all = division_dict(count_dict_all, fraud_dict_all)
    test_id['cat_fraud_ratio_dict_oob'] = [
        cat_fraud_ratio_dict_all for _ in range(test_id.shape[0])
    ]
    count_dict_person_test = test_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test, on=['PERSONID'],
                            how='left')
    # NOTE(review): setdefault MUTATES the shared cat_fraud_ratio_dict_all
    # object across rows (inserting 0 for unseen keys); the produced values
    # match .get(key, 0), but the shared dict grows — confirm intentional.
    test_id['cat_fraud_ratio_dict_oob'] = test_id.apply(lambda x: {
        key: x['cat_fraud_ratio_dict_oob'].setdefault(key, 0)
        for key in x['count_dict_person'].keys()
    }, axis=1)
    # derive per-person scalar features from the category fraud ratios
    # 8 max_fraud_ratio
    train_id['max_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())
    test_id['max_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())
    # 9 sum_fraud_ratio
    train_id['sum_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())
    test_id['sum_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())
    # 10 mean_fraud_ratio
    train_id['mean_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())
    test_id['mean_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())
    # 11 persist the features and check train/test distribution drift
    for feat in ['max_fraud_ratio', 'sum_fraud_ratio', 'mean_fraud_ratio']:
        SaveFeature(train_id, test_id, feat)
        IsDifferentDistribution(feat)
import matplotlib.pyplot as plt
import numpy as np
import sys
from sklearn import svm
from utils.ReadData import *
from utils.Calculations import *

if __name__ == "__main__":
    # Usage: python GaussianSVM.py TrainData TestData
    if len(sys.argv) != 3:
        print("Format: python GaussianSVM.py TrainData TestData")
        exit(0)
    # Read Data
    TrainX, TrainY = ReadData(sys.argv[1])
    TestX, TestY = ReadData(sys.argv[2])
    # Transform Class(Digit): binarize labels against digit 0
    TrainY = TransformData(TrainY, 0)
    TestY = TransformData(TestY, 0)
    # Start Training~~ sweep C over 10^logC with a fixed-gamma RBF kernel
    logC = [-2, -1, 0, 1, 2]
    TrackDis = []
    for i in logC:
        C = 10**i
        GaussianSVM = svm.SVC(kernel="rbf", gamma=80, C=C)
        GaussianSVM.fit(TrainX, TrainY)
        # NOTE(review): sklearn's dual_coef_ already equals y_i * alpha_i,
        # so multiplying by the labels again yields the bare alphas —
        # confirm this is what cal_Distance expects.
        W = TrainY[GaussianSVM.support_] * GaussianSVM.dual_coef_[0]
        TrackDis.append(cal_Distance(W))
    # Plot the graph
from utils.Calculation import *


def RidgeRegression(X, Y, Lambda = 1):
    """Closed-form ridge regression solution.

    Assumes CalPseudoinverse(X, Lambda) returns the regularized
    pseudo-inverse (X^T X + Lambda*I)^-1 — TODO confirm against
    utils.Calculation.
    """
    W = CalPseudoinverse(X, Lambda) @ X.T @ Y
    return W


if __name__ == "__main__":
    # Usage: python RidgeRegression.py DataPath lambda
    if len(sys.argv) != 3:
        print("Format: python RidgeRegression.py DataPath lambda")
        exit(0)
    # NOTE(review): `sys` and `Ridge` are not imported in this view —
    # presumably they come via the star import above; verify, otherwise
    # this raises NameError at runtime.
    Lambda = float(sys.argv[2])
    # Read Data
    X, Y = ReadData(sys.argv[1])
    # Cut train and test data: first 400 rows train, rest test
    TrainX, TestX = X[:400], X[400:]
    TrainY, TestY = Y[:400], Y[400:]
    # Run RidgeRegression (own solver) and sklearn Ridge for comparison
    W = RidgeRegression(TrainX, TrainY, Lambda)
    Clf = Ridge(alpha = Lambda)
    Clf.fit(TrainX, TrainY)
    # Calculate error
    E_in = CalError(W, TrainX, TrainY)
    E_out = CalError(W, TestX, TestY)
    print("My E_in:", E_in)
def gen_fraud_ratio_feature(kinds='E', stats_name='fraud_ratio_mean_weight'):
    """Generate one per-person statistic from out-of-bag category count/fraud dicts.

    For every category of a person's records: a fraud person's record with B1
    appearing twice contributes fraud +2 and count +2. The out-of-bag (oob)
    count/fraud dicts are then fed to stats_by_oob_dict.

    NOTE(review): this redefines gen_fraud_ratio_feature if it lives in the
    same module as the earlier kinds='B' version — the later definition wins.

    :param kinds: str, target-encoded category character(s); any of ABCDE or
        a combination thereof
    :param stats_name: str, name of the statistic computed from the oob dicts
    :return: None (feature is persisted via SaveFeature)
    """
    feature_name = '{}_{}'.format(stats_name, kinds)
    print('computing feature {}'.format(feature_name))
    # 0 read data with labels; temporarily attach LABEL to train_data rows
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person category counts
    df_cat_person_count = train_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person category counts over fraud rows only
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- subtle bug guard
    # Non-fraud persons have no fraud rows, so the merge above leaves their
    # fraud_dict_person missing; repair replaces it with an all-zero dict.
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(lambda x: repair_fraud_dict_person(x), axis=1)
    # --------------------------------------- end subtle bug guard
    # 3 global category counts over all train rows
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global category counts over fraud rows, padded with 0 so it has the
    # same key set as count_dict_all
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(
        ftr51s_all_fraud, kinds)
    fraud_dict_all = {
        key: fraud_dict_all.setdefault(key, 0)
        for key in count_dict_all.keys()
    }
    # 5 broadcast the global dicts onto every train row
    # NOTE(review): every row shares the SAME dict object — see setdefault
    # mutation note below.
    train_id['count_dict_all'] = [
        count_dict_all for _ in range(train_id.shape[0])
    ]
    train_id['fraud_dict_all'] = [
        fraud_dict_all for _ in
        range(train_id.shape[0])
    ]
    # 6 out-of-bag dicts: global counts minus this person's own counts
    train_id['count_dict_oob'] = train_id[[
        'count_dict_all', 'count_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']),
        axis=1)
    train_id['fraud_dict_oob'] = train_id[[
        'fraud_dict_all', 'fraud_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']),
        axis=1)
    # test side: the oob dicts are the global dicts, restricted to the
    # categories the person actually has
    count_dict_person_test = test_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test, on=['PERSONID'],
                            how='left')
    test_id['fraud_dict_oob'] = [
        fraud_dict_all for _ in range(test_id.shape[0])
    ]
    test_id['count_dict_oob'] = [
        count_dict_all for _ in range(test_id.shape[0])
    ]
    # NOTE(review): setdefault MUTATES the shared global dicts across rows
    # (inserting 0 for unseen keys); values match .get(key, 0) but the shared
    # dicts grow — confirm intentional.
    test_id['count_dict_oob'] = test_id.apply(lambda x: {
        key: x['count_dict_oob'].setdefault(key, 0)
        for key in x['count_dict_person'].keys()
    }, axis=1)
    test_id['fraud_dict_oob'] = test_id.apply(lambda x: {
        key: x['fraud_dict_oob'].setdefault(key, 0)
        for key in x['count_dict_person'].keys()
    }, axis=1)
    # compute the per-person statistic from the oob dicts and persist it
    train_id[feature_name] = train_id.apply(
        lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    test_id[feature_name] = test_id.apply(
        lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    SaveFeature(train_id, test_id, feature_name)
    IsDifferentDistribution(feature_name)
return print(root.feature, root.theta, root.isleaf, root.label) PrintTree(root.left) PrintTree(root.right) if __name__ == "__main__": if len(sys.argv) != 3: print("Format: python CARTDecisionTree.py TrainData TestData") exit(0) # Read in data TrainFile, TestFile = sys.argv[1:3] TrainData = ReadData(TrainFile) TestData = ReadData(TestFile) # Train CART decision tree CARTDTree = CART(TrainData) # Predict train data Prediction = TreePrediction(CARTDTree, TrainData) Ein = CalculateError(TrainData, Prediction) print("Ein:", Ein) # Predict test data Prediction = TreePrediction(CARTDTree, TestData) Eout = CalculateError(TestData, Prediction) print("Eout:", Eout)
import os
import numpy as np
import pandas as pd
import settings
from utils import ReadData, Plot, Enhance, RunLengthEncoder, ImageSegment
from keras.callbacks import EarlyStopping, ModelCheckpoint
from skimage.transform import resize
from skimage.util import random_noise
import sys

# Filename used for the trained model checkpoint.
model_name = 'model-dsbowl-2018.h5'

# Initialize the project-wide settings module (paths, constants, ...).
settings.init()

# Load the train and test sets via the project's ReadData helper;
# train_data()/test_data() populate the attributes read below.
data = ReadData()
data.train_data()
data.test_data()

# get train and test data
X_train = data.X_train
Y_train = data.Y_train
X_test = data.X_test
test_sizes = data.test_sizes

# enhance train data (augmentation) — the augmented set replaces the
# original X_train/Y_train for the rest of the script
enhanced = Enhance(X_train, Y_train)
enhanced.enhance()
X_train = enhanced.X_out
Y_train = enhanced.Y_out