def XGB_over_sample(data):
    """Train and score XGBoost on an over-sampled training split.

    Reads features and labels from ``data``, splits them into train/test,
    over-samples only the training part, predicts on the untouched test part
    through ``ModelXGB`` and writes the score table to
    ``data/score/xgb_oversample.csv``.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    # Data before any sampling.
    X, y = DataPreprocessing.read_X_y(data)
    print('原始数据:')
    DataTools.print_data_ratio(y)

    X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
    print('分割后的测试集:')
    DataTools.print_data_ratio(y_test)
    print('分割后的训练集:')
    DataTools.print_data_ratio(y_train)

    # Over-sample the training split only; the test split is left as-is.
    X_os, y_os = OverSample.over_sample_own(X_train, y_train)
    print('上采样后的训练集:')
    DataTools.print_data_ratio(y_os)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    y_predict = ModelXGB.xgb_predict(X_os, y_os, X_test)
    y_predict_prob = ModelXGB.xgb_predict_prob(X_os, y_os, X_test)
    cost_time = time.perf_counter() - start_time

    result = DataTools.compute_score_list(y_test, y_predict, y_predict_prob,
                                          cost_time)
    pd.DataFrame(result).to_csv('data/score/xgb_oversample.csv')
    print('结果已保存至score文件夹下 ^_^')
def smoteEE_own(X, y):
    """Build one balanced subset by under-sampling, then enlarge class 1 with SMOTE.

    All rows with ``Class == 1`` are kept and an equal number of rows with
    ``Class == 0`` is drawn at random without replacement. SMOTE is then
    asked for ``ceil(1.5 * n_class_1)`` samples of class 1.

    Args:
        X: Feature DataFrame sharing its index with ``y``.
        y: Label DataFrame with a ``Class`` column.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` of DataFrames; the feature frame
        carries the column names of ``X`` and the label frame a single
        ``Class`` column.
    """
    fraud_indices = np.array(y[y.Class == 1].index)
    number_records_fraud = len(fraud_indices)
    normal_indices = y[y.Class == 0].index

    # Under-sample class 0 down to the size of class 1.
    random_normal_indices = np.array(
        np.random.choice(normal_indices, number_records_fraud, replace=False))

    # Index labels of every class-1 row plus the sampled class-0 rows.
    under_ee_indices = np.concatenate([fraud_indices, random_normal_indices])

    # .loc selects by index label (not by position), matching the labels
    # collected above.
    X_ee_sample = X.loc[under_ee_indices, :]
    y_ee_sample = y.loc[under_ee_indices, :]
    print('EE下采样后的训练集:')
    DataTools.print_data_ratio(y_ee_sample)

    sm = SMOTE(
        ratio={1: math.ceil(number_records_fraud * 1.5)},
        # random_state=0,
        kind='regular',
    )
    X_ee_smote_train_array, y_ee_smote_train_array = sm.fit_sample(
        X_ee_sample, y_ee_sample.values.ravel())

    # Reuse the input's own column names instead of a hard-coded V1..V28 +
    # normAmount list, so the labels always match the data that was resampled.
    X_ee_smote_train = pd.DataFrame(X_ee_smote_train_array,
                                    columns=list(X_ee_sample.columns))
    y_ee_smote_train = pd.DataFrame(y_ee_smote_train_array, columns=['Class'])
    return X_ee_smote_train, y_ee_smote_train
def LR_origin_data(data):
    """Run the logistic-regression pipeline on the un-resampled data.

    Splits the data, obtains the C parameter via
    ``ClassifierLR.c_param_scores``, predicts on the test split and plots
    the confusion matrix and the ROC curve.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    features, labels = DataPreprocessing.read_X_y(data)
    X_tr, X_te, y_tr, y_te = DataTools.data_split(features, labels)

    # Determine the C parameter on the training split.
    c_value = ClassifierLR.c_param_scores(X_tr, y_tr)

    # Fit with that C and predict the test split.
    predictions = ClassifierLR.fit_model_LR(X_tr, y_tr, X_te, c_value)

    # Confusion matrix: compute, then plot.
    PlotTools.plot_confusion_matrix(
        DataTools.compute_confusion_matrix(y_te, predictions),
        title='Confusion matrix')

    # ROC curve.
    ResultLR.LR_plot_ROC(X_tr, y_tr, X_te, y_te, c_value)
def LR_under_sample_test(data):
    """Run the logistic-regression pipeline on under-sampled data.

    The data is under-sampled first and split afterwards, so both the
    training and the test split come from the under-sampled set. Plots the
    confusion matrix, the ROC curve and the precision-recall curve.

    Args:
        data: Passed unchanged to ``UnderSample.under_sample_own``.
    """
    # NOTE(review): under_sample_own is called with a single argument here
    # but with (X_train, y_train) in XGB_under_sample - confirm which
    # signature is current.
    X_us, y_us = UnderSample.under_sample_own(data)
    X_tr, X_te, y_tr, y_te = DataTools.data_split(X_us, y_us)

    # Determine the C parameter on the training split.
    c_value = ClassifierLR.c_param_scores(X_tr, y_tr)

    # Fit with that C and predict the test split.
    predictions = ClassifierLR.fit_model_LR(X_tr, y_tr, X_te, c_value)

    # Confusion matrix: compute, then plot.
    confusion = DataTools.compute_confusion_matrix(y_te, predictions)
    PlotTools.plot_confusion_matrix(confusion, title='Confusion matrix')

    # ROC curve.
    ResultLR.LR_plot_ROC(X_tr, y_tr, X_te, y_te, c_value)

    # The threshold plot (ResultLR.LR_plot_threshold) is currently disabled.

    # Precision-recall curve.
    ResultLR.LR_plot_precision_recall(X_tr, y_tr, X_te, y_te, c_value)
def model_all_own(clf, X_train, y_train, X_test, y_test):
    """Fit one classifier, score it on the test split and time the run.

    Args:
        clf: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels; flattened with ``.values.ravel()`` before
            fitting.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Tuple ``(result, roc_pr)``. ``result`` maps ``recall``, ``acc``,
        ``precision``, ``f1``, ``f5``, ``auc``, ``gmean`` and ``time`` to
        their values; ``roc_pr`` holds the ``fpr`` and ``tpr`` values
        returned by ``DataTools.compute_score``.
    """
    clf_name = clf.__class__.__name__
    print('*******************************************')
    print(clf_name, '开始fit...')

    # The timed section covers fitting plus both prediction calls.
    began = time()
    clf.fit(X_train, y_train.values.ravel())
    predicted = clf.predict(X_test)
    predicted_proba = clf.predict_proba(X_test)
    elapsed = time() - began

    (recall_, accuracy_, precision_, f1_, f5_, auc_, g_mean_,
     fpr_, tpr_) = DataTools.compute_score(y_test, predicted, predicted_proba)

    result = {
        'recall': recall_,
        'acc': accuracy_,
        'precision': precision_,
        'f1': f1_,
        'f5': f5_,
        'auc': auc_,
        'gmean': g_mean_,
        'time': elapsed,
    }
    roc_pr = {'fpr': fpr_, 'tpr': tpr_}

    print("{} 训练结束,耗时: {:.4f} ".format(clf_name, elapsed))
    return result, roc_pr
def ee_own(X, y):
    """Draw one under-sampled subset of the training data.

    Keeps every row with ``Class == 1`` and adds ``int(1.05 * n_class_1)``
    rows with ``Class == 0`` chosen at random without replacement.

    Args:
        X: Feature DataFrame sharing its index with ``y``.
        y: Label DataFrame with a ``Class`` column.

    Returns:
        Tuple ``(X_subset, y_subset)`` selected by index label.
    """
    is_fraud = y.Class == 1
    fraud_idx = np.array(y[is_fraud].index)
    fraud_count = len(y[is_fraud])
    normal_idx = y[y.Class == 0].index

    # Under-sample class 0 to slightly more rows than class 1.
    sampled_normal_idx = np.array(
        np.random.choice(normal_idx, int(fraud_count * 1.05), replace=False))

    # Index labels of every class-1 row followed by the sampled class-0 rows.
    chosen = np.concatenate([fraud_idx, sampled_normal_idx])

    # .loc selects by index label, matching the labels collected above.
    X_ee_sample = X.loc[chosen, :]
    y_ee_sample = y.loc[chosen, :]

    print('EE采样后的训练集:')
    DataTools.print_data_ratio(y_ee_sample)
    return X_ee_sample, y_ee_sample
def XGB_under_sample(data):
    """Train and score XGBoost on an under-sampled training split.

    Splits the data, under-samples only the training part, predicts on the
    untouched test part, writes the score table to
    ``data/score2/xgb_us2.csv`` and plots the confusion matrix and ROC curve.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    # Data before any sampling.
    X, y = DataPreprocessing.read_X_y(data)
    print('原始数据:')
    DataTools.print_data_ratio(y)

    X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
    print('分割后的训练集:')
    DataTools.print_data_ratio(y_train)

    # Under-sample the training split only.
    X_under_sample_train, y_under_sample_train = UnderSample.under_sample_own(
        X_train, y_train)
    print('下采样后的训练集:')
    DataTools.print_data_ratio(y_under_sample_train)

    # Parameter search (ModelXGB.xgb_cv_param / ModelXGB.xgb_gridSearchCV)
    # is currently disabled.

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    y_predict = ModelXGB.xgb_predict(X_under_sample_train,
                                     y_under_sample_train, X_test)
    y_predict_prob = ModelXGB.xgb_predict_prob(X_under_sample_train,
                                               y_under_sample_train, X_test)
    cost_time = time.perf_counter() - start_time

    result = DataTools.compute_score_list(y_test, y_predict, y_predict_prob,
                                          cost_time)
    pd.DataFrame(result).to_csv('data/score2/xgb_us2.csv')
    print('结果已保存至score文件夹下 ^_^')

    # Confusion matrix: compute, then plot.
    cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
    PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')

    # ROC curve from column 1 of the predicted probabilities.
    PlotTools.plot_roc_curve(y_test, y_predict_prob[:, 1])
def plot_thresholds(y_true, y_pred_proba):
    """Plot a 3x3 grid of confusion matrices for thresholds 0.1 to 0.9.

    For each threshold a sample is predicted positive when its probability
    in column 1 of ``y_pred_proba`` is strictly greater than the threshold.

    Args:
        y_true: True labels.
        y_pred_proba: 2-D array of predicted probabilities; column 1 is
            compared against each threshold.
    """
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    class_names = [0, 1]

    plt.figure(figsize=(10, 10))
    # Subplot positions are 1-based, hence start=1.
    for position, threshold in enumerate(thresholds, start=1):
        y_predictions_high_recall = y_pred_proba[:, 1] > threshold
        plt.subplot(3, 3, position)

        cnf_matrix = DataTools.compute_confusion_matrix(
            y_true, y_predictions_high_recall)
        # The title now says '>' to match the strict comparison above
        # (it previously claimed '>=').
        PlotTools.plot_confusion_matrix(cnf_matrix,
                                        classes=class_names,
                                        title='Threshold > %s' % threshold)
def LR_EE_smote(data):
    """Average logistic-regression scores over several EE+SMOTE subsets.

    Draws ``num_subsets`` resampled training subsets with
    ``SmoteEE.smoteEE_own``, exports each one to ``data/subsets/``, fits a
    logistic regression with C=0.1 on it, scores it on the shared test split
    and writes the averaged scores to ``data/score/lr_ee_tuned.csv``.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    # Number of subsets (one classifier is trained per subset).
    num_subsets = 10

    X, y = DataPreprocessing.read_X_y(data)
    print('原始数据:')
    DataTools.print_data_ratio(y)

    X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
    print('分割后的测试集:')
    DataTools.print_data_ratio(y_test)
    print('分割后的训练集:')
    DataTools.print_data_ratio(y_train)

    result = {}
    result_recall = []
    result_acc = []
    result_precision = []
    result_f1 = []
    result_auc = []
    result_gmean = []
    result_fpr_temps = []
    result_tpr_temps = []

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    for i in range(num_subsets):
        print(
            '******************************************************************************'
        )
        print('第 ', i + 1, ' 个分类器开始:')

        # Training subset after EE under-sampling plus SMOTE.
        X_ee_smote, y_ee_smote = SmoteEE.smoteEE_own(X_train, y_train)
        DataTools.print_data_ratio(y_ee_smote)

        # Parameter search (ClassifierLR.lr_grid_search_cv) is disabled.

        pd.concat([X_ee_smote, y_ee_smote],
                  axis=1).to_csv('data/subsets/lr_subset%d.csv' % (i + 1))
        print('第%d个 子集导出成功!' % (i + 1))

        print('训练集子集%d:' % (i + 1))
        DataTools.print_data_ratio(y_ee_smote)

        y_predict = ClassifierLR.fit_model_LR(X_ee_smote, y_ee_smote, X_test,
                                              0.1)
        y_predict_prob = ClassifierLR.lr_predict_proba(
            X_ee_smote, y_ee_smote, X_test, 0.1)

        # Other callers in this file unpack nine values from compute_score
        # (an extra f5 between f1 and auc). The starred name absorbs that
        # optional value so this works with either return shape.
        # NOTE(review): confirm the current compute_score signature.
        recall_, accuracy_, precision_, f1_, *_extra, auc_, g_mean_, fpr_, tpr_ = \
            DataTools.compute_score(y_test, y_predict, y_predict_prob)

        result_recall.append(recall_)
        result_acc.append(accuracy_)
        result_precision.append(precision_)
        result_f1.append(f1_)
        result_auc.append(auc_)
        result_gmean.append(g_mean_)
        result_fpr_temps.append(fpr_)
        result_tpr_temps.append(tpr_)
    end_time = time.perf_counter()

    # Scalar metrics are averaged across subsets; fpr/tpr are averaged
    # position-wise via a DataFrame with one row per subset.
    result['time'] = end_time - start_time
    result['recall'] = np.mean(result_recall)
    result['acc'] = np.mean(result_acc)
    result['precision'] = np.mean(result_precision)
    result['f1'] = np.mean(result_f1)
    result['auc'] = np.mean(result_auc)
    result['gmean'] = np.mean(result_gmean)
    result['fpr'] = pd.DataFrame(result_fpr_temps).mean()
    result['tpr'] = pd.DataFrame(result_tpr_temps).mean()

    pd.DataFrame(result).to_csv('data/score/lr_ee_tuned.csv')
    print('结果已保存至score文件夹下 ^_^')
def XGB_EE(data):
    """Average XGBoost scores over several EE under-sampled subsets.

    Draws ``num_subsets`` under-sampled training subsets with ``EE.ee_own``,
    exports each one to ``data/subsets/``, trains/predicts through
    ``ModelXGB``, scores on the shared test split and writes the averaged
    scores to ``data/score2/xgb_ee3.csv``. The confusion matrix and ROC
    curve plotted at the end use the predictions of the last subset only.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    # Number of subsets (one classifier is trained per subset).
    num_subsets = 5

    X, y = DataPreprocessing.read_X_y(data)
    X_train_tmp, X_test, y_train_tmp, y_test = DataTools.data_split(X, y)
    # The second split still shrinks the training set; its validation part
    # is not used in this function.
    X_train, _X_validate, y_train, _y_validate = DataTools.data_split(
        X_train_tmp, y_train_tmp)

    result = {}
    result_recall = []
    result_acc = []
    result_precision = []
    result_f1 = []
    result_f5 = []
    result_auc = []
    result_gmean = []
    result_fpr_temps = []
    result_tpr_temps = []

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    for i in range(num_subsets):
        print(
            '******************************************************************************'
        )
        print('第 ', i + 1, ' 个分类器开始:')

        # Training subset after EE under-sampling.
        X_ee, y_ee = EE.ee_own(X_train, y_train)

        pd.concat([X_ee, y_ee],
                  axis=1).to_csv('data/subsets/subset_ee%d.csv' % (i + 1))
        print('第%d个 子集导出成功!' % (i + 1))

        print('训练集子集%d:' % (i + 1))
        DataTools.print_data_ratio(y_ee)

        # Parameter search (ModelXGB.xgb_cv_param / xgb_gridSearchCV) is
        # currently disabled.

        y_predict = ModelXGB.xgb_predict(X_ee, y_ee, X_test)
        y_predict_prob = ModelXGB.xgb_predict_prob(X_ee, y_ee, X_test)

        recall_, accuracy_, precision_, f1_, f5_, auc_, g_mean_, fpr_, tpr_ = \
            DataTools.compute_score(y_test, y_predict, y_predict_prob)

        result_recall.append(recall_)
        result_acc.append(accuracy_)
        result_precision.append(precision_)
        result_f1.append(f1_)
        result_f5.append(f5_)
        result_auc.append(auc_)
        result_gmean.append(g_mean_)
        result_fpr_temps.append(fpr_)
        result_tpr_temps.append(tpr_)
    end_time = time.perf_counter()

    # Scalar metrics are averaged across subsets; fpr/tpr are averaged
    # position-wise via a DataFrame with one row per subset.
    result['time'] = end_time - start_time
    result['recall'] = np.mean(result_recall)
    result['acc'] = np.mean(result_acc)
    result['precision'] = np.mean(result_precision)
    result['f1'] = np.mean(result_f1)
    result['f5'] = np.mean(result_f5)
    result['auc'] = np.mean(result_auc)
    result['gmean'] = np.mean(result_gmean)
    result['fpr'] = pd.DataFrame(result_fpr_temps).mean()
    result['tpr'] = pd.DataFrame(result_tpr_temps).mean()

    pd.DataFrame(result).to_csv('data/score2/xgb_ee3.csv')
    print('结果已保存至score文件夹下 ^_^')

    # Confusion matrix and ROC curve for the last subset's predictions.
    cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
    PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')
    PlotTools.plot_roc_curve(y_test, y_predict_prob[:, 1])
def XGB_smote(data):
    """Train and score XGBoost on a SMOTE-resampled training split.

    Splits the data, applies ``SmoteOrigin.smote_own`` to the training part
    only, predicts on the untouched test part, writes the score table to
    ``data/score/xgb_smote.csv`` and plots the confusion matrix.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    # Data before any sampling.
    X, y = DataPreprocessing.read_X_y(data)
    print('原始数据:')
    DataTools.print_data_ratio(y)

    X_train, X_test, y_train, y_test = DataTools.data_split(X, y)
    print('分割后的测试集:')
    DataTools.print_data_ratio(y_test)
    print('分割后的训练集:')
    DataTools.print_data_ratio(y_train)

    # Training split after SMOTE.
    X_smote, y_smote = SmoteOrigin.smote_own(X_train, y_train)
    print('SMOTE后的训练集:')
    DataTools.print_data_ratio(y_smote)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    y_predict = ModelXGB.xgb_predict(X_smote, y_smote, X_test)
    y_predict_prob = ModelXGB.xgb_predict_prob(X_smote, y_smote, X_test)
    cost_time = time.perf_counter() - start_time

    result = DataTools.compute_score_list(y_test, y_predict, y_predict_prob,
                                          cost_time)
    pd.DataFrame(result).to_csv('data/score/xgb_smote.csv')
    print('结果已保存至score文件夹下 ^_^')

    # Confusion matrix: compute, then plot.
    cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
    PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')
def XGB_origin(data):
    """Train and score XGBoost on the un-resampled training split.

    Splits the data twice (test split first, then a validation split carved
    out of the remaining training data), predicts on the test split, writes
    the score table to ``data/score2/xgb_origin.csv`` and plots the
    confusion matrix and ROC curve.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    X, y = DataPreprocessing.read_X_y(data)
    print('原始数据:')
    DataTools.print_data_ratio(y)

    X_train_temp, X_test, y_train_temp, y_test = DataTools.data_split(X, y)
    print('分割后的测试集:')
    DataTools.print_data_ratio(y_test)
    print('分割后的训练集temp:')
    DataTools.print_data_ratio(y_train_temp)

    # The validation features are not used below; only the validation label
    # ratio is printed.
    X_train, _X_validate, y_train, y_validate = DataTools.data_split(
        X_train_temp, y_train_temp)
    print('分割后的验证集:')
    DataTools.print_data_ratio(y_validate)
    print('分割后的训练集:')
    DataTools.print_data_ratio(y_train)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    y_predict = ModelXGB.xgb_predict(X_train, y_train, X_test)
    y_predict_prob = ModelXGB.xgb_predict_prob(X_train, y_train, X_test)
    cost_time = time.perf_counter() - start_time

    result = DataTools.compute_score_list(y_test, y_predict, y_predict_prob,
                                          cost_time)
    pd.DataFrame(result).to_csv('data/score2/xgb_origin.csv')
    print('结果已保存至score文件夹下 ^_^')

    # Confusion matrix: compute, then plot.
    cnf_matrix = DataTools.compute_confusion_matrix(y_test, y_predict)
    PlotTools.plot_confusion_matrix(cnf_matrix, title='Confusion matrix')

    # ROC curve from column 1 of the predicted probabilities.
    PlotTools.plot_roc_curve(y_test, y_predict_prob[:, 1])
def result_all_model(data):
    """Compare several classifiers trained on an over-sampled training split.

    Splits the data twice (test split first, then a validation split carved
    out of the remaining training data), over-samples the training part,
    fits each classifier through ``ModelAll.model_all_own``, plots one ROC
    curve per classifier and writes all scores to
    ``data/score3/all_models_os.csv``.

    Args:
        data: Passed unchanged to ``DataPreprocessing.read_X_y``.
    """
    X, y = DataPreprocessing.read_X_y(data)
    print('原始数据:')
    DataTools.print_data_ratio(y)

    X_train_temp, X_test, y_train_temp, y_test = DataTools.data_split(X, y)
    print('分割后的测试集:')
    DataTools.print_data_ratio(y_test)
    print('分割后的训练集temp:')
    DataTools.print_data_ratio(y_train_temp)

    # The validation features are not used below; only the validation label
    # ratio is printed.
    X_train, _X_validate, y_train, y_validate = DataTools.data_split(
        X_train_temp, y_train_temp)
    print('分割后的验证集:')
    DataTools.print_data_ratio(y_validate)
    # Label corrected: this ratio describes the training split (it was
    # previously announced as the test set).
    print('分割后的训练集:')
    DataTools.print_data_ratio(y_train)

    # Alternative resampling (SMOTE / under-sampling) and the grid searches
    # are currently disabled; over-sampling is the active strategy.
    X_resampled, y_resampled = OverSample.over_sample_own(X_train, y_train)
    # Label corrected: the over-sampled data is the training split.
    print('上采样后的训练集:')
    DataTools.print_data_ratio(y_resampled)

    # All classifiers use their default hyper-parameters. The list order is
    # the order in which they are fitted and their ROC curves plotted.
    classifiers = [
        DecisionTreeClassifier(),
        LogisticRegression(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        XGBClassifier(),
        RandomForestClassifier(),
    ]

    results = {}
    for clf in classifiers:
        clf_name = clf.__class__.__name__
        results[clf_name], roc_list = ModelAll.model_all_own(
            clf, X_resampled, y_resampled, X_test, y_test)
        # ROC curve for this classifier.
        PlotTools.plot_roc_curve2(roc_list['fpr'], roc_list['tpr'], clf_name)

    # (class name, row label) pairs in the row order of the exported CSV.
    row_labels = [
        ('LogisticRegression', 'LR'),
        ('DecisionTreeClassifier', 'DT'),
        ('AdaBoostClassifier', 'ADA'),
        ('GradientBoostingClassifier', 'GBDT'),
        ('RandomForestClassifier', 'RF'),
        ('XGBClassifier', 'XGB'),
    ]
    all_pd = pd.concat([
        pd.DataFrame(results[name], index=[label])
        for name, label in row_labels
    ])
    all_pd.to_csv('data/score3/all_models_os.csv')
    print('结果已保存至score文件夹下 ^_^')