def f_trainModelMain(train_path, test_path, index_name, target_name, userPath, modeltype, arithmetic):
    '''
    Main entry for invoking the models.
    :param train_path: training set, str path or DataFrame
    :param test_path: test set, str path or DataFrame
    :param index_name: index column name
    :param target_name: target variable column name
    :param userPath: user path
    :param modeltype: short model code, 12 possible values, combinations of [ccxboost, ccxgbm, ccxrf] and [demo, speed, accuracy, stable]
    :param arithmetic: one of [Xgboost, GBM, RF]
    :return: file paths of the best model's results
    '''
    modelCode = f_genmodelCodeDict(userPath)
    modelmain = ModelMain(train_path, test_path, index_name, target_name)
    if arithmetic == 'Xgboost':
        return f_xgboost(modelmain, modeltype, modelCode)
    elif arithmetic == 'GBM':
        return f_gbm(modelmain, modeltype, modelCode)
    elif arithmetic == 'RF':
        return f_rf(modelmain, modeltype, modelCode)
    else:
        # write to log
        print('Error code 003: the model failed to run')
        return []
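# Design note (editor sketch, not part of the original project code): the if/elif chain
# above could equivalently be expressed as a dict dispatch over the same helpers.
# f_trainModelMain_via_dict and _ARITHMETIC_DISPATCH are hypothetical names.
_ARITHMETIC_DISPATCH = {'Xgboost': f_xgboost, 'GBM': f_gbm, 'RF': f_rf}

def f_trainModelMain_via_dict(train_path, test_path, index_name, target_name, userPath, modeltype, arithmetic):
    # Same behaviour as f_trainModelMain above, written as a table lookup.
    modelCode = f_genmodelCodeDict(userPath)
    modelmain = ModelMain(train_path, test_path, index_name, target_name)
    runner = _ARITHMETIC_DISPATCH.get(arithmetic)
    if runner is None:
        print('Error code 003: the model failed to run')
        return []
    return runner(modelmain, modeltype, modelCode)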
def f_trainModelMain(train_path, test_path, index_name, target_name, userPath, modeltype, arithmetic,
                     optimizationType, is_auto, param):
    '''
    Main entry for invoking the models.
    :param train_path: training set, str path or DataFrame
    :param test_path: test set, str path or DataFrame
    :param index_name: index column name
    :param target_name: target variable column name
    :param userPath: user path
    :param modeltype: short model code, 24 possible values, combinations of [grid, bayes], [ccxboost, ccxgbm, ccxrf] and [demo, speed, accuracy, stable]
    :param arithmetic: one of [Xgboost, GBM, RF]
    :param optimizationType: one of [grid, bayes]
    :param is_auto: whether hyperparameters are searched automatically
    :param param: hyperparameter ranges for the search
    :return: file paths of the best model's results
    '''
    modelCode = f_genmodelCodeDict(userPath)
    modelmain = ModelMain(train_path, test_path, index_name, target_name)
    # Entry for the parameter search: is_auto == 0 means manual; manual tuning only exists for the bayes mode
    if int(is_auto) == 0 and optimizationType == 'bayes':
        try:
            # From here on the parameter search runs over the user-supplied ranges
            manual_conf_path = create_conf(arithmetic, param, userPath)
            if arithmetic == 'Xgboost':
                return f_recursionboostModel(train_path, test_path, index_name, target_name,
                                             manual_conf_path, 0, optimizationType)
            elif arithmetic == 'GBM':
                return f_recursiongbmModel(train_path, test_path, index_name, target_name,
                                           manual_conf_path, 0, optimizationType)
            elif arithmetic == 'RF':
                return f_recursionrfModel(train_path, test_path, index_name, target_name,
                                          manual_conf_path, 0, optimizationType)
            else:
                # write to log
                print('Error code 005: the model failed to run')
                return []
        except Exception as e:
            print('Error code 004: manual parameter search failed:')
            print("ErrorMsg:\t" + str(e))
            return []

    if arithmetic == 'Xgboost':
        return f_xgboost(modelmain, modeltype, modelCode, optimizationType)
    elif arithmetic == 'GBM':
        return f_gbm(modelmain, modeltype, modelCode, optimizationType)
    elif arithmetic == 'RF':
        return f_rf(modelmain, modeltype, modelCode, optimizationType)
    else:
        # write to log
        print('Error code 003: the model failed to run')
        return []
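# Illustrative call (editor sketch): the paths, model code and output directory below are
# placeholders, not values taken from this project; only index_name and target_name match
# the demo data used later in this file.
result_paths = f_trainModelMain(
    train_path='demo_data/train.csv',    # placeholder path
    test_path='demo_data/test.csv',      # placeholder path
    index_name='lend_request_id',
    target_name='TargetBad_P12',
    userPath='./user_output',            # placeholder user path
    modeltype='ccxboost_speed',          # assumed code; real codes come from f_genmodelCodeDict
    arithmetic='Xgboost',
    optimizationType='bayes',
    is_auto=1,                           # 1 = automatic search; 0 = manual (bayes only)
    param=None,                          # hyperparameter ranges, used only when is_auto == 0
)
print(result_paths)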
def f_recursionrfModel(train_path, test_path, index_name, target_name, modelconf, i, optimizationType):
    '''
    Recursively feed the important variables of the previous round back in as input,
    so that variable selection is performed along the way.
    :param train_path: training set
    :param test_path: test set
    :param index_name:
    :param target_name:
    :param modelconf: path of the model configuration file
    :param i: recursion counter
    :return: list of result file paths of the final model output once recursion finishes
    '''
    train_path = ModelUtil.load_data(train_path)
    test_path = ModelUtil.load_data(test_path)
    initmodelmain = ModelMain(train_path, test_path, index_name, target_name)
    initpathlist = initmodelmain.ccxrf_main(modelconf, optimizationType)

    # 1. Number of important variables
    implen, impvar = f_getImplen(initpathlist[2])
    # 2. AUC and KS of the model
    train_auc, train_ks = f_getAucKs(initpathlist[3])
    test_auc, test_ks = f_getAucKs(initpathlist[4])
    # 3. Share of important variables among all model variables
    imppct = f_getVarpctrf(initpathlist[1], implen)  # variables entering the model == important variables

    flag = f_flag(train_auc, train_ks, test_auc, test_ks, imppct)
    i = i + 1
    if i < 5:
        if flag:
            print('Recursive call ' * 20)
            newselectcol = impvar + [index_name, target_name]
            print('--- Number of variables selected into the model: %s' % len(newselectcol))
            train_path = ModelUtil.load_data(train_path)[newselectcol]
            test_path = ModelUtil.load_data(test_path)[newselectcol]
            print('##' * 20, i, '##' * 20)
            # Possible later improvement: also update the configuration file modelconf while recursing
            return f_recursionrfModel(train_path, test_path, index_name, target_name, modelconf, i, optimizationType)
        else:
            print('Stopping criterion met, ending recursion ' * 10)
            return initpathlist
    else:
        print('Maximum number of recursions reached, ending recursion ' * 10)
        return initpathlist
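# f_getImplen, f_getAucKs, f_getVarpctrf and f_flag are project helpers not shown in this
# file. As an illustration of the kind of metric the stopping check relies on, here is a
# minimal, self-contained sketch (compute_auc_ks is an assumed name; y_true/y_score are a
# label array and a score array) computing AUC and the KS statistic with scikit-learn.
from sklearn.metrics import roc_auc_score, roc_curve

def compute_auc_ks(y_true, y_score):
    # AUC straight from sklearn; KS is the maximum vertical gap between TPR and FPR.
    auc = roc_auc_score(y_true, y_score)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    ks = float((tpr - fpr).max())
    return auc, ks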
    r'C:\Users\liyin\Desktop\CcxFpABSModel\updateModel\FPALLVAR_1103.csv')
train = pd.merge(train_index, FPALLVAR)
test = pd.merge(test_index, FPALLVAR)

######
train_path = r'C:\Users\liyin\Desktop\CcxFpABSModel\updateModel\train_1103.csv'  # training set data path; see the demo_data folder
test_path = r'C:\Users\liyin\Desktop\CcxFpABSModel\updateModel\test_1103.csv'  # test set data path; see the demo_data folder
# train.to_csv(train_path, index=False)
# test.to_csv(test_path, index=False)

############
from ccxmodel.modelmain import ModelMain

index_name = 'lend_request_id'  # unique index of the dataset; exactly one index column is supported, multiple indexes are not
target_name = 'TargetBad_P12'  # target variable

modelmain = ModelMain(train_path, test_path, index_name, target_name)
modelmain.ccxboost_main(
    r'C:\Users\liyin\Desktop\CcxFpABSModel\updateModel\ccxboost_1.conf')

r"""
{'colsample_bytree': 0.80000000000000004,
 'eta': 0.10000000000000001,
 'eval_metric': 'auc',
 'gamma': 2,
 'lambda': 500,
 'max_depth': 4,
 'max_test_auc': 0.71066600000000002,
 'max_train_auc': 0.75396439999999987,
 'min_child_weight': 2,
 'num_best_round': 144,
 'num_boost_round': 500,
 'num_maxtest_round': 494,
 'objective': 'binary:logistic',
 'subsample': 0.80000000000000004,
 'gap': 0.042999999999999997}
Model saved successfully. File path: C:\Users\liyin\Desktop\CcxFpABSModel\updateModel\model20171103214623/modeltxt/model_Fp_Ccx_All_2017-11-03.txt
Number of important variables: 145
Data saved successfully: C:\Users\liyin\Desktop\CcxFpABSModel\updateModel\model20171103214623/modeldata/d_2017-11-03_importance_var.csv
Training set model report:
             precision    recall  f1-score   support

        0.0       0.86      1.00      0.92     12354
        1.0       0.77      0.02      0.04      2068

avg / total       0.85      0.86      0.80     14422

Test set model report: