import os
import time
from collections import Counter
from functools import partial

import joblib
import numpy as np
import pandas as pd
from hyperopt import fmin, hp, tpe
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

import log   # project-local logging helper
import tool  # project-local path utilities


def TRAINXGB(X, Y, argsDict, name, save_path):
    """Cross-validate, then fit the final XGBoost model on all data and dump it."""
    logger = log.init_log()
    max_depth = argsDict["max_depth"]
    n_estimators = argsDict["n_estimators"]
    learning_rate = argsDict["learning_rate"]
    subsample = argsDict["subsample"]
    min_child_weight = argsDict["min_child_weight"]
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    colsample_bytree = argsDict["colsample_bytree"]
    gbm = XGBClassifier(
        tree_method='gpu_hist',
        max_bin=800,
        objective="binary:logistic",
        n_jobs=16,
        max_depth=max_depth,                # maximum tree depth
        n_estimators=n_estimators,          # number of trees
        learning_rate=learning_rate,        # learning rate
        subsample=subsample,                # row subsampling ratio
        min_child_weight=min_child_weight,  # minimum sum of instance weight per child
        max_delta_step=10,                  # cap on each leaf's weight update
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        colsample_bytree=colsample_bytree,
    )
    # random_state requires shuffle=True in recent scikit-learn versions
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()
    logger.info(f"SNP: {name} final XGB score: {metric}")
    print(f"SNP: {name} final XGB score: {metric}")
    gbm.fit(X, Y)
    # encode the CV AUC (5 decimal places) into the file suffix
    tail = '.' + str(int(round(metric, 5) * 100000))
    joblib.dump(gbm, save_path + tail)
def gridsearch(X, Y, model, _file, save_path, n_splits=5):
    """Evaluate a Keras model with stratified CV, then retrain on all data and save."""
    logger = log.init_log()
    ret = []
    skf = StratifiedKFold(n_splits=n_splits)
    # note: the same model instance carries its weights across folds,
    # so later fold scores are optimistic rather than independent
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index, :], X[test_index, :]
        Y_train, Y_test = Y[train_index], Y[test_index]
        model.fit(X_train, Y_train, epochs=30, batch_size=64)
        score = roc_auc_score(Y_test, model.predict(X_test))
        print(model.predict(X_test))
        ret.append(score)
        logger.info(_file + ": " + str(score))
        print(_file + ": " + str(score))
    score = np.mean(ret)
    logger.info(_file + " mean AUC: " + str(score))
    logger.info("training full model " + str(score))
    print(_file + " mean AUC: " + str(score))
    model.fit(X, Y, epochs=30, batch_size=64)
    model.save(save_path)
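# create_model() is called by the script below but is not defined in this file.
# The following is a minimal hypothetical sketch, assuming a small Keras binary
# classifier over the global `input_dim` set in the loop below; the real
# architecture may differ.
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


def create_model():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),  # assumed hidden layer
        Dense(1, activation='sigmoid'),                          # binary output
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model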
def LGB(argsDict):
    """Hyperopt objective for LightGBM; returns negative mean CV AUC."""
    # hyperopt passes raw draws; rescale them into the intended ranges
    num_leaves = argsDict["num_leaves"] + 25
    max_depth = argsDict["max_depth"]
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    n_estimators = argsDict['n_estimators'] * 10 + 50
    min_child_weight = argsDict['min_child_weight']
    min_child_samples = argsDict['min_child_samples'] + 18
    subsample = argsDict["subsample"] * 0.1 + 0.7
    colsample_bytree = argsDict["colsample_bytree"]
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    path = argsDict['path']
    data = np.load(path)
    data = data.astype('float32')
    data[data == 2] = 0.5  # recode genotype 2 as 0.5
    X, Y = data[:, :-1], data[:, -1]
    _, rsid, _, _ = tool.splitPath(path)
    gbm = LGBMClassifier(device='gpu', gpu_platform_id=0, gpu_device_id=0,
                         max_bin=255,
                         num_leaves=num_leaves,
                         max_depth=max_depth,
                         learning_rate=learning_rate,
                         n_estimators=n_estimators,
                         min_child_weight=min_child_weight,
                         min_child_samples=min_child_samples,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         reg_alpha=reg_alpha,
                         reg_lambda=reg_lambda,
                         n_jobs=1)
    kfold = StratifiedKFold(n_splits=5)
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()
    logger = log.init_log()
    logger.info(f"{rsid} training score: {metric}")
    print(f"{rsid} training score: {metric}")
    # hyperopt minimizes, so return the negative AUC
    return -metric
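# The LGB objective above is not invoked anywhere in this file. A hypothetical
# search mirroring pipeline() below might look like this; the hp.choice lists
# and draw widths are assumptions, only the rescaling offsets come from LGB():
def search_lgb(path, max_evals=50):
    space = {
        "num_leaves": hp.randint("num_leaves", 10),                 # LGB() adds 25
        "max_depth": hp.randint("max_depth", 15),
        "learning_rate": hp.uniform("learning_rate", 0.001, 2),     # LGB() rescales
        "n_estimators": hp.randint("n_estimators", 5),              # LGB() rescales to 50-90
        "min_child_weight": hp.uniform("min_child_weight", 0, 5),
        "min_child_samples": hp.randint("min_child_samples", 10),   # LGB() adds 18
        "subsample": hp.randint("subsample", 4),                    # LGB() rescales to 0.7-1.0
        "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.8, 0.9, 1.0]),
        "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-3, 0.1, 1]),
        "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-3, 0.1, 1, 10]),
        "path": hp.choice("path", [path]),
    }
    return fmin(LGB, space, algo=partial(tpe.suggest, n_startup_jobs=1),
                max_evals=max_evals)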
def XGB(argsDict):
    """Hyperopt objective for XGBoost; returns negative mean CV AUC."""
    # hyperopt passes raw draws; rescale them into the intended ranges
    max_depth = argsDict["max_depth"] + 1
    n_estimators = argsDict['n_estimators'] * 10 + 50
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.7
    min_child_weight = argsDict["min_child_weight"] + 1
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    colsample_bytree = argsDict["colsample_bytree"]
    path = argsDict['path']
    data = np.load(path)
    data = data.astype('float32')
    data[data == 2] = 0.5  # recode genotype 2 as 0.5
    X, Y = data[:, :-1], data[:, -1]
    _, rsid, _, _ = tool.splitPath(path)
    gbm = XGBClassifier(
        tree_method='gpu_hist',
        max_bin=255,
        objective="binary:logistic",
        max_depth=max_depth,                # maximum tree depth
        n_estimators=n_estimators,          # number of trees
        learning_rate=learning_rate,        # learning rate
        subsample=subsample,                # row subsampling ratio
        min_child_weight=min_child_weight,  # minimum sum of instance weight per child
        max_delta_step=10,                  # cap on each leaf's weight update
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        colsample_bytree=colsample_bytree,
    )
    # random_state requires shuffle=True in recent scikit-learn versions
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()
    logger = log.init_log()
    logger.info(f"{rsid} XGB training score: {metric}")
    print(f"{rsid} XGB training score: {metric}")
    # hyperopt minimizes, so return the negative AUC
    return -metric
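# RECOVERXGB is called by pipeline() below but is not defined in this file.
# A minimal sketch, assuming it simply inverts fmin()'s raw output: hp.randint
# and hp.uniform draws get the same rescaling applied inside XGB(), and
# hp.choice indices are resolved against the lists in pipeline()'s space.
REG_ALPHA_CHOICES = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]
REG_LAMBDA_CHOICES = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]
COLSAMPLE_CHOICES = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]


def RECOVERXGB(best):
    return {
        "max_depth": best["max_depth"] + 1,
        "n_estimators": best["n_estimators"] * 10 + 50,
        "learning_rate": best["learning_rate"] * 0.02 + 0.05,
        "subsample": best["subsample"] * 0.1 + 0.7,
        "min_child_weight": best["min_child_weight"] + 1,
        "reg_alpha": REG_ALPHA_CHOICES[best["reg_alpha"]],
        "reg_lambda": REG_LAMBDA_CHOICES[best["reg_lambda"]],
        "colsample_bytree": COLSAMPLE_CHOICES[best["colsample_bytree"]],
    }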
path = '/root/jiapeiling/v2-v2data/'
save_path = '/root/jiapeiling/v2-v2model/'
files = [path + each for each in os.listdir(path) if '.csv' in each]
log.alter_log_ini()
logger = log.init_log()
for _file in files:
    print(_file)
    file_name = _file.split('/')[-1].split('.')[0]
    df = pd.read_csv(_file)
    print(df.shape)
    X, Y = df.iloc[:, 1:-1].values, df.iloc[:, -1].values
    input_dim = X.shape[1]
    print(X.shape[1])
    start = time.time()
    model = create_model()
    gridsearch(X, Y, model, _file, save_path + file_name + '.h5')
    end = time.time()
    logger.info('elapsed time: ' + str(end - start))
def pipeline(path):
    """Full hyperopt search plus final XGBoost training for one SNP's .npy matrix."""
    logger = log.init_log()
    max_evals = 50
    _, name, _, _ = tool.splitPath(path)
    logger.info(f'XGB training started for SNP: {name}')
    print(f'XGB training started for SNP: {name}')
    data = np.load(path)
    try:
        data = data.astype('float32')
        data[data == 2] = 0.5  # mirror the recoding in XGB() so the final fit sees the same encoding
        X, Y = data[:, :-1], data[:, -1]
    except Exception:
        logger.info(f'SNP: {name} file read error')
        print(f'SNP: {name} file read error')
        return 0
    if len(np.unique(Y)) == 1:
        logger.info(f'SNP: {name} has only one class label')
        print(f'SNP: {name} has only one class label')
        return 0
    # skip SNPs whose minority class is under 1% of the majority class
    tmp = dict(Counter(Y.tolist()))
    ma, mi = max(tmp.values()), min(tmp.values())
    if mi / ma < 0.01:
        logger.info(f'SNP: {name} is a low-frequency SNP')
        print(f'SNP: {name} is a low-frequency SNP')
        return 0
    space = {
        "max_depth": hp.randint("max_depth", 15),                 # raw draw in [0, 15); XGB() adds 1
        "n_estimators": hp.randint("n_estimators", 5),            # XGB() rescales to 50-90 trees
        "learning_rate": hp.uniform("learning_rate", 0.001, 2),   # uniform draw; XGB() rescales
        "min_child_weight": hp.randint("min_child_weight", 5),
        "subsample": hp.randint("subsample", 4),
        "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]),
        "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
        "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]),
        "path": hp.choice('path', [path]),
    }
    start = time.time()
    algo = partial(tpe.suggest, n_startup_jobs=1)  # TPE optimization algorithm
    # max_evals is the maximum number of models to evaluate;
    # larger values explore more of the search space
    best = fmin(XGB, space, algo=algo, max_evals=max_evals)
    best = RECOVERXGB(best)  # map raw hyperopt draws back to concrete parameter values
    print(best)
    TRAINXGB(X, Y, best, name, save_path + name + '.xgb')
    end = time.time()
    times = end - start
    logger.info(f'SNP: {name} XGB elapsed time: {times}')
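# Hypothetical driver (not in the original): run pipeline() over every .npy
# matrix in the data directory, assuming the same directory layout and `path`
# global as the Keras script above.
if __name__ == '__main__':
    npy_files = [path + each for each in os.listdir(path) if each.endswith('.npy')]
    for npy_file in npy_files:
        pipeline(npy_file)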