def putFile(self, file_path, remote_path, iscut=False):
    """Upload a local file to the remote host over SCP.

    Args:
        file_path: local file to upload.
        remote_path: remote destination; may be just a directory, in which
            case the local file name is appended.
        iscut: when True, delete the local file after a successful upload
            (move semantics instead of copy).
    """
    logger = None
    if self.log:
        logger = log.init_log(self.log_path)
    # BUGFIX: the original logged "file missing" but fell through and still
    # called scp.put() on the non-existent path, raising an unrelated error.
    # Bail out early instead.
    if not os.path.exists(file_path):
        print(f'{file_path} 文件不存在')
        if self.log:
            logger.info(f'{file_path} 文件不存在')
        return
    _, _, _, file_name = tool.splitPath(file_path)
    _, _, _, remote_name = tool.splitPath(remote_path)  # remote_path may be just a directory
    if remote_name != file_name:
        remote_path = tool.pathJoin(remote_path, file_name)
    scp = SCPClient(self.ssh.get_transport())
    try:
        scp.put(file_path, remote_path)
    finally:
        # Always release the SCP channel, even when the transfer fails.
        scp.close()
    if iscut:
        os.remove(file_path)
    print(f'{file_path} -----> {self.ip_name}{remote_path}')
    if self.log:
        logger.info(f'{file_path} -----> {self.ip_name}{remote_path}')
def cutAllFiles(srcfile, dstfile, key=None, isreplace=False):
    """Move every entry under srcfile into dstfile.

    Args:
        srcfile: source directory whose entries are moved.
        dstfile: destination directory.
        key: optional filename filter forwarded to tool.getFiles.
        isreplace: when True, an existing entry of the same name in
            dstfile is deleted before the move.
    """
    if key is None:
        files = tool.getFiles(srcfile)
    else:
        files = tool.getFiles(srcfile, key=key)
    for _file in files:
        _, _, _, name = tool.splitPath(_file)
        if isreplace:
            target = tool.pathJoin(dstfile, name)
            # BUGFIX: the original guarded the deletion with isExist(_file) —
            # the *source*, which trivially exists since it was just listed —
            # instead of the destination. Check the destination path before
            # removing it.
            if isExist(target):
                if isFile(_file):
                    delFile(target)
                else:
                    delDir(target)
        if isFile(_file):
            tool.cutFile(_file, dstfile)
        else:
            shutil.move(_file, dstfile + f'/{name}')
            print(f'copy {_file} -> dstfile/{name}')
def getFile(self, file_path, remote_path):
    """Download `file_path` from the remote host into local `remote_path`.

    When `remote_path` names a directory rather than a file, the source
    file name is appended to it before the transfer.
    """
    logger = log.init_log(self.log_path) if self.log else None
    _, _, _, file_name = tool.splitPath(file_path)
    _, _, _, remote_name = tool.splitPath(remote_path)  # may be just a directory
    if remote_name != file_name:
        remote_path = tool.pathJoin(remote_path, file_name)
    scp = SCPClient(self.ssh.get_transport())
    scp.get(file_path, remote_path)
    scp.close()
    message = f'{self.ip_name}{file_path} -----> {remote_path}'
    print(message)
    if logger is not None:
        logger.info(message)
def LGB(argsDict):
    """Hyperopt objective for LightGBM: negative mean 5-fold CV ROC-AUC.

    Args:
        argsDict: raw hyperopt sample; integer/uniform draws are decoded
            into the real hyper-parameter ranges below, plus 'path', the
            .npy training file whose last column is the label.

    Returns:
        Negated mean ROC-AUC (hyperopt minimizes).
    """
    # Decode hyperopt's raw draws into the actual hyper-parameter values.
    params = dict(
        num_leaves=argsDict["num_leaves"] + 25,
        max_depth=argsDict["max_depth"],
        learning_rate=argsDict["learning_rate"] * 0.02 + 0.05,
        n_estimators=argsDict['n_estimators'] * 10 + 50,
        min_child_weight=argsDict['min_child_weight'],
        min_child_samples=argsDict['min_child_samples'] + 18,
        subsample=argsDict["subsample"] * 0.1 + 0.7,
        colsample_bytree=argsDict["colsample_bytree"],
        reg_alpha=argsDict["reg_alpha"],
        reg_lambda=argsDict["reg_lambda"],
    )
    path = argsDict['path']
    data = np.load(path).astype('float32')
    # Recode value 2 as 0.5 — presumably a genotype encoding; TODO confirm.
    data[data == 2] = 0.5
    X, Y = data[:, :-1], data[:, -1]
    _, rsid, _, _ = tool.splitPath(path)
    gbm = LGBMClassifier(device='gpu', gpu_platform_id=0, gpu_device_id=0,
                         max_bin=255, n_jobs=1, **params)
    # kfold = StratifiedKFold(n_splits=5, random_state=42)
    kfold = StratifiedKFold(n_splits=5)
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()
    logger = log.init_log()
    logger.info(f"{rsid} 的训练得分为: {metric}")
    print(f"{rsid} 的训练得分为: {metric}")
    return -metric
def XGB(argsDict):
    """Hyperopt objective for XGBoost: negative mean 5-fold CV ROC-AUC.

    Args:
        argsDict: raw hyperopt sample; integer/uniform draws are decoded
            into the real hyper-parameter ranges below, plus 'path', the
            .npy training file whose last column is the label.

    Returns:
        Negated mean ROC-AUC (hyperopt minimizes).
    """
    # Decode hyperopt's raw draws into the actual hyper-parameter values.
    max_depth = argsDict["max_depth"] + 1
    n_estimators = argsDict['n_estimators'] * 10 + 50
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.7
    min_child_weight = argsDict["min_child_weight"] + 1
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    colsample_bytree = argsDict["colsample_bytree"]
    path = argsDict['path']
    data = np.load(path)
    data = data.astype('float32')
    # Recode value 2 as 0.5 — presumably a genotype encoding; TODO confirm.
    data[data == 2] = 0.5
    X, Y = data[:, :-1], data[:, -1]
    _, rsid, _, _ = tool.splitPath(path)
    gbm = XGBClassifier(
        tree_method='gpu_hist',
        max_bin=255,
        objective="binary:logistic",
        max_depth=max_depth,                # maximum tree depth
        n_estimators=n_estimators,          # number of trees
        learning_rate=learning_rate,        # learning rate
        subsample=subsample,                # row subsample ratio
        min_child_weight=min_child_weight,  # minimum child weight
        max_delta_step=10,                  # stop if no improvement for 10 steps
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        colsample_bytree=colsample_bytree,
    )
    # BUGFIX: StratifiedKFold(random_state=...) without shuffle=True raises
    # ValueError on scikit-learn >= 0.24 (the seed only applies when
    # shuffling). Enable shuffle so the fixed seed actually takes effect.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()
    logger = log.init_log()
    logger.info(f"{rsid} xgb的训练得分为: {metric}")
    print(f"{rsid} xgb的训练得分为: {metric}")
    return -metric
def pipeline(path):
    """Tune (hyperopt TPE), train and persist a LightGBM model for one site.

    Args:
        path: .npy file whose last column is the label; the file stem is
            used as the site (rsid) name.

    Returns:
        0 when the site is skipped (unreadable file, a single class label,
        or a minority/majority ratio below 1%); otherwise None after the
        model has been trained and saved.
    """
    max_evals = 30
    _, name, _, _ = tool.splitPath(path)
    logger.info(f'开始训练位点: {name}')
    print(f'开始训练位点: {name}')
    data = np.load(path)
    try:
        X, Y = data[:, :-1], data[:, -1]
    # BUGFIX: was a bare `except:` which also swallows SystemExit and
    # KeyboardInterrupt; catch only real errors.
    except Exception:
        logger.info(f'位点: {name} 文件读取错误')
        print(f'位点: {name} 文件读取错误')
        return 0
    if len(np.unique(Y)) == 1:
        logger.info(f'位点: {name} 只有一种类标签')
        print(f'位点: {name} 只有一种类标签')
        return 0
    counts = Counter(Y.tolist())
    # Majority/minority class sizes. The original hard-coded labels 0 and 1
    # (tmp[0] vs tmp[1]) and would KeyError on any other label pair; taking
    # max/min over the counts is identical for 0/1 labels and robust otherwise.
    ma, mi = max(counts.values()), min(counts.values())
    if mi / ma < 0.01:
        logger.info(f'位点: {name} 为低频位点')
        print(f'位点: {name} 为低频位点')
        return 0
    space = {
        "num_leaves": hp.randint("num_leaves", 5),  # [0, upper)
        "max_depth": hp.choice("max_depth", [-1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
        "learning_rate": hp.uniform("learning_rate", 0.001, 2),  # uniform on 0.001-2
        "n_estimators": hp.randint("n_estimators", 5),
        "min_child_weight": hp.uniform("min_child_weight", 0.001, 0.01),
        "min_child_samples": hp.randint("min_child_samples", 10),
        "subsample": hp.randint("subsample", 4),
        "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]),
        "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]),
        "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
        "path": hp.choice('path', [path])
    }
    star = time.time()
    algo = partial(tpe.suggest, n_startup_jobs=1)  # optimisation algorithm (TPE)
    # max_evals caps the number of models trained; larger values explore more.
    best = fmin(LGB, space, algo=algo, max_evals=max_evals)
    best = RECOVERLGB(best)
    TRAINLGB(X, Y, best, name, save_path + name + '.lgb', logger)
    end = time.time()
    times = end - star
    logger.info(f'位点: {name} 用时为: {times}')