Example #1
import joblib
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

def TRAINXGB(X, Y, argsDict, name, save_path):
    logger = log.init_log()
    max_depth = argsDict["max_depth"]
    n_estimators = argsDict['n_estimators']
    learning_rate = argsDict["learning_rate"]
    subsample = argsDict["subsample"]
    min_child_weight = argsDict["min_child_weight"]
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    colsample_bytree = argsDict["colsample_bytree"]
    gbm = XGBClassifier(
        tree_method='gpu_hist',
        max_bin=800,
        objective="binary:logistic",
        n_jobs=16,
        max_depth=max_depth,  # maximum tree depth
        n_estimators=n_estimators,  # number of boosting rounds (trees)
        learning_rate=learning_rate,  # learning rate
        subsample=subsample,  # row subsampling ratio
        min_child_weight=min_child_weight,  # minimum sum of instance weight in a child
        max_delta_step=10,  # cap on each leaf's output step (stabilizes imbalanced data)
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        colsample_bytree=colsample_bytree,
    )
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # shuffle is required for random_state to take effect
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()

    logger.info(f"locus: {name} final xgb score: {metric}")
    print(f"locus: {name} final xgb score: {metric}")
    gbm.fit(X, Y)
    # encode the CV AUC (5 decimal places) into the file suffix, e.g. 0.91234 -> '.91234'
    tail = '.' + str(int(round(metric * 100000)))
    joblib.dump(gbm, save_path + tail)
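
TRAINXGB expects a dict of already-recovered hyperparameter values (see RECOVERXGB under Example #6). A minimal usage sketch; the toy data, locus name, and paths below are hypothetical:

import numpy as np

X = np.random.rand(200, 10).astype('float32')  # toy feature matrix
Y = np.random.randint(0, 2, size=200)          # toy binary labels
best_params = {
    "max_depth": 6, "n_estimators": 150, "learning_rate": 0.1,
    "subsample": 0.8, "min_child_weight": 2,
    "reg_alpha": 1e-3, "reg_lambda": 1.0, "colsample_bytree": 0.9,
}
TRAINXGB(X, Y, best_params, "rs12345", "/tmp/rs12345.xgb")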
Example #2
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def gridsearch(X, Y, model, _file, save_path, n_splits=5):
    logger = log.init_log()
    ret = []
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index, :], X[test_index, :]
        Y_train, Y_test = Y[train_index], Y[test_index]

        model.fit(X_train, Y_train, epochs=30, batch_size=64)
        score = roc_auc_score(Y_test, model.predict(X_test))
        print(model.predict(X_test))
        ret.append(score)
        logger.info(_file + ": " + str(score))
        print(_file + ": " + str(score))
    score = np.mean(ret)
    logger.info(_file + " mean AUC: " + str(score))
    logger.info("training the final model on the full data set " + str(score))
    print(_file + " mean AUC: " + str(score))
    model.fit(X, Y, epochs=30, batch_size=64)
    model.save(save_path)
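
Caveat: gridsearch reuses one compiled model across all folds, so weights learned in one fold carry into the next and the fold AUCs are optimistic. A leak-free sketch, assuming a create_model-style factory as in Example #5:

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def cv_auc(X, Y, model_factory, n_splits=5):
    scores = []
    for train_index, test_index in StratifiedKFold(n_splits=n_splits).split(X, Y):
        model = model_factory()  # fresh, untrained model for every fold
        model.fit(X[train_index], Y[train_index], epochs=30, batch_size=64, verbose=0)
        scores.append(roc_auc_score(Y[test_index], model.predict(X[test_index])))
    return float(np.mean(scores))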
Example #3
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def LGB(argsDict):
    # the offsets and scales below map raw hyperopt draws into usable hyperparameter ranges
    num_leaves = argsDict["num_leaves"] + 25
    max_depth = argsDict["max_depth"]
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    n_estimators = argsDict['n_estimators'] * 10 + 50
    min_child_weight = argsDict['min_child_weight']
    min_child_samples = argsDict['min_child_samples'] + 18
    subsample = argsDict["subsample"] * 0.1 + 0.7
    colsample_bytree = argsDict["colsample_bytree"]
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    path = argsDict['path']
    data = np.load(path)
    data = data.astype('float32')
    data[data == 2] = 0.5  # recode the value 2 as 0.5 in the feature matrix
    X, Y = data[:, :-1], data[:, -1]  # last column is the label
    _, rsid, _, _ = tool.splitPath(path)
    gbm = LGBMClassifier(device='gpu',
                         gpu_platform_id=0,
                         gpu_device_id=0,
                         max_bin=255,
                         num_leaves=num_leaves,
                         max_depth=max_depth,
                         learning_rate=learning_rate,
                         n_estimators=n_estimators,
                         min_child_weight=min_child_weight,
                         min_child_samples=min_child_samples,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         reg_alpha=reg_alpha,
                         reg_lambda=reg_lambda,
                         n_jobs=1)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()
    logger = log.init_log()
    logger.info(f"{rsid} training score: {metric}")
    print(f"{rsid} training score: {metric}")
    return -metric  # hyperopt minimizes, so return the negated AUC
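
LGB returns the negated AUC so that hyperopt can minimize it. A sketch of the wiring; the search-space bounds are assumptions chosen to match the affine transforms above (compare the XGB space in Example #6):

from functools import partial
from hyperopt import fmin, hp, tpe

space = {
    "num_leaves": hp.randint("num_leaves", 40),  # +25 -> [25, 65)
    "max_depth": hp.randint("max_depth", 15),
    "learning_rate": hp.uniform("learning_rate", 0.001, 2),  # *0.02+0.05 -> ~[0.05, 0.09]
    "n_estimators": hp.randint("n_estimators", 20),  # *10+50 -> {50, ..., 240}
    "min_child_weight": hp.uniform("min_child_weight", 1e-3, 1e-1),
    "min_child_samples": hp.randint("min_child_samples", 10),  # +18 -> [18, 28)
    "subsample": hp.randint("subsample", 4),  # *0.1+0.7 -> {0.7, 0.8, 0.9, 1.0}
    "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.8, 0.9, 1.0]),
    "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-3, 0.1, 1]),
    "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-3, 0.1, 1, 10]),
    "path": hp.choice("path", ['/path/to/locus.npy']),  # placeholder path
}
best = fmin(LGB, space, algo=partial(tpe.suggest, n_startup_jobs=1), max_evals=50)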
Example #4
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

def XGB(argsDict):
    # these ranges follow from the search space defined in Example #6
    max_depth = argsDict["max_depth"] + 1  # raw hp.randint draw -> depth in [1, 15]
    n_estimators = argsDict['n_estimators'] * 10 + 50  # -> {50, 60, 70, 80, 90}
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05  # -> roughly [0.05, 0.09]
    subsample = argsDict["subsample"] * 0.1 + 0.7  # -> {0.7, 0.8, 0.9, 1.0}
    min_child_weight = argsDict["min_child_weight"] + 1  # -> [1, 5]
    reg_alpha = argsDict["reg_alpha"]
    reg_lambda = argsDict["reg_lambda"]
    colsample_bytree = argsDict["colsample_bytree"]

    path = argsDict['path']
    data = np.load(path)
    data = data.astype('float32')
    data[data == 2] = 0.5  # recode the value 2 as 0.5 in the feature matrix
    X, Y = data[:, :-1], data[:, -1]  # last column is the label
    _, rsid, _, _ = tool.splitPath(path)

    gbm = XGBClassifier(
        tree_method='gpu_hist',
        max_bin=255,
        objective="binary:logistic",
        max_depth=max_depth,  # maximum tree depth
        n_estimators=n_estimators,  # number of boosting rounds (trees)
        learning_rate=learning_rate,  # learning rate
        subsample=subsample,  # row subsampling ratio
        min_child_weight=min_child_weight,  # minimum sum of instance weight in a child
        max_delta_step=10,  # cap on each leaf's output step (stabilizes imbalanced data)
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        colsample_bytree=colsample_bytree,
    )
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # shuffle is required for random_state to take effect
    metric = cross_val_score(gbm, X, Y, cv=kfold, scoring="roc_auc").mean()

    logger = log.init_log()
    logger.info(f"{rsid} xgb training score: {metric}")
    print(f"{rsid} xgb training score: {metric}")
    return -metric  # hyperopt minimizes, so return the negated AUC
Example #5
# ... tail of gridsearch, identical to Example #2 ...



import os
import time

import pandas as pd

path = '/root/jiapeiling/v2-v2data/'
save_path = '/root/jiapeiling/v2-v2model/'
files = [path+each for each in os.listdir(path) if '.csv' in each]
log.alter_log_ini()
logger = log.init_log()
for _file in files:
    print(_file)
    file_name = _file.split('/')[-1].split('.')[0]
    df = pd.read_csv(_file)
    print(df.shape)
    X, Y = df.iloc[:, 1:-1].values, df.iloc[:, -1].values
    input_dim = X.shape[1]
    print(X.shape[1])
    start = time.time()
    model = create_model()
    gridsearch(X, Y, model, _file, save_path + file_name + '.h5')
    end = time.time()
    logger.info('elapsed time: ' + str(end - start))
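
create_model() is not defined in any of these snippets. A minimal stand-in compatible with the gridsearch interface; the architecture and layer sizes are assumptions:

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

def create_model():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),  # input_dim is set in the loop above
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),  # probability output, scored with roc_auc_score
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model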
    
Example #6
import time
from collections import Counter
from functools import partial

import numpy as np
from hyperopt import fmin, hp, tpe

def pipeline(path):
    logger = log.init_log()
    max_evals = 50

    _, name, _, _ = tool.splitPath(path)
    logger.info(f'xgb training starts for locus: {name}')
    print(f'xgb training starts for locus: {name}')

    data = np.load(path)  # note: unlike XGB above, no float cast or 2 -> 0.5 recode is applied here

    try:
        X, Y = data[:, :-1], data[:, -1]
    except Exception:
        logger.info(f'locus: {name} file could not be read')
        print(f'locus: {name} file could not be read')
        return 0

    if len(np.unique(Y)) == 1:
        logger.info(f'locus: {name} has only one class label')
        print(f'locus: {name} has only one class label')
        return 0

    # skip loci whose minority-to-majority class ratio is under 1%
    tmp = dict(Counter(Y.tolist()))
    if tmp[0] > tmp[1]:
        ma, mi = tmp[0], tmp[1]
    else:
        ma, mi = tmp[1], tmp[0]
    if mi / ma < 0.01:
        logger.info(f'locus: {name} is a low-frequency locus')
        print(f'locus: {name} is a low-frequency locus')
        return 0

    space = {
        "max_depth": hp.randint("max_depth", 15),  # [0, 15); XGB adds 1 -> depth 1..15
        "n_estimators": hp.randint("n_estimators", 5),  # [0, 5); XGB maps to {50, ..., 90} trees
        "learning_rate": hp.uniform("learning_rate", 0.001, 2),  # uniform; XGB rescales to ~[0.05, 0.09]
        "min_child_weight": hp.randint("min_child_weight", 5),  # [0, 5); XGB adds 1
        "subsample": hp.randint("subsample", 4),  # [0, 4); XGB maps to {0.7, 0.8, 0.9, 1.0}
        "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]),
        "reg_lambda": hp.choice("reg_lambda", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
        "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]),
        "path": hp.choice('path', [path]),  # constant; passes the data path into the objective
    }

    start = time.time()
    algo = partial(tpe.suggest, n_startup_jobs=1)  # TPE as the search algorithm
    best = fmin(XGB, space, algo=algo,
                max_evals=max_evals)  # max_evals caps how many models are tried; more trials search more thoroughly

    best = RECOVERXGB(best)  # map hyperopt's raw output back to real hyperparameter values (sketch below)
    print(best)
    TRAINXGB(X, Y, best, name, save_path + name + '.xgb')
    end = time.time()
    times = end - start
    logger.info(f'locus: {name} xgb elapsed time: {times}')
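
RECOVERXGB is called in pipeline but not shown in these examples. A plausible sketch, assuming it inverts the transforms in XGB; note that fmin returns the chosen index, not the value, for every hp.choice parameter:

def RECOVERXGB(argsDict):
    # Hypothetical reconstruction: map hyperopt's raw output back to real values.
    return {
        "max_depth": argsDict["max_depth"] + 1,
        "n_estimators": argsDict["n_estimators"] * 10 + 50,
        "learning_rate": argsDict["learning_rate"] * 0.02 + 0.05,
        "subsample": argsDict["subsample"] * 0.1 + 0.7,
        "min_child_weight": argsDict["min_child_weight"] + 1,
        # hp.choice: recover the value from its index in the original option list
        "reg_alpha": [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1][argsDict["reg_alpha"]],
        "reg_lambda": [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100][argsDict["reg_lambda"]],
        "colsample_bytree": [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0][argsDict["colsample_bytree"]],
    }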