def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config):
    """Fit an H2O AutoML model on (X, y) and persist the leader model.

    The label is temporarily attached to ``X`` as a "target" column so a
    single H2OFrame carries features and label together; the column is
    removed again before returning. The saved model path is recorded in
    ``config['params']['pipeline'][config["stage"]]["model"]``.

    Args:
        X: feature matrix; mutated in place only for the duration of the call.
        y: target values aligned with ``X``.
        config: project Config carrying mode, time budget and output paths.
    """
    h2o.init()

    # Temporarily attach the label; restored in the finally block below.
    X["target"] = y
    try:
        train = h2o.H2OFrame(X)
        train_x = train.columns
        train_y = "target"
        train_x.remove(train_y)

        # H2O treats classification targets as categorical factors.
        if config["mode"] == "classification":
            train[train_y] = train[train_y].asfactor()

        # Reserve ~10% of the remaining time budget for saving/teardown.
        aml = H2OAutoML(max_runtime_secs=int(config.time_left() * 0.9),
                        max_models=20,
                        nfolds=3,
                        exclude_algos=["GBM", "DeepLearning", "DRF"],
                        seed=42)

        aml.train(x=train_x, y=train_y, training_frame=train)

        config['params']['pipeline'][config["stage"]]["model"] = h2o.save_model(
            model=aml.leader, path=config.model_dir + "/h2o.model", force=True)
        if config.verbose:
            print(aml.leaderboard)
    finally:
        # Bug fix: originally the column was dropped only on success, so an
        # exception mid-training left the caller's DataFrame mutated.
        X.drop("target", axis=1, inplace=True)
# Example #2
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    """Tune, validate, and fit a LightGBM model, storing it in ``config``.

    Hyperparameters are searched on a 20k-row sample, a first model is
    trained with early stopping on a holdout split, and — time permitting —
    the model is refit on the full data for 1.2x the best iteration count.
    The fitted booster lives in ``config["model"]``.
    """
    regression = config.is_regression()
    params = {
        "objective": "regression" if regression else "binary",
        "metric": "rmse" if regression else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    # Hyperparameter search on a bounded sample to keep tuning cheap.
    sample_X, sample_y = data_sample(X, y, config, nrows=20000)
    tuned = hyperopt_lightgbm(sample_X, sample_y, params, config)
    merged_params = {**params, **tuned}

    X_train, X_val, y_train, y_val = data_split(X, y, config)

    # First fit: early stopping on the validation split finds the round count.
    config["model"] = lgb.train(
        merged_params,
        lgb.Dataset(X_train, label=y_train),
        5000,
        lgb.Dataset(X_val, label=y_val),
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    config.save()

    # Best-effort refit on all rows within the remaining time budget;
    # the early-stopped model from above remains if this times out.
    try:
        with time_limit(config.time_left() - 10):
            rounds = int(1.2 * config["model"].best_iteration)
            config["model"] = lgb.train(
                merged_params,
                lgb.Dataset(X, label=y),
                rounds,
            )
    except TimeoutException:
        Log.print("Timed out!")
# Example #3
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    """Tune and cross-validate a LightGBM model, storing all fold models.

    Hyperparameters are searched on a sample, then a 5-fold CV (stratified
    for classification) trains one booster per fold. Every fold model is
    appended to ``config["model"]`` (a list), and per-fold validation scores
    plus their mean are written to ``<config['path_pred']>/cv_score_<i>.csv``.

    NOTE(review): the outer ``for i in range(1)`` loop runs exactly once
    (bagging appears disabled), and both time-budget checks end in ``pass``
    with their ``break`` statements commented out — presumably intentional
    leftovers from a competition setting; confirm before relying on the
    time-limit behavior.
    """
    params = {
        "objective":
        "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    # Hyperparameter search on a down-sampled copy of the data.
    X_sample, y_sample = data_sample(X, y)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    for i in range(1):  # single "bagging" round; structure kept for extension
        print(
            '################################################################## cv '
            + str(i))
        t1_bagging = time.time()
        # Re-seed per bagging round so repeated rounds would differ.
        params['seed'] = i + 1
        # cv
        nfold = 5
        # Stratify folds only for classification; plain KFold otherwise.
        if config["mode"] == 'classification':
            skf = StratifiedKFold(n_splits=nfold,
                                  shuffle=True,
                                  random_state=777)
        else:
            skf = KFold(n_splits=nfold, shuffle=True, random_state=777)
        skf_split = skf.split(X, y)

        log('####################################################################### begin cv'
            )
        log('####### cur time = ' +
            str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        score_list = []
        # config["model"] holds one trained booster per fold.
        config["model"] = []
        for fid, (train_idx, valid_idx) in enumerate(skf_split):
            t1_cv = time.time()
            print("FoldID:{}".format(fid))
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
            dtrain = lgb.Dataset(X_train, label=y_train)
            dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

            # Tuned params override the base params on key collisions.
            cur_model = lgb.train({
                **params,
                **hyperparams
            },
                                  dtrain,
                                  3000,
                                  dvalid,
                                  early_stopping_rounds=50,
                                  verbose_eval=100)
            config["model"].append(cur_model)

            score_list.append(cur_model.best_score)
            # gc.collect()
            sys.stdout.flush()
            t2_cv = time.time()
            time_left = config.time_left()
            print('######### cv' + str(time_left))
            # Projected cost of remaining folds vs. remaining budget;
            # currently a no-op because the break is commented out.
            if (t2_cv - t1_cv) * (nfold - fid + 1) >= time_left:
                pass
                #break

        log('######################################################################### end cv'
            )
        log('####### cur time = ' +
            str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        # Per-fold validation metric values, keyed by the configured metric
        # (rmse or auc) under LightGBM's default 'valid_0' dataset name.
        valid_auc = np.array(
            [i['valid_0'][params['metric']] for i in score_list])
        print('valid', valid_auc, np.mean(valid_auc))
        # CSV column 'cv': one row per fold plus the mean as the final row.
        cv_score = pd.DataFrame(
            {'cv': np.hstack([valid_auc, np.mean(valid_auc)])})
        path = config['path_pred']
        print(path)
        cv_score.to_csv(path + '/cv_score_' + str(i) + '.csv', index=False)

        t2_bagging = time.time()
        time_left = config.time_left()
        print('#########bagging' + str(time_left))
        # Time check for additional bagging rounds; also a no-op (see NOTE).
        if (t2_bagging - t1_bagging) * 1.5 >= time_left:
            #break
            pass