Example #1
def test_fit(preprocess_data, model_class):
    global roc_auc  # score is written to module scope rather than returned
    X_train, X_test, y_train, y_test = preprocess_data
    cv_pairs, (dtrain, dtest) = split_and_preprocess(X_train.copy(),
                                                     y_train,
                                                     X_test.copy(),
                                                     y_test,
                                                     cat_cols=[],
                                                     n_splits=N_CV_SPLITS)
    model = model_class(TASK_CLASSIFICATION)
    n_estimators = 10

    _dtrain = model.convert_to_dataset(dtrain.X, dtrain.y, dtrain.cat_cols)
    _dtest = model.convert_to_dataset(dtest.X, dtest.y, dtest.cat_cols)

    bst, evals_result = model.fit(params=model.default_params,
                                  dtrain=_dtrain,
                                  dtest=_dtest,
                                  n_estimators=n_estimators)
    prediction = model.predict(bst=bst, dtest=_dtest, X_test=dtest.X)

    custom_metric = {'roc_auc': roc_auc_score}
    for metric_name, metric_func in custom_metric.items():
        score = metric_func(_dtest.get_label(), prediction,
                            sample_weight=None)  # TODO weights
        roc_auc = score
    print("ROC_AUC: ", roc_auc)
    assert roc_auc <= MAX_ROC_AUC_SCORE
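
These tests depend on fixtures and constants defined elsewhere in the module. The sketch below is a hypothetical reconstruction of that setup: the names match the tests above, but the import paths, fixture bodies, and constant values are assumptions, not taken from the source.

import modelgym
import pytest
from modelgym.trainer import Trainer            # assumed import path
from modelgym.util import split_and_preprocess  # assumed import path
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Illustrative values; the real test module may use different ones.
TEST_SIZE = 0.2
TRAIN_SIZE = 1 - TEST_SIZE
N_CV_SPLITS = 2
N_PROBES = 2
N_ESTIMATORS = 100
MAX_ROC_AUC_SCORE = 1.0
TASK_CLASSIFICATION = 'classification'  # assumed task identifier

@pytest.fixture
def read_data():
    return load_iris()  # small, fast classification dataset

@pytest.fixture
def preprocess_data(read_data):
    return train_test_split(read_data.data, read_data.target,
                            test_size=TEST_SIZE)

@pytest.fixture(params=[modelgym.XGBModel])
def model_class(request):
    return request.param  # parametrized so more models can be added

@pytest.fixture(params=[Trainer])
def trainer_class(request):
    return request.param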
Example #2
def test_convert_to_dataset(preprocess_data):
    X_train, X_test, y_train, y_test = preprocess_data
    cv_pairs, (dtrain, dtest) = split_and_preprocess(X_train.copy(), y_train,
                                                     X_test.copy(), y_test,
                                                     cat_cols=[], n_splits=N_CV_SPLITS)
    model = modelgym.XGBModel(TASK_CLASSIFICATION)
    _dtrain = model.convert_to_dataset(dtrain.X, dtrain.y, dtrain.cat_cols)
    _dtest = model.convert_to_dataset(dtest.X, dtest.y, dtest.cat_cols)
    _dexample = xgboost.DMatrix(data=dtrain.X, label=dtrain.y)  # reference matrix built directly with xgboost
    assert _dtrain.num_row() == _dexample.num_row()
    assert _dtrain.num_col() == _dtest.num_col() == _dexample.num_col()
    assert _dtest.num_row() != _dexample.num_row()  # train and test splits have different row counts
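
The assertions above rely only on the standard xgboost.DMatrix interface. A self-contained sketch of those calls on synthetic data:

import numpy as np
import xgboost

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)
dmat = xgboost.DMatrix(data=X, label=y)
print(dmat.num_row(), dmat.num_col())  # 100 4
print(dmat.get_label()[:5])            # labels come back as float32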
Example #3
def test_split_and_preprocess(read_data):
    iris_data = read_data
    X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=TEST_SIZE)
    cv_pairs, (dtrain, dtest) = split_and_preprocess(X_train.copy(), y_train,
                                                     X_test.copy(), y_test,
                                                     cat_cols=[], n_splits=N_CV_SPLITS)
    # TODO: more tests
    assert len(X_test) <= TEST_SIZE * len(iris_data.data)
    assert len(X_train) <= TRAIN_SIZE * len(iris_data.data)
    assert len(y_test) <= TEST_SIZE * len(iris_data.data)
    assert len(y_train) <= TRAIN_SIZE * len(iris_data.data)
    assert len(X_train) == len(y_train)
    assert len(X_test) == len(y_test)
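
The size bounds follow directly from how train_test_split divides the data. A quick self-contained check on iris (150 samples), assuming an illustrative TEST_SIZE of 0.2:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()  # 150 samples
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.2)
assert len(X_test) == 30    # 0.2 * 150
assert len(X_train) == 120  # the remaining 0.8 * 150
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)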
Example #4
def test_crossval_fit_eval(preprocess_data, trainer_class, model_class):
    global roc_auc, res
    X_train, X_test, y_train, y_test = preprocess_data
    cv_pairs, (dtrain, dtest) = split_and_preprocess(X_train.copy(),
                                                     y_train,
                                                     X_test.copy(),
                                                     y_test,
                                                     cat_cols=[],
                                                     n_splits=N_CV_SPLITS)

    trainer = trainer_class(opt_evals=N_PROBES, n_estimators=N_ESTIMATORS)
    model = model_class(TASK_CLASSIFICATION)
    # First iteration runs without explicit params, the second with the model's
    # defaults. (The original put `flag = True` inside the `if flag:` branch,
    # so the params variant was unreachable.)
    flag = False
    for _ in range(2):
        if flag:
            res = trainer.crossval_fit_eval(model=model,
                                            cv_pairs=cv_pairs,
                                            n_estimators=N_ESTIMATORS,
                                            params=model.default_params)
        else:
            res = trainer.crossval_fit_eval(model=model,
                                            cv_pairs=cv_pairs,
                                            n_estimators=N_ESTIMATORS)
            flag = True
    if isinstance(res, dict):
        loss = res['loss']
        params = res['params']
        params = model.preprocess_params(params)
    else:
        loss = res
        params = model.default_params
    assert loss <= MAX_ROC_AUC_SCORE
    n_estimators = N_ESTIMATORS

    _dtrain = model.convert_to_dataset(dtrain.X, dtrain.y, dtrain.cat_cols)
    _dtest = model.convert_to_dataset(dtest.X, dtest.y, dtest.cat_cols)

    print("FITTING BEST PARAMS")
    bst, evals_result = model.fit(params=params,
                                  dtrain=_dtrain,
                                  dtest=_dtest,
                                  n_estimators=n_estimators)
    prediction = model.predict(bst=bst, dtest=_dtest, X_test=dtest.X)

    custom_metric = {'roc_auc': roc_auc_score}
    for metric_name, metric_func in custom_metric.items():
        score = metric_func(_dtest.get_label(), prediction,
                            sample_weight=None)  # TODO weights
        roc_auc = score
    print("ROC_AUC: ", roc_auc)
    assert roc_auc <= MAX_ROC_AUC_SCORE
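
The flag loop above, once fixed so that its second iteration actually passes explicit params, can be written more directly. A hypothetical refactor, reusing the model and trainer already constructed in this test:

# Hypothetical refactor of the flag loop: one call without explicit params,
# then one with the model's defaults; the trainer API is the same as above.
for params in (None, model.default_params):
    if params is None:
        res = trainer.crossval_fit_eval(model=model, cv_pairs=cv_pairs,
                                        n_estimators=N_ESTIMATORS)
    else:
        res = trainer.crossval_fit_eval(model=model, cv_pairs=cv_pairs,
                                        n_estimators=N_ESTIMATORS,
                                        params=params)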
Example #5
def test_predict(preprocess_data):
    X_train, X_test, y_train, y_test = preprocess_data
    cv_pairs, (dtrain, dtest) = split_and_preprocess(X_train.copy(), y_train,
                                                     X_test.copy(), y_test,
                                                     cat_cols=[], n_splits=N_CV_SPLITS)
    model = modelgym.XGBModel(TASK_CLASSIFICATION)
    trainer = Trainer(opt_evals=N_PROBES, n_estimators=N_ESTIMATORS)

    res = trainer.crossval_fit_eval(model, cv_pairs)
    ans = trainer.fit_eval(model, dtrain, dtest, res['params'], res['best_n_estimators'],
                           custom_metric={'roc_auc': roc_auc_score})
    roc_auc = ans['roc_auc']
    print("ROC_AUC: ", roc_auc)
    assert roc_auc <= MAX_ROC_AUC_SCORE
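
Neither result object is documented in these excerpts, but their shapes can be inferred from the keys the tests read. A hypothetical illustration; the concrete values and parameter names are made up:

# Shapes inferred from res['loss'], res['params'], res['best_n_estimators']
# and ans['roc_auc'] in the tests; all values below are illustrative.
res = {
    'loss': 0.12,                # CV loss the trainer minimizes
    'params': {'max_depth': 3},  # best hyperparameters found
    'best_n_estimators': 50,     # boosting rounds selected on CV
}
ans = {'roc_auc': 0.97}          # one entry per metric in custom_metric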
Example #6
def test_crossval_optimize_params(preprocess_data, trainer_class, model_class):
    global roc_auc
    X_train, X_test, y_train, y_test = preprocess_data
    cv_pairs, (dtrain, dtest) = split_and_preprocess(X_train.copy(),
                                                     y_train,
                                                     X_test.copy(),
                                                     y_test,
                                                     cat_cols=[],
                                                     n_splits=N_CV_SPLITS)

    model = model_class(TASK_CLASSIFICATION)
    trainer = trainer_class(opt_evals=N_PROBES, n_estimators=N_ESTIMATORS)

    _dtrain = model.convert_to_dataset(dtrain.X, dtrain.y, dtrain.cat_cols)
    _dtest = model.convert_to_dataset(dtest.X, dtest.y, dtest.cat_cols)

    # trainer.fit_eval(model=model, dtrain=dtrain, dtest=dtest)
    optimized = trainer.crossval_optimize_params(model=model,
                                                 cv_pairs=cv_pairs)

    optimized.pop('loss')  # drop the CV loss; only the tuned params are refit below
    params = model.preprocess_params(optimized['params'])
    n_estimators = N_ESTIMATORS

    bst, evals_result = model.fit(params=params,
                                  dtrain=_dtrain,
                                  dtest=_dtest,
                                  n_estimators=n_estimators)
    prediction = model.predict(bst=bst, dtest=_dtest, X_test=dtest.X)

    custom_metric = {'roc_auc': roc_auc_score}
    for metric_name, metric_func in custom_metric.items():
        score = metric_func(_dtest.get_label(), prediction,
                            sample_weight=None)  # TODO weights
        roc_auc = score
    print("ROC_AUC: ", roc_auc)

    assert roc_auc <= MAX_ROC_AUC_SCORE
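
The recurring `# TODO weights` refers to the sample_weight argument of scikit-learn's roc_auc_score. A minimal, self-contained sketch of passing weights on synthetic data:

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
weights = np.array([1.0, 1.0, 2.0, 2.0])  # up-weight the positive samples

print(roc_auc_score(y_true, y_score))                         # 0.75
print(roc_auc_score(y_true, y_score, sample_weight=weights))  # weighted AUC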