Example #1
def test_experiment_fit_params():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8, 'n_estimators': 500}

    with get_temp_directory() as temp_path:
        result1 = experiment_gbdt(params,
                                  X_train,
                                  y_train,
                                  X_test,
                                  temp_path,
                                  fit_params={'early_stopping_rounds': None})
    with get_temp_directory() as temp_path:
        result2 = experiment_gbdt(params,
                                  X_train,
                                  y_train,
                                  X_test,
                                  temp_path,
                                  fit_params={'early_stopping_rounds': 5})

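    # early_stopping_rounds=None should build all n_estimators trees,
    # while early_stopping_rounds=5 should stop earlier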
    assert result1.models[-1].booster_.num_trees() == params['n_estimators']
    assert result2.models[-1].booster_.num_trees() < params['n_estimators']
Example #2
def test_with_feature_attachment():
    X, y = make_classification_df(n_num_features=5, class_sep=0.7)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    with get_temp_directory() as temp_feature_path:
        cols = list(X.columns)
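        # save every column except the last one to the feature store and drop it from X,
        # leaving a single-column frame for the baseline run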
        for i, c in enumerate(cols):
            if X.shape[1] == 1:
                break
            save_feature(X[[c]], i, directory=temp_feature_path)
            X.drop(c, axis=1, inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

        with get_temp_directory() as temp_path:
            result_wo_feature = run_experiment(params, X_train, y_train, X_test, logging_directory=temp_path)

        with get_temp_directory() as temp_path:
            result_w_feature = run_experiment(params, X_train, y_train, X_test, logging_directory=temp_path,
                                              feature_list=[0, 1, 2, 3], feature_directory=temp_feature_path)

        assert result_w_feature.metrics[-1] > result_wo_feature.metrics[-1]
Example #3
def test_experiment_sklearn_multiclass():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=0,
                                  n_classes=5,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'n_neighbors': 10}

    with get_temp_directory() as temp_path:
        result = run_experiment(params,
                                X_train,
                                y_train,
                                X_test,
                                temp_path,
                                algorithm_type=KNeighborsClassifier,
                                with_auto_prep=False)

        # making sure prediction is not binarized
        assert len(np.unique(result.oof_prediction[:, 0])) > 5
        assert len(np.unique(result.test_prediction[:, 0])) > 5
        assert result.oof_prediction.shape == (len(y_train), 5)
        assert result.test_prediction.shape == (len(y_test), 5)

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #4
def test_experiment_sklearn_regressor():
    X, y = make_regression_df(n_samples=1024,
                              n_num_features=10,
                              n_cat_features=0,
                              random_state=0,
                              id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'fit_intercept': True}

    with get_temp_directory() as temp_path:
        result = run_experiment(params,
                                X_train,
                                y_train,
                                X_test,
                                temp_path,
                                with_auto_prep=False,
                                algorithm_type=LinearRegression)

        # making sure prediction is not binarized
        assert len(np.unique(result.oof_prediction)) > 5
        assert len(np.unique(result.test_prediction)) > 5
        assert mean_squared_error(y_train,
                                  result.oof_prediction) == result.metrics[-1]

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #5
def test_experiment_lgb_classifier():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params,
                                 X_train,
                                 y_train,
                                 X_test,
                                 temp_path,
                                 eval_func=roc_auc_score)

        # making sure prediction is not binarized
        assert len(np.unique(result.oof_prediction)) > 5
        assert len(np.unique(result.test_prediction)) > 5
        assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
        assert roc_auc_score(y_test, result.test_prediction) >= 0.9

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #6
def test_experiment_xgb_regressor():
    X, y = make_regression_df(n_samples=1024,
                              n_num_features=10,
                              n_cat_features=2,
                              random_state=0,
                              id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'max_depth': 8, 'num_boost_round': 100}

    with get_temp_directory() as temp_path:
        result = run_experiment(params,
                                X_train,
                                y_train,
                                X_test,
                                temp_path,
                                algorithm_type='xgb',
                                with_auto_prep=True)

        assert mean_squared_error(y_train,
                                  result.oof_prediction) == result.metrics[-1]
        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #7
def test_experiment_sklearn_classifier():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=0,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'C': 0.1}

    with get_temp_directory() as temp_path:
        result = run_experiment(params,
                                X_train,
                                y_train,
                                X_test,
                                temp_path,
                                eval_func=roc_auc_score,
                                algorithm_type=LogisticRegression,
                                with_auto_prep=False)

        # making sure prediction is not binarized
        assert len(np.unique(result.oof_prediction)) > 5
        assert len(np.unique(result.test_prediction)) > 5
        assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
        assert roc_auc_score(y_test, result.test_prediction) >= 0.8

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #8
def test_submission_filename():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        experiment_gbdt(params,
                        X_train,
                        y_train,
                        X_test,
                        temp_path,
                        submission_filename='sub.csv')

        df = pd.read_csv(os.path.join(temp_path, 'sub.csv'))
        assert list(df.columns) == ['id', 'target']
Example #9
def test_experiment_mlflow():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        experiment_gbdt(params,
                        X_train,
                        y_train,
                        None,
                        temp_path,
                        with_mlflow=True)

        _check_file_exists(
            temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))

        # test if output files are also stored in the mlflow artifact uri
        with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:
            mlflow_meta = json.load(f)
            p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
            if os.name == 'nt' and p.startswith("/"):
                p = p[1:]
            _check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
Example #10
def test_experiment_lgb_multiclass():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  n_classes=5,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'multiclass', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path)

        # making sure prediction is not binarized
        assert len(np.unique(result.oof_prediction[:, 0])) > 5
        assert len(np.unique(result.test_prediction[:, 0])) > 5
        assert result.oof_prediction.shape == (len(y_train), 5)
        assert result.test_prediction.shape == (len(y_test), 5)

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #11
def test_experiment_lgb_regressor():
    X, y = make_regression_df(n_samples=1024,
                              n_num_features=10,
                              n_cat_features=2,
                              random_state=0,
                              id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'regression', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params, X_train, y_train, X_test, temp_path)

        # making sure prediction is not binarized
        assert len(np.unique(result.oof_prediction)) > 5
        assert len(np.unique(result.test_prediction)) > 5
        assert mean_squared_error(y_train,
                                  result.oof_prediction) == result.metrics[-1]

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #12
def test_experiment_manual_cv_group():
    df1 = pd.DataFrame()
    df1['x'] = np.random.randint(0, 10, size=1000)
    df1['y'] = df1['x'] > 5
    df1['grp'] = 0

    df2 = pd.DataFrame()
    df2['x'] = np.random.randint(0, 10, size=100)
    df2['y'] = df2['x'] <= 5
    df2['grp'] = 1

    X = pd.concat([df1, df2]).reset_index(drop=True)
    y = X['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    grp = X_train['grp']
    X_train = X_train.drop(['y', 'grp'], axis=1)
    X_test = X_test.drop(['y', 'grp'], axis=1)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    with get_temp_directory() as temp_path:
        result = run_experiment(params, X_train, y_train, X_test, temp_path, cv=GroupKFold(2), groups=grp)
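        # y = (x > 5) in group 0 but (x <= 5) in group 1, so each GroupKFold split
        # validates on a group with the opposite rule and the score should be poor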
        assert result.metrics[-1] < 0.7
Example #13
def test_experiment_cat_custom_eval():
    X, y = make_regression_df(n_samples=1024,
                              n_num_features=10,
                              n_cat_features=2,
                              random_state=0,
                              id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'max_depth': 8, 'num_boost_round': 100, 'eval_metric': 'MAE'}

    with get_temp_directory() as temp_path:
        result = experiment_gbdt(params,
                                 X_train,
                                 y_train,
                                 X_test,
                                 temp_path,
                                 gbdt_type='cat',
                                 eval_func=mean_absolute_error)

        assert mean_absolute_error(y_train,
                                   result.oof_prediction) == result.metrics[-1]
        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Example #14
def test_experiment_sample_submission_multiclass():
    X, y = make_classification_df(n_classes=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    sample_df = pd.DataFrame()
    sample_df['target_id_abc'] = np.arange(len(y_test)) + 10000
    for i in range(5):
        sample_df['target_class_{}'.format(i)] = 0

    params = {
        'objective': 'multiclass',
        'max_depth': 8
    }

    with get_temp_directory() as temp_path:
        result = run_experiment(params, X_train, y_train, X_test, temp_path, sample_submission=sample_df)

        assert list(result.submission_df.columns) == ['target_id_abc',
                                                      'target_class_0',
                                                      'target_class_1',
                                                      'target_class_2',
                                                      'target_class_3',
                                                      'target_class_4'
                                                      ]
        # trained predictions should beat a uniform 0.2-per-class baseline
        log_loss_trained = log_loss(y_test, result.submission_df.drop('target_id_abc', axis=1), labels=[0, 1, 2, 3, 4])
        log_loss_default = log_loss(y_test, np.full((len(y_test), 5), 0.2), labels=[0, 1, 2, 3, 4])
        assert log_loss_trained < log_loss_default
Example #15
def test_log_metrics_empty():
    with get_temp_directory() as logging_dir:
        with Experiment(logging_dir):
            pass

        with open(os.path.join(logging_dir, 'metrics.json'), 'r') as f:
            params = json.load(f)
            assert params == {}
Example #16
def test_feature_exists():
    df = pd.DataFrame({
        'a': [1, 2, 3, 4, 5] + [None] * 5
    })

    with get_temp_directory() as tmp:
        fs.save_feature(df[['a']], 0, directory=tmp)
        with pytest.raises(RuntimeError):
            fs.save_feature(df, 0, overwrite=False, directory=tmp)
Example #17
def test_save_feature():
    df = pd.DataFrame()

    df['a'] = np.arange(100)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        assert os.path.exists(os.path.join(tmp, '0.f'))
Example #18
def test_load_feature():
    df = pd.DataFrame()

    df['a'] = np.arange(100)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        df_loaded = fs.load_feature(0, tmp)
        assert_frame_equal(df, df_loaded)
Example #19
def test_with_rare_categories():
    X = pd.DataFrame({
        'x0': [None] * 100,
        'x1': np.random.choice([np.inf, -np.inf], size=100),
        'x2': ['nan'] + [None] * 99,
        'x3': np.concatenate([
            np.random.choice(['A', 'B'], size=50),
            np.random.choice(['C', 'D', 'na'], size=50)
        ])
    })

    y = pd.Series(np.random.choice([0, 1], size=100), name='y')

    params = {
        'lgbm': {
            'objective': 'binary',
            'max_depth': 8
        },
        'xgb': {
            'objective': 'binary:logistic',
            'max_depth': 8
        },
        'cat': {
            'loss_function': 'Logloss',
            'max_depth': 8
        }
    }

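    # run each backend with raw object columns and again with everything cast to
    # category dtype, checking that all-null, infinite, and unseen categorical
    # values do not break auto preprocessing or training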
    for cat_cast in (True, False):
        X_ = X.copy()
        y_ = y.copy()
        if cat_cast:
            for c in X.columns:
                X_[c] = X_[c].astype('category')
            X_ = X_.iloc[:50, :]
            y_ = y_.iloc[:50]

        X_train, X_test, y_train, y_test = train_test_split(X_,
                                                            y_,
                                                            shuffle=False,
                                                            test_size=0.5)

        for algorithm in ('cat', 'xgb', 'lgbm'):
            with get_temp_directory() as temp_path:
                run_experiment(params[algorithm],
                               X_train,
                               y_train,
                               X_test,
                               algorithm_type=algorithm,
                               logging_directory=temp_path,
                               with_mlflow=True,
                               with_auto_prep=True,
                               categorical_feature=['x0', 'x1', 'x2', 'x3'])
Example #20
def test_log_metrics():
    with get_temp_directory() as logging_dir:
        with Experiment(logging_dir) as e:
            e.log_metric('x', 1)
            e.log_metric('x', 2)

        with open(os.path.join(logging_dir, 'metrics.json'), 'r') as f:
            params = json.load(f)

            expected = {'x': 2}
            assert params == expected
Example #21
def test_experiment_duplicated_rename_mlflow():
    with get_temp_directory() as logging_dir:
        with Experiment(logging_dir, with_mlflow=True) as e:
            e.log_metric('CV', 0.97)
            run_id_old = e.mlflow_run_id

        with Experiment(logging_dir, with_mlflow=True,
                        if_exists='rename') as e:
            e.log_metric('LB', 0.95)
            run_id_new = e.mlflow_run_id

        assert run_id_old != run_id_new
Example #22
def test_experiment_duplicated_error():
    with get_temp_directory() as logging_dir:
        with Experiment(logging_dir) as e:
            e.log_metric('CV', 0.97)

        with pytest.raises(ValueError):
            with Experiment(logging_dir):
                pass

        with pytest.raises(ValueError):
            with Experiment(logging_dir, if_exists='error'):
                pass
Example #23
def test_custom_experiment():
    params = {
        'objective': 'binary',
        'max_depth': 8
    }
    X, y = make_classification_df()

    with get_temp_directory() as temp_path:
        with Experiment(temp_path, with_mlflow=True) as e:
            run_experiment(params, X, y, logging_directory='foobar', inherit_experiment=e)

        # all files are logged into e.logging_directory, instead of 'foobar'
        _check_file_exists(temp_path, with_mlflow=True)
Example #24
def test_load_feature_ignore_all_columns():
    df = pd.DataFrame()

    df['a'] = np.arange(100).astype(float)
    df['b'] = np.arange(100).astype(int)
    df['c'] = np.arange(100).astype(int)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        df_loaded = fs.load_feature(0, tmp, ignore_columns=['a', 'b', 'c', 'X'])

        assert_frame_equal(df_loaded, df.drop(['a', 'b', 'c'], axis=1))
Example #25
def test_load_features():
    df = pd.DataFrame()

    df['a'] = np.arange(100).astype(float)
    df['b'] = np.arange(100).astype(int)
    df['c'] = np.arange(100).astype(int)

    with get_temp_directory() as tmp:
        fs.save_feature(df[['b']], 0, tmp)
        fs.save_feature(df[['c']], 1, tmp)

        df_loaded = fs.load_features(df[['a']], [0, 1], tmp)
        assert_frame_equal(df, df_loaded)
Example #26
def test_with_long_params():
    X, y = make_classification_df(1024, n_num_features=5, n_cat_features=400)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

    with get_temp_directory() as temp_path:
        # just to make sure experiment finish
        run_experiment(params, X_train, y_train, X_test,
                       logging_directory=temp_path, with_mlflow=True)
Example #27
def test_load_features_no_base():
    df = pd.DataFrame()

    df['a'] = np.arange(100).astype(float)
    df['b'] = np.arange(100).astype(int)
    df['c'] = np.arange(100).astype(int)

    with get_temp_directory() as tmp:
        fs.save_feature(df[['b']], 0, tmp)
        fs.save_feature(df[['c']], 1, tmp)
        fs.save_feature(df[['a']], '2', tmp)

        df_loaded = fs.load_features(None, [0, 1, '2'], tmp)
        assert list(df_loaded.columns) == ['b', 'c', 'a']
Example #28
def test_experiment_duplicated_replace():
    with get_temp_directory() as logging_dir:
        with Experiment(logging_dir) as e:
            e.log_metric('CV', 0.97)

        with Experiment(logging_dir, if_exists='replace') as e:
            e.log_metric('LB', 0.95)

        with open(os.path.join(logging_dir, 'metrics.json')) as f:
            metrics = json.load(f)

            # replaced by the new result
            assert 'LB' in metrics
            assert 'CV' not in metrics
Example #29
def test_experiment_duplicated_append():
    with get_temp_directory() as logging_dir:
        with Experiment(logging_dir) as e:
            e.log_metric('CV', 0.97)

        with Experiment(logging_dir, if_exists='append') as e:
            e.log_metric('LB', 0.95)

        with open(os.path.join(logging_dir, 'metrics.json')) as f:
            metrics = json.load(f)

            # appended to the existing result
            assert 'LB' in metrics
            assert 'CV' in metrics
Example #30
def test_experiment_manual_cv_int():
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    with get_temp_directory() as temp_path:
        result = run_experiment(params, X_train, y_train, None, temp_path, cv=KFold(2))
        assert len(result.models) == 2
        assert len(result.metrics) == 2 + 1
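The examples above omit their import blocks. A minimal sketch of the imports they appear to rely on follows, assuming the experiment and feature-store helpers come from nyaggle and the rest from scikit-learn, numpy, and pandas; the nyaggle module paths are inferred from the function names rather than confirmed, and `_check_file_exists` looks like a local test helper rather than a library function.

import json
import os
from urllib.parse import unquote, urlparse

import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# assumed nyaggle paths; verify against the installed version
import nyaggle.feature_store as fs
from nyaggle.experiment import Experiment, experiment_gbdt, run_experiment
from nyaggle.feature_store import save_feature
from nyaggle.testing import get_temp_directory, make_classification_df, make_regression_df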