Code example #1
File: test_averaging.py Project: tatsuya068/nyaggle
def test_averaging():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with tempfile.TemporaryDirectory() as temp_path:
        for i in range(3):
            params['seed'] = i
            ret_single = run_experiment(
                params, X_train, y_train, X_test,
                os.path.join(temp_path, 'seed{}'.format(i)))

        df = average_results(
            [os.path.join(temp_path, 'seed{}'.format(i)) for i in range(3)],
            os.path.join(temp_path, 'average.csv'))

        score = roc_auc_score(y_test, df[df.columns[-1]])
        assert score >= 0.85

        assert score >= roc_auc_score(y_test, ret_single.test_prediction)
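A note on imports: these snippets are lifted from nyaggle's test suite and omit their import blocks. A minimal sketch of the imports they collectively assume is below; the nyaggle module paths follow its public API but should be treated as assumptions, since they shift slightly across the project snapshots quoted here. `tmpdir_name` is a pytest fixture from the suite's conftest, while `average_results`, `experiment_gbdt` (an older entry point), `_check_file_exists`, and `_make_1st_stage_preds` are helpers defined in the test modules themselves.

import json
import os
import tempfile
from urllib.parse import unquote, urlparse

import mlflow
import numpy as np
import pandas as pd
import pytest
from lightgbm import LGBMClassifier
from numpy.testing import assert_array_almost_equal
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier

from nyaggle.ensemble import averaging, averaging_opt, stacking
from nyaggle.experiment import Experiment, run_experiment
from nyaggle.feature_store import save_feature
from nyaggle.testing import get_temp_directory, make_classification_df
from nyaggle.validation import adversarial_validate, cross_validate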
Code example #2
def test_experiment_sklearn_multiclass(tmpdir_name):
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=0,
                                  n_classes=5,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'n_neighbors': 10}

    result = run_experiment(params,
                            X_train,
                            y_train,
                            X_test,
                            tmpdir_name,
                            algorithm_type=KNeighborsClassifier,
                            with_auto_prep=False)

    # make sure predictions are probabilities, not binarized labels
    assert len(np.unique(result.oof_prediction[:, 0])) > 5
    assert len(np.unique(result.test_prediction[:, 0])) > 5
    assert result.oof_prediction.shape == (len(y_train), 5)
    assert result.test_prediction.shape == (len(y_test), 5)

    _check_file_exists(tmpdir_name)
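Several examples call `_check_file_exists`, a module-level helper defined in the test files and not shown in this listing. Its signature varies between the quoted snapshots (some calls pass a tuple of filenames, others keyword flags such as `with_mlflow` or `submission_filename`); a minimal sketch of the tuple-based variant, assuming a default set of artifacts that run_experiment always writes:

def _check_file_exists(directory, files=('oof_prediction.npy', 'metrics.txt', 'params.json')):
    # every expected artifact must exist in the logging directory
    for filename in files:
        assert os.path.exists(os.path.join(directory, filename)), filename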
Code example #3
File: test_run.py Project: luyifanlu/nyaggle
def test_with_feature_attachment():
    X, y = make_classification_df(n_num_features=5, class_sep=0.7)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    with get_temp_directory() as temp_feature_path:
        cols = list(X.columns)
        for i, c in enumerate(cols):
            if X.shape[1] == 1:
                break
            save_feature(X[[c]], i, directory=temp_feature_path)
            X.drop(c, axis=1, inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

        with get_temp_directory() as temp_path:
            result_wo_feature = run_experiment(params, X_train, y_train, X_test, logging_directory=temp_path)

        with get_temp_directory() as temp_path:
            result_w_feature = run_experiment(params, X_train, y_train, X_test, logging_directory=temp_path,
                                              feature_list=[0, 1, 2, 3], feature_directory=temp_feature_path)

        assert result_w_feature.metrics[-1] > result_wo_feature.metrics[-1]
Code example #4
def test_experiment_fit_params(tmpdir_name):
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8, 'n_estimators': 500}

    result1 = run_experiment(params,
                             X_train,
                             y_train,
                             X_test,
                             os.path.join(tmpdir_name, '1'),
                             fit_params={'early_stopping_rounds': None})
    result2 = run_experiment(params,
                             X_train,
                             y_train,
                             X_test,
                             os.path.join(tmpdir_name, '2'),
                             fit_params={'early_stopping_rounds': 5})

    assert result1.models[-1].booster_.num_trees() == params['n_estimators']
    assert result2.models[-1].booster_.num_trees() < params['n_estimators']
Code example #5
def test_experiment_lgb_multiclass():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  n_classes=5,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'multiclass', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        result = run_experiment(params, X_train, y_train, X_test, temp_path)

        # make sure predictions are probabilities, not binarized labels
        assert len(np.unique(result.oof_prediction[:, 0])) > 5
        assert len(np.unique(result.test_prediction[:, 0])) > 5
        assert result.oof_prediction.shape == (len(y_train), 5)
        assert result.test_prediction.shape == (len(y_test), 5)

        _check_file_exists(
            temp_path,
            ('oof_prediction.npy', 'test_prediction.npy', 'metrics.txt'))
Code example #6
File: test_run.py Project: luyifanlu/nyaggle
def test_experiment_sample_submission_multiclass():
    X, y = make_classification_df(n_classes=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    sample_df = pd.DataFrame()
    sample_df['target_id_abc'] = np.arange(len(y_test)) + 10000
    for i in range(5):
        sample_df['target_class_{}'.format(i)] = 0

    params = {
        'objective': 'multiclass',
        'max_depth': 8
    }

    with get_temp_directory() as temp_path:
        result = run_experiment(params, X_train, y_train, X_test, temp_path, sample_submission=sample_df)

        assert list(result.submission_df.columns) == ['target_id_abc',
                                                      'target_class_0',
                                                      'target_class_1',
                                                      'target_class_2',
                                                      'target_class_3',
                                                      'target_class_4'
                                                      ]
        log_loss_trained = log_loss(y_test, result.submission_df.drop('target_id_abc', axis=1), labels=[0, 1, 2, 3, 4])
        log_loss_default = log_loss(y_test, np.full((len(y_test), 5), 0.2), labels=[0, 1, 2, 3, 4])
        assert log_loss_trained < log_loss_default
Code example #7
def test_submission_filename():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        run_experiment(params,
                       X_train,
                       y_train,
                       X_test,
                       temp_path,
                       submission_filename='sub.csv')

        df = pd.read_csv(os.path.join(temp_path, 'sub.csv'))
        assert list(df.columns) == ['id', 'target']
Code example #8
def test_experiment_mlflow():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_path:
        run_experiment(params,
                       X_train,
                       y_train,
                       None,
                       temp_path,
                       with_mlflow=True)

        _check_file_exists(
            temp_path, ('oof_prediction.npy', 'metrics.txt', 'mlflow.json'))

        # test if output files are also stored in the mlflow artifact uri
        with open(os.path.join(temp_path, 'mlflow.json'), 'r') as f:
            mlflow_meta = json.load(f)
            p = unquote(urlparse(mlflow_meta['artifact_uri']).path)
            if os.name == 'nt' and p.startswith("/"):
                p = p[1:]
            _check_file_exists(p, ('oof_prediction.npy', 'metrics.txt'))
Code example #9
def test_cv_lgbm_df():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=20,
                                  n_cat_features=1,
                                  class_sep=0.98,
                                  random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(
        models, X_train, y_train, X_test, cv=5, eval_func=roc_auc_score)

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    # make sure the models were actually trained in-place
    assert roc_auc_score(y_test, models[0].predict_proba(X_test)[:, 1]) >= 0.85
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20 + 1
    assert models[0].booster_.num_trees() < 300  # make sure early stopping worked
Code example #10
def test_experiment_sklearn_classifier(tmpdir_name):
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=0,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'C': 0.1}

    result = run_experiment(params,
                            X_train,
                            y_train,
                            X_test,
                            tmpdir_name,
                            eval_func=roc_auc_score,
                            algorithm_type=LogisticRegression,
                            with_auto_prep=False)

    # make sure predictions are probabilities, not binarized labels
    assert len(np.unique(result.oof_prediction)) > 5
    assert len(np.unique(result.test_prediction)) > 5
    assert roc_auc_score(y_train, result.oof_prediction) >= 0.8
    assert roc_auc_score(y_test, result.test_prediction) >= 0.8

    _check_file_exists(tmpdir_name)
Code example #11
def test_experiment_lgb_classifier(tmpdir_name):
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    result = run_experiment(params,
                            X_train,
                            y_train,
                            X_test,
                            tmpdir_name,
                            eval_func=roc_auc_score)

    # make sure predictions are probabilities, not binarized labels
    assert len(np.unique(result.oof_prediction)) > 5
    assert len(np.unique(result.test_prediction)) > 5
    assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
    assert roc_auc_score(y_test, result.test_prediction) >= 0.9

    _check_file_exists(tmpdir_name)
Code example #12
File: test_averaging.py Project: nyanp/nyaggle
def test_averaging_with_metrics():
    X, y = make_classification_df()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    result = averaging(test, oof, y_train, eval_func=roc_auc_score)

    assert result.score == roc_auc_score(y_train, result.oof_prediction)
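The averaging and stacking examples also depend on a test-local helper, `_make_1st_stage_preds`, which returns two lists of three first-stage predictions each (out-of-fold and test; the snippets index elements 0 through 2). A minimal sketch, assuming any three distinct classifiers will do and reusing nyaggle's cross_validate (the model choices here are hypothetical stand-ins):

def _make_1st_stage_preds(X, y, X_test):
    oof, test = [], []
    for model in (LogisticRegression(max_iter=1000),
                  KNeighborsClassifier(),
                  LGBMClassifier(n_estimators=50)):
        # collect out-of-fold and test predictions for each first-stage model
        pred_oof, pred_test, _, _ = cross_validate(model, X, y, X_test, cv=5)
        oof.append(pred_oof)
        test.append(pred_test)
    return oof, test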
Code example #13
File: test_averaging.py Project: nyanp/nyaggle
def test_weight_averaging():
    X, y = make_classification_df()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    result = averaging(test, oof, y_train, weights=[0.2, 0.4, 0.3])

    assert_array_almost_equal(0.2 * test[0] + 0.4 * test[1] + 0.3 * test[2], result.test_prediction)
    assert_array_almost_equal(0.2 * oof[0] + 0.4 * oof[1] + 0.3 * oof[2], result.oof_prediction)
    assert result.score is None
Code example #14
File: test_averaging.py Project: nyanp/nyaggle
def test_averaging_multiclass():
    X, y = make_classification_df(n_classes=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    result = averaging(test, oof, y_train)

    assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
    assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)
    assert result.score is None
Code example #15
def test_log_params(tmpdir_name):
    params = {'objective': 'binary', 'max_depth': 8}
    X, y = make_classification_df()

    run_experiment(params, X, y, logging_directory=tmpdir_name)

    with open(os.path.join(tmpdir_name, 'params.json'), 'r') as f:
        recorded_params = json.load(f)
        assert recorded_params['model_params.max_depth'] == 8
        assert recorded_params['model_params.objective'] == 'binary'
        assert recorded_params['fit_params'] == 'None'
Code example #16
File: test_run.py Project: wakamezake/nyaggle
def test_custom_experiment(tmpdir_name):
    params = {
        'objective': 'binary',
        'max_depth': 8
    }
    X, y = make_classification_df()

    with Experiment(tmpdir_name, with_mlflow=True) as e:
        run_experiment(params, X, y, logging_directory='foobar', inherit_experiment=e)

    # all files are logged into e.logging_directory, instead of 'foobar'
    _check_file_exists(tmpdir_name, with_mlflow=True)
Code example #17
File: test_averaging.py Project: nyanp/nyaggle
def test_rank_averaging():
    X, y = make_classification_df(n_samples=1024)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    result = averaging(test, rank_averaging=True)

    test_rank = [stats.rankdata(t) / len(X_test) for t in test]

    assert_array_almost_equal((test_rank[0] + test_rank[1] + test_rank[2]) / 3, result.test_prediction)
    assert result.score is None
Code example #18
File: test_run.py Project: wakamezake/nyaggle
def test_with_long_params(tmpdir_name):
    X, y = make_classification_df(1024, n_num_features=5, n_cat_features=400)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

    # just to make sure the experiment finishes
    run_experiment(params, X_train, y_train, X_test,
                   logging_directory=tmpdir_name, with_mlflow=True)
Code example #19
File: test_stacking.py Project: wakamezake/nyaggle
def test_stacking():
    X, y = make_classification_df()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    worst_base_roc = min(roc_auc_score(y_train, oof[0]),
                         roc_auc_score(y_train, oof[1]),
                         roc_auc_score(y_train, oof[2]))

    result = stacking(test, oof, y_train, eval_func=roc_auc_score)

    assert roc_auc_score(y_train, result.oof_prediction) > worst_base_roc
Code example #20
def test_adv():
    X, y = make_classification_df(1024)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    X_train['target'] = 0
    X_test['target'] = 1

    # the injected 'target' column separates train from test perfectly,
    # so the adversarial classifier should reach a high AUC and rank it first
    auc, importance = adversarial_validate(X_train, X_test)

    assert importance['feature'][0] == 'target'
    assert auc >= 0.9
Code example #21
File: test_run.py Project: wakamezake/nyaggle
def test_experiment_manual_cv_int(tmpdir_name):
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, None, tmpdir_name, cv=KFold(2))
    assert len(result.models) == 2
    assert len(result.metrics) == 2 + 1
Code example #22
def test_experiment_manual_cv_kfold():
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    with get_temp_directory() as temp_path:
        result = run_experiment(params, X_train, y_train, None, temp_path, cv=KFold(4))
        assert len(result.models) == 4
        assert len(result.metrics) == 4 + 1
Code example #23
File: test_run.py Project: wakamezake/nyaggle
def test_experiment_without_test_data(tmpdir_name):
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, None, tmpdir_name)

    assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
    _check_file_exists(tmpdir_name)
Code example #24
File: test_averaging.py Project: nyanp/nyaggle
def test_rank_averaging_opt_maximize():
    X, y = make_classification_df(n_samples=1024)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    best_single_model = max(roc_auc_score(y_train, oof[0]),
                            roc_auc_score(y_train, oof[1]),
                            roc_auc_score(y_train, oof[2]))

    result = averaging_opt(test, oof, y_train, roc_auc_score, higher_is_better=True, rank_averaging=True)

    assert result.score >= best_single_model

    result_simple_avg = averaging(test, oof, y_train, eval_func=roc_auc_score, rank_averaging=True)

    assert result.score >= result_simple_avg.score
Code example #25
File: test_run.py Project: wakamezake/nyaggle
def test_experiment_sample_submission_binary(tmpdir_name):
    X, y = make_classification_df()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    sample_df = pd.DataFrame()
    sample_df['target_id_abc'] = np.arange(len(y_test)) + 10000
    sample_df['target_value_abc'] = 0

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name, sample_submission=sample_df)

    assert list(result.submission_df.columns) == ['target_id_abc', 'target_value_abc']
    assert roc_auc_score(y_test, result.submission_df['target_value_abc']) > 0.8
Code example #26
File: test_run.py Project: wakamezake/nyaggle
def test_experiment_already_exists(tmpdir_name):
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    run_experiment(params, X_train, y_train, None, tmpdir_name)

    # an existing result can be overwritten explicitly with if_exists='replace'
    run_experiment(params, X_train, y_train, None, tmpdir_name, if_exists='replace')

    # by default, writing into an existing logging directory raises
    with pytest.raises(Exception):
        run_experiment(params, X_train, y_train, None, tmpdir_name)
Code example #27
def test_experiment_without_test_data():
    X, y = make_classification_df(n_samples=1024,
                                  n_num_features=10,
                                  n_cat_features=2,
                                  class_sep=0.98,
                                  random_state=0,
                                  id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_path:
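        # experiment_gbdt is an earlier nyaggle entry point; later versions expose run_experiment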
        result = experiment_gbdt(params, X_train, y_train, None, temp_path)

        assert roc_auc_score(y_train, result.oof_prediction) >= 0.9
        _check_file_exists(temp_path, ('oof_prediction.npy', 'metrics.txt'))
Code example #28
File: test_run.py Project: wakamezake/nyaggle
def test_ignore_errors_in_mlflow_params(tmpdir_name):
    mlflow.start_run()
    mlflow.log_param('features', 'ABC')
    mlflow.log_metric('Overall', -99)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }
    X, y = make_classification_df()

    result = run_experiment(params, X, y, with_mlflow=True, logging_directory=tmpdir_name, feature_list=[])

    client = mlflow.tracking.MlflowClient()
    data = client.get_run(mlflow.active_run().info.run_id).data

    assert data.metrics['Overall'] == result.metrics[-1]
    assert data.params['features'] == 'ABC'  # params cannot be overwritten

    mlflow.end_run()
Code example #29
File: test_run.py Project: wakamezake/nyaggle
def test_inherit_outer_scope_run(tmpdir_name):
    mlflow.start_run()
    mlflow.log_param('foo', 1)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }
    X, y = make_classification_df()

    run_experiment(params, X, y, with_mlflow=True, logging_directory=tmpdir_name)

    assert mlflow.active_run() is not None  # still valid

    client = mlflow.tracking.MlflowClient()
    data = client.get_run(mlflow.active_run().info.run_id).data

    assert data.metrics['Overall'] > 0  # recorded

    mlflow.end_run()
Code example #30
File: test_run.py Project: wakamezake/nyaggle
def test_experiment_cat_multiclass(tmpdir_name):
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2, n_classes=5,
                                  class_sep=0.98, random_state=0, id_column='user_id', target_name='tgt')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        'num_boost_round': 100
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name, algorithm_type='cat',
                            type_of_target='multiclass', submission_filename='submission.csv', with_auto_prep=True)

    assert result.oof_prediction.shape == (len(y_train), 5)
    assert result.test_prediction.shape == (len(y_test), 5)

    assert list(pd.read_csv(os.path.join(tmpdir_name, 'submission.csv')).columns) == ['id', '0', '1', '2', '3', '4']

    _check_file_exists(tmpdir_name, submission_filename='submission.csv')