Example 1
from datetime import timedelta

from fklearn.validation.splitters import out_of_time_and_space_splitter


def test_out_of_time_and_space_splitter():
    result, logs = out_of_time_and_space_splitter(
        sample_data,
        2,
        '2015-05-05',
        time_column='time',
        space_column='space',
        holdout_gap=timedelta(days=31))

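    # result holds one (train_indices, [test_indices, ...]) tuple per requested fold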
    assert len(result) == 2
    train_1 = sample_data.iloc[result[0][0]]
    test_1 = sample_data.iloc[result[0][1][0]]
    train_2 = sample_data.iloc[result[1][0]]
    test_2 = sample_data.iloc[result[1][1][0]]

    # there must be no overlap in space between folds
    assert len(train_1[train_1.space.isin(train_2.space)]) == 0
    assert len(train_2[train_2.space.isin(train_1.space)]) == 0

    # the training sets must have no dates after 2015-05-05
    assert len(train_1[train_1.time > '2015-05-05']) == 0
    assert len(train_2[train_2.time > '2015-05-05']) == 0

    # the test sets must have no dates on or before 2015-05-05 + 31 days (2015-06-05)
    assert len(test_1[test_1.time <= '2015-06-05']) == 0
    assert len(test_2[test_2.time <= '2015-06-05']) == 0

    # all rows with time on or before '2015-05-05' must be in a training set
    assert len(train_1) + len(train_2) == len(
        sample_data[sample_data.time <= '2015-05-05'])
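
The test reads a module-level sample_data frame defined elsewhere in fklearn's test suite. A minimal sketch of what such a frame could look like, assuming only that the splitter needs a space column and a time column with dates on both sides of the 2015-05-05 in-time limit and of the 31-day holdout gap:

import pandas as pd

# Hypothetical stand-in for the sample_data frame used above: four spatial
# units, with dates before the in-time limit and after the holdout gap.
sample_data = pd.DataFrame({
    'space': ['s1', 's2', 's3', 's4'] * 4,
    'time': pd.to_datetime(['2015-04-01'] * 4 + ['2015-05-01'] * 4 +
                           ['2015-06-10'] * 4 + ['2015-07-01'] * 4),
})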
Example 2
import pandas as pd
from toolz import curry

from fklearn.training.classification import xgb_classification_learner
from fklearn.tuning.parameter_tuners import grid_search_cv
from fklearn.validation.evaluators import auc_evaluator
from fklearn.validation.splitters import out_of_time_and_space_splitter


def test_grid_search_tuner(tmpdir):
    train_set = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id3"],
        'date':
        pd.to_datetime(
            ["2016-01-01", "2016-02-01", "2016-03-01", "2016-04-01"]),
        'x': [.2, .9, .3, .3],
        'target': [0, 1, 0, 1]
    })

    eval_fn = auc_evaluator(target_column="target")

    space = {
        'learning_rate': lambda: [1e-3, 1e-2, 1e-1],
        'num_estimators': lambda: [1, 2],
        'silent': lambda: [True]
    }

    @curry
    def param_train_fn(space, train_set):
        return xgb_classification_learner(
            features=["x"],
            target="target",
            learning_rate=space["learning_rate"],
            num_estimators=space["num_estimators"])(train_set)

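    # two folds that are disjoint in the 'id' column; training data is limited to dates up to 2016-05-01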
    split_fn = out_of_time_and_space_splitter(n_splits=2,
                                              in_time_limit="2016-05-01",
                                              space_column="id",
                                              time_column="date")

    tuning_log = grid_search_cv(space=space,
                                train_set=train_set,
                                param_train_fn=param_train_fn,
                                split_fn=split_fn,
                                eval_fn=eval_fn)

    assert len(tuning_log) == 3 * 2

    space = {
        'learning_rate': lambda: [1e-3, 1e-2, 1e-1, 1],
        'num_estimators': lambda: [1, 2],
        'silent': lambda: [True]
    }

    tuning_log = grid_search_cv(space=space,
                                train_set=train_set,
                                param_train_fn=param_train_fn,
                                split_fn=split_fn,
                                eval_fn=eval_fn)

    assert len(tuning_log) == 4 * 2
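
The expected log lengths follow from the size of the search space: grid_search_cv trains and evaluates one model per combination of hyper-parameter values, so 3 learning rates × 2 estimator counts give 6 log entries and 4 × 2 give 8 (the single-valued 'silent' entry does not change the count). A quick sketch of that arithmetic with itertools.product:

from itertools import product

# Materialise every hyper-parameter combination the grid search will visit.
space_values = {
    'learning_rate': [1e-3, 1e-2, 1e-1],
    'num_estimators': [1, 2],
    'silent': [True],
}
combinations = list(product(*space_values.values()))
assert len(combinations) == 3 * 2 * 1  # matches len(tuning_log) == 6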
Example 3
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

from fklearn.metrics.pd_extractors import (combined_evaluator_extractor,
                                            evaluator_extractor, extract,
                                            split_evaluator_extractor,
                                            temporal_split_evaluator_extractor)
from fklearn.training.regression import linear_regression_learner
from fklearn.validation.evaluators import (combined_evaluators, r2_evaluator,
                                            spearman_evaluator, split_evaluator,
                                            temporal_split_evaluator)
from fklearn.validation.splitters import (forward_stability_curve_time_splitter,
                                          out_of_time_and_space_splitter,
                                          stability_curve_time_splitter,
                                          time_learning_curve_splitter)
from fklearn.validation.validator import validator


def test_extract():
    boston = load_boston()
    df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
    df['target'] = boston['target']
    df['time'] = pd.date_range(start='2015-01-01', periods=len(df))
    np.random.seed(42)
    df['space'] = np.random.randint(0, 100, size=len(df))

    # Define train function
    train_fn = linear_regression_learner(
        features=boston['feature_names'].tolist(), target="target")

    # Define evaluator function
    base_evaluator = combined_evaluators(evaluators=[
        r2_evaluator(target_column='target', prediction_column='prediction'),
        spearman_evaluator(target_column='target',
                           prediction_column='prediction')
    ])

    splitter = split_evaluator(eval_fn=base_evaluator,
                               split_col='RAD',
                               split_values=[4.0, 5.0, 24.0])
    temporal_week_splitter = temporal_split_evaluator(eval_fn=base_evaluator,
                                                      time_col='time',
                                                      time_format='%Y-%W')
    temporal_year_splitter = temporal_split_evaluator(eval_fn=base_evaluator,
                                                      time_col='time',
                                                      time_format='%Y')

    eval_fn = combined_evaluators(evaluators=[base_evaluator, splitter])
    temporal_week_eval_fn = combined_evaluators(
        evaluators=[base_evaluator, temporal_week_splitter])
    temporal_year_eval_fn = combined_evaluators(
        evaluators=[base_evaluator, temporal_year_splitter])

    # Define splitters
    cv_split_fn = out_of_time_and_space_splitter(n_splits=5,
                                                 in_time_limit='2016-01-01',
                                                 time_column='time',
                                                 space_column='space')

    tlc_split_fn = time_learning_curve_splitter(
        training_time_limit='2016-01-01', time_column='time', min_samples=0)

    sc_split_fn = stability_curve_time_splitter(
        training_time_limit='2016-01-01', time_column='time', min_samples=0)

    fw_sc_split_fn = forward_stability_curve_time_splitter(
        training_time_start="2015-01-01",
        training_time_end="2016-01-01",
        holdout_gap=timedelta(days=30),
        holdout_size=timedelta(days=30),
        step=timedelta(days=30),
        time_column='time')

    # Validate results
    cv_results = validator(df, cv_split_fn, train_fn, eval_fn)['validator_log']
    tlc_results = validator(df, tlc_split_fn, train_fn,
                            eval_fn)['validator_log']
    sc_results = validator(df, sc_split_fn, train_fn, eval_fn)['validator_log']
    fw_sc_results = validator(df, fw_sc_split_fn, train_fn,
                              eval_fn)['validator_log']

    # temporal evaluation results
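    # fklearn learners return (predict_fn, scored_train_set, train_log); only predict_fn is needed here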
    predict_fn, _, _ = train_fn(df)
    temporal_week_results = temporal_week_eval_fn(predict_fn(df))
    temporal_year_results = temporal_year_eval_fn(predict_fn(df))

    # Define extractors
    base_extractors = combined_evaluator_extractor(base_extractors=[
        evaluator_extractor(evaluator_name="r2_evaluator__target"),
        evaluator_extractor(evaluator_name="spearman_evaluator__target")
    ])

    splitter_extractor = split_evaluator_extractor(
        split_col='RAD',
        split_values=[4.0, 5.0, 24.0],
        base_extractor=base_extractors)

    temporal_week_splitter_extractor = temporal_split_evaluator_extractor(
        time_col='time', time_format='%Y-%W', base_extractor=base_extractors)

    temporal_year_splitter_extractor = temporal_split_evaluator_extractor(
        time_col='time', time_format='%Y', base_extractor=base_extractors)

    assert extract(cv_results, base_extractors).shape == (5, 9)
    assert extract(cv_results, splitter_extractor).shape == (15, 10)

    assert extract(tlc_results, base_extractors).shape == (12, 9)
    assert extract(tlc_results, splitter_extractor).shape == (36, 10)

    assert extract(sc_results, base_extractors).shape == (5, 9)
    assert extract(sc_results, splitter_extractor).shape == (15, 10)

    assert extract(fw_sc_results, base_extractors).shape == (3, 9)
    assert extract(fw_sc_results, splitter_extractor).shape == (9, 10)

    n_time_week_folds = len(df['time'].dt.strftime('%Y-%W').unique())
    n_time_year_folds = len(df['time'].dt.strftime('%Y').unique())
    assert temporal_week_splitter_extractor(temporal_week_results).shape == (
        n_time_week_folds, 3)
    assert temporal_year_splitter_extractor(temporal_year_results).shape == (
        n_time_year_folds, 3)
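
Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this test only runs against older scikit-learn releases. A hedged sketch of a synthetic replacement frame (the RAD split values and the exact shapes asserted above are tied to the real Boston data and would need adjusting):

import numpy as np
import pandas as pd

# Hypothetical synthetic substitute for the Boston housing frame.
rng = np.random.RandomState(42)
n_rows = 506
feature_names = ['CRIM', 'RM', 'RAD', 'LSTAT']
df = pd.DataFrame(rng.rand(n_rows, len(feature_names)), columns=feature_names)
df['target'] = rng.rand(n_rows)
df['time'] = pd.date_range(start='2015-01-01', periods=n_rows)
df['space'] = rng.randint(0, 100, size=n_rows)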