Beispiel #1
0
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                storage_type, model_class):
    """Check that FIL reproduces a scikit-learn binary classifier.

    A scikit-learn ensemble of type ``model_class`` is trained on simulated
    two-class data and loaded into ``ForestInference`` through
    ``load_from_sklearn``. Class predictions, class probabilities and
    accuracy of both models are then compared on a held-out 20% split.

    Parameters
    ----------
    n_rows, n_columns
        Forwarded to ``simulate_data`` to size the generated dataset.
    n_estimators, max_depth
        Constructor arguments of the scikit-learn model.
    storage_type
        Forwarded to ``load_from_sklearn``. A truthy value selects the
        ``'NAIVE'`` algorithm, a falsy one ``'BATCH_TREE_REORG'``.
    model_class
        ``RandomForestClassifier`` gets random-forest specific arguments;
        any other class is configured as a ``GradientBoostingClassifier``.
    """
    # skip depth 20 for dense tests (falsy storage_type)
    if max_depth == 20 and not storage_type:
        return

    random_state = np.random.RandomState(43210)

    # two-class classification problem
    X, y = simulate_data(n_rows,
                         n_columns,
                         2,
                         random_state=random_state,
                         classification=True)

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.80, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestClassifier:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingClassifier
        init_kwargs['init'] = 'zero'

    # Seed the model so the trained forest, and therefore any mismatch
    # between FIL and scikit-learn, is reproducible from run to run.
    skl_model = model_class(**init_kwargs, random_state=random_state)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_preds_int = np.around(skl_preds)
    skl_proba = skl_model.predict_proba(X_validation)

    # Score the same rounded labels that are compared against FIL below.
    skl_acc = accuracy_score(y_validation, skl_preds_int)

    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds_int))

    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_proba = np.reshape(fil_proba, np.shape(skl_proba))

    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(skl_acc, abs=1e-5)
    assert array_equal(fil_preds, skl_preds_int)
    # 1e-3 is the relative tolerance (third positional argument of allclose)
    assert np.allclose(fil_proba, skl_proba, rtol=1e-3)
Beispiel #2
0
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                n_classes, storage_type, model_class):
    """Compare FIL with the scikit-learn classifier it was loaded from.

    Trains ``model_class`` on simulated ``n_classes``-way data, imports the
    fitted model with ``ForestInference.load_from_sklearn`` and checks the
    accuracy of both on a held-out 20% split. For two classes the predicted
    labels and probabilities are compared as well.
    """
    rng = np.random.RandomState(43210)

    # simulated classification data, split 80/20 into train and validation
    X, y = simulate_data(n_rows, n_columns, n_classes,
                         random_state=rng, classification=True)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.80, random_state=0)

    model_kwargs = dict(n_estimators=n_estimators, max_depth=max_depth)
    if model_class == RandomForestClassifier:
        model_kwargs.update(max_features=0.3, n_jobs=-1)
    else:
        # every other model_class is set up as a GradientBoostingClassifier
        model_kwargs.update(init='zero')
    model_kwargs['random_state'] = rng

    skl_model = model_class(**model_kwargs)
    skl_model.fit(X_train, y_train)

    # reference results from scikit-learn
    skl_preds_int = np.around(skl_model.predict(X_validation))
    skl_proba = skl_model.predict_proba(X_validation)
    skl_acc = accuracy_score(y_validation, skl_preds_int)

    if storage_type:
        algo = 'NAIVE'
    else:
        algo = 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation)).reshape(
        skl_preds_int.shape)
    fil_acc = accuracy_score(y_validation, fil_preds)

    binary = n_classes == 2

    # fil_acc is within p99 error bars of skl_acc (diff == 0.017 +- 0.012)
    # however, some tests have a delta as big as 0.04.
    # sklearn uses float64 thresholds, while FIL uses float32
    # TODO(levsnv): once FIL supports float64 accuracy, revisit thresholds
    if binary:
        acc_tolerance = 1e-5
    else:
        acc_tolerance = 0.1
    assert fil_acc == pytest.approx(skl_acc, abs=acc_tolerance)

    # labels and probabilities are only compared for the two-class case
    if binary:
        assert array_equal(fil_preds, skl_preds_int)
        fil_proba = np.asarray(fm.predict_proba(X_validation)).reshape(
            skl_proba.shape)
        assert np.allclose(fil_proba, skl_proba, 1e-3)
Beispiel #3
0
def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                            storage_type, model_class):
    """Check that FIL reproduces a scikit-learn regressor.

    A scikit-learn ensemble of type ``model_class`` is trained on simulated
    regression data and loaded into ``ForestInference`` through
    ``load_from_sklearn``. Predictions and mean squared error of both
    models are then compared on a held-out 20% split.

    Parameters
    ----------
    n_rows, n_columns
        Forwarded to ``simulate_data`` to size the generated dataset.
    n_estimators, max_depth
        Constructor arguments of the scikit-learn model.
    storage_type
        Forwarded to ``load_from_sklearn``. ``'SPARSE'`` selects the
        ``'NAIVE'`` algorithm, anything else ``'BATCH_TREE_REORG'``.
    model_class
        ``RandomForestRegressor`` gets random-forest specific arguments;
        any other class is configured as a ``GradientBoostingRegressor``.
    """
    # skip depth 20 for dense tests
    if max_depth == 20 and storage_type == 'DENSE':
        return

    # settings
    n_categories = 1
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_categories,
                         random_state=random_state,
                         classification=False)

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.80, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestRegressor:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingRegressor
        init_kwargs['init'] = 'zero'

    # Seed the model so the trained forest, and therefore any mismatch
    # between FIL and scikit-learn, is reproducible from run to run.
    skl_model = model_class(**init_kwargs, random_state=random_state)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)

    skl_mse = mean_squared_error(y_validation, skl_preds)

    algo = 'NAIVE' if storage_type == 'SPARSE' else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=False,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds))

    fil_mse = mean_squared_error(y_validation, fil_preds)

    # One-sided check: FIL may be better than scikit-learn, but may exceed
    # its MSE only by a relative 1e-7 plus an absolute 1e-4.
    assert fil_mse <= skl_mse * (1. + 1e-7) + 1e-4
    assert array_equal(fil_preds, skl_preds)