Beispiel #1
0
def test_sklearn_estimator_regression_validation():
    """Compare SKLearnPredictor output with plain scikit-learn for regressors.

    For every estimator in ``models_regression`` the model is fitted through
    the vaex wrapper and directly on the dense arrays; the predictions for
    the held-out split must agree to 5 decimals.
    """
    ds = vaex.ml.datasets.load_iris()
    train, test = ds.ml.train_test_split(verbose=False)
    features = ['sepal_length', 'sepal_width', 'petal_length']

    # Dense features
    Xtrain = train[features].values
    Xtest = test[features].values
    ytrain = train.petal_width.values

    for model in models_regression:

        # vaex
        vaex_model = SKLearnPredictor(model=model,
                                      features=features,
                                      prediction_name='pred')
        vaex_model.fit(train, train.petal_width)
        # Bind the result to its own name: rebinding `test` would hand every
        # later model a frame that already carries the previous 'pred' column
        # instead of the split that Xtest was taken from.
        test_with_pred = vaex_model.transform(test)

        # sklearn
        model.fit(Xtrain, ytrain)
        skl_pred = model.predict(Xtest)

        # Name the estimator in the failure message; the loop is not
        # parametrized, so the traceback alone does not identify it.
        np.testing.assert_array_almost_equal(
            test_with_pred.pred.values,
            skl_pred,
            decimal=5,
            err_msg='prediction mismatch for {}'.format(type(model).__name__))
Beispiel #2
0
def test_sklearn_estimator_classification_validation():
    """Compare SKLearnPredictor output with plain scikit-learn for classifiers.

    For every estimator in ``models_classification`` the model is fitted
    through the vaex wrapper and directly on the dense arrays; the predicted
    labels for the held-out split must be identical.
    """
    ds = vaex.ml.datasets.load_titanic()

    train, test = ds.ml.train_test_split(verbose=False)
    features = ['pclass', 'parch', 'sibsp']

    # Dense features
    Xtrain = train[features].values
    Xtest = test[features].values
    ytrain = train.survived.values

    for model in models_classification:

        # vaex
        vaex_model = SKLearnPredictor(model=model,
                                      features=features,
                                      prediction_name='pred')
        vaex_model.fit(train, train.survived)
        # Bind the result to its own name: rebinding `test` would hand every
        # later model a frame that already carries the previous 'pred' column
        # instead of the split that Xtest was taken from.
        test_with_pred = vaex_model.transform(test)

        # scikit-learn
        model.fit(Xtrain, ytrain)
        skl_pred = model.predict(Xtest)

        # Name the estimator so a failure inside the loop is attributable.
        assert np.all(skl_pred == test_with_pred.pred.values), \
            'prediction mismatch for {}'.format(type(model).__name__)
Beispiel #3
0
def test_sklearn_estimator_pipeline():
    """Check that Pipeline.predict and Pipeline.transform agree.

    The pipeline consists of a state transfer taken from the training frame
    followed by a fitted SKLearnPredictor.
    """
    iris = vaex.ml.datasets.load_iris()
    train, test = iris.ml.train_test_split(verbose=False)

    # The virtual columns are defined on the training frame only.
    train['sepal_virtual'] = np.sqrt(train.sepal_length**2 + train.sepal_width**2)
    train['petal_scaled'] = train.petal_length * 0.2

    # Fit a PCA on the virtual columns and apply it to the training frame.
    pca_features = ['sepal_virtual', 'petal_scaled']
    train = train.ml.pca(n_components=2, features=pca_features).transform(train)

    # Capture the state of the training frame as a pipeline step.
    state_step = vaex.ml.state_transfer(train)

    # Fit the predictor on the same two virtual columns.
    predictor = SKLearnPredictor(model=LinearRegression(),
                                 features=['sepal_virtual', 'petal_scaled'],
                                 prediction_name='pred')
    predictor.fit(train, train.petal_width)

    # Run the held-out split through the pipeline both ways.
    pipeline = vaex.ml.Pipeline([state_step, predictor])
    predicted = pipeline.predict(test)
    transformed = pipeline.transform(test)

    # WARNING: on Windows/AppVeyor this gives slightly different results and
    # the cause is not fully understood. The original author also saw "the
    # same results" on an OSX laptop with sklearn 0.21.1 (scikit-learn 0.21.2
    # is installed on Windows), so it might be version related.
    np.testing.assert_array_almost_equal(predicted, transformed.pred.values)
Beispiel #4
0
def test_sklearn_estimator_virtual_columns():
    """Fit and apply a SKLearnPredictor on columns derived from expressions.

    Features ``x``, ``y``, ``z`` and target ``w`` are all created as
    expressions of the raw iris columns; the prediction column must have one
    value per row of the full dataset.
    """
    ds = vaex.ml.datasets.load_iris()
    ds['x'] = ds.sepal_length * 1
    ds['y'] = ds.sepal_width * 1
    ds['w'] = ds.petal_length * 1
    ds['z'] = ds.petal_width * 1
    # No train/test split here: the model is fitted and applied on the whole
    # frame, so a split would only produce unused locals.
    features = ['x', 'y', 'z']
    model = SKLearnPredictor(model=LinearRegression(),
                             features=features,
                             prediction_name='pred')
    model.fit(ds, ds.w)
    ds = model.transform(ds)
    assert ds.pred.values.shape == (150, )
Beispiel #5
0
def test_sklearn_estimator():
    """Exercise predict, transform and state transfer of a SKLearnPredictor.

    ``predict`` and the column added by ``transform`` must agree on the test
    split, and the state of the transformed training frame must be
    applicable to the full dataset.
    """
    iris = vaex.ml.datasets.load_iris()
    feature_names = ['sepal_length', 'sepal_width', 'petal_length']
    train, test = iris.ml.train_test_split(verbose=False)

    predictor = SKLearnPredictor(model=LinearRegression(),
                                 features=feature_names,
                                 prediction_name='pred')
    predictor.fit(train, train.petal_width)

    # Direct predictions versus the column added by transform().
    expected = predictor.predict(test)
    transformed_test = predictor.transform(test)
    np.testing.assert_array_almost_equal(transformed_test.pred.values,
                                         expected,
                                         decimal=5)

    # Transfer the state of the transformed training frame to the full frame.
    transformed_train = predictor.transform(train)
    iris.state_set(transformed_train.state_get())
    assert iris.pred.values.shape == (150, )
Beispiel #6
0
def test_sklearn_estimator_serialize(tmpdir):
    """Save and load a Pipeline that wraps a fitted SKLearnPredictor.

    Two passes are made: one with a freshly fitted predictor, and one where
    the predictor's own state is read back and re-applied before saving.
    The test only checks that these calls complete without raising.
    """
    iris = vaex.ml.datasets.load_iris()
    feature_names = ['sepal_length', 'sepal_width', 'petal_length']
    json_path = str(tmpdir.join('test.json'))

    for reapply_state in (False, True):
        predictor = SKLearnPredictor(model=LinearRegression(),
                                     features=feature_names,
                                     prediction_name='pred')
        predictor.fit(iris, iris.petal_width)

        if reapply_state:
            # Round-trip the predictor's state through itself before saving.
            predictor.state_set(predictor.state_get())

        pipeline = vaex.ml.Pipeline([predictor])
        pipeline.save(json_path)
        pipeline.load(json_path)