Example 1

A smoke test for the parametric composition: with linear regression as both
the point and the standard-deviation estimator, the model's point predictions
should track the Boston housing targets reasonably well.

from sklearn.linear_model import LinearRegression

from skpro.parametric import ParametricEstimator
from skpro.workflow.manager import DataManager

import utils  # local test helper module, not part of skpro


def test_simple_model():
    data = DataManager('boston')

    model = ParametricEstimator(LinearRegression(), LinearRegression())
    y_pred = model.fit(data.X_train, data.y_train).predict(data.X_test)

    utils.assert_close_prediction(y_pred.point(), data.y_test, within=0.5)
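
The utils module here is a local test helper rather than part of skpro. Below
is a minimal sketch of what its assert_close_prediction could look like; the
call signature is taken from the test above, but the semantics (bounding the
mean relative deviation of the point predictions) are an assumption:

import numpy as np

def assert_close_prediction(y_pred, y_true, within=0.5):
    # Hypothetical helper: the mean relative deviation of the point
    # predictions from the true targets must stay below `within`
    relative_error = np.abs(y_pred - y_true) / np.abs(y_true)
    assert np.mean(relative_error) < within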

Example 2

Without constructor arguments, ParametricEstimator falls back to a normal
baseline fitted to the unconditional target distribution: the training mean as
a constant point prediction and the training standard deviation as the spread.
The test also exercises subsetting and the pdf/cdf interface of the returned
distribution object.

import numpy as np
from scipy.stats import norm

from skpro.parametric import ParametricEstimator
from skpro.workflow.manager import DataManager


def test_baseline():
    data = DataManager('boston')

    model = ParametricEstimator()
    y_pred = model.fit(data.X_train, data.y_train).predict(data.X_test)

    mu = np.mean(data.y_train)
    sigma = np.std(data.y_train)

    # is the dummy prediction working?
    assert (y_pred.point() == np.ones((len(data.X_test))) * mu).all()
    assert (y_pred.std() == np.ones((len(data.X_test))) * sigma).all()

    # does subsetting work?
    assert len(y_pred[1:3].point()) == 2
    assert len(y_pred[1:3].lp2()) == 2

    # pdf, cdf?
    x = np.random.randint(0, 10)
    i = np.random.randint(0, len(data.X_test) - 1)

    assert y_pred[i].pdf(x) == norm.pdf(x, mu, sigma)
    assert y_pred[i].cdf(x) == norm.cdf(x, mu, sigma)
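
The lp2() call is the one part of the distribution interface the test does not
check against a closed form. Assuming it returns the squared L2 norm of the
predicted density (the integral of the squared pdf, as it appears in
Gneiting-style scoring rules), a normal with standard deviation sigma has the
closed form 1 / (2 * sigma * sqrt(pi)), which is easy to verify:

import numpy as np
from scipy.integrate import quad
from scipy.stats import norm

mu, sigma = 22.5, 9.2  # illustrative values

# Numerically integrate the squared normal density over the real line ...
numeric, _ = quad(lambda x: norm.pdf(x, mu, sigma) ** 2, -np.inf, np.inf)

# ... and compare it against the closed form
assert np.isclose(numeric, 1 / (2 * sigma * np.sqrt(np.pi)))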

Example 3

A ResidualEstimator fits a second regressor to the residuals of the point
estimator, yielding a per-sample standard deviation. Such a model should
achieve a lower linearized log loss than a baseline with a constant variance
estimate.

from sklearn.linear_model import LinearRegression

from skpro.metrics import linearized_log_loss
from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import ResidualEstimator  # module path may differ between skpro versions
from skpro.workflow.manager import DataManager


def test_residual_prediction():
    data = DataManager('boston')

    baseline_model = ParametricEstimator(LinearRegression())
    model = ParametricEstimator(point=LinearRegression(),
                                std=ResidualEstimator(LinearRegression()))

    baseline = baseline_model.fit(data.X_train,
                                  data.y_train).predict(data.X_test)
    y_pred = model.fit(data.X_train, data.y_train).predict(data.X_test)

    baseline_loss = linearized_log_loss(data.y_test, baseline)
    y_pred_loss = linearized_log_loss(data.y_test, y_pred)

    assert baseline_loss > y_pred_loss
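
In plain scikit-learn terms, the idea behind the residual estimator can be
sketched as a two-stage fit (a simplification for illustration; skpro's actual
ResidualEstimator wraps this logic behind the estimator interface):

import numpy as np
from sklearn.linear_model import LinearRegression

from skpro.workflow.manager import DataManager

data = DataManager('boston')

# Stage 1: fit the point model and compute absolute training residuals
point_model = LinearRegression().fit(data.X_train, data.y_train)
residuals = np.abs(data.y_train - point_model.predict(data.X_train))

# Stage 2: fit a second regressor that predicts the residual magnitude,
# i.e. a per-sample spread, from the same features
std_model = LinearRegression().fit(data.X_train, residuals)

mu = point_model.predict(data.X_test)      # per-sample mean
sigma = std_model.predict(data.X_test)     # per-sample standard deviation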

Example 4

A complete workflow comparison: a density baseline is evaluated against a
tuned parametric composite model using cross-validation, and the results are
rendered as a table.

from sklearn.linear_model import LinearRegression

from skpro.workflow.table import Table
from skpro.workflow import Model
from skpro.workflow.manager import DataManager
from skpro.workflow.cross_validation import grid_optimizer
from skpro.metrics import log_loss
from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant
from skpro.baselines import DensityBaseline

tbl = Table()

# Loads and represents the data
data = DataManager('boston')

# Adds a model information column
tbl.info()
# Defines the cross validation using the log_loss metric and grid hyperparameter search
tbl.cv(data,
       log_loss,
       tune=True,
       optimizer=grid_optimizer(n_jobs=-1, verbose=0))

# Run the models against the workflow and print the results
tbl.print([
    # Baseline ...
    Model(DensityBaseline()),
    # ... and parametric composite model
    Model(
        ParametricEstimator(LinearRegression(), Constant('std(y)')),
        # ... whose hyperparameter is optimized during cross-validation
        tuning={'point__normalize': [True, False]},
    )
])
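
Constant('std(y)') stands for an estimator that ignores the features and
always predicts a statistic of the training targets. Here is a minimal sketch
of such an estimator; it is a hypothetical simplification that hard-codes
std(y), whereas skpro's Constant accepts an expression string or a number:

import numpy as np
from sklearn.base import BaseEstimator

class ConstantStd(BaseEstimator):
    """Hypothetical simplification of Constant('std(y)')."""

    def fit(self, X, y):
        self.constant_ = np.std(y)
        return self

    def predict(self, X):
        return np.full(len(X), self.constant_)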

Example 5

End-to-end usage outside the test suite: fit a parametric model with a random
forest point estimator on the Boston housing data, evaluate the probabilistic
loss, and plot the performance.

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant
from skpro.metrics import log_loss

# Define the parametric model
model = ParametricEstimator(point=RandomForestRegressor(),
                            std=Constant('std(y)'),
                            shape='norm')

# Train and predict on boston housing data
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Obtain the loss
loss = log_loss(y_test, y_pred, sample=True, return_std=True)
print('Loss: %f+-%f' % loss)

# Plot the performance using the local utils helper (not part of skpro)
import sys
sys.path.append('../')
import utils
utils.plot_performance(y_test, y_pred)
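
As in Example 1, utils is the local helper module from the examples directory
(hence the sys.path manipulation), not a skpro API. A plausible sketch of
plot_performance, assuming it draws the predicted means with
one-standard-deviation error bars next to the true values:

import matplotlib.pyplot as plt
import numpy as np

def plot_performance(y_true, y_pred):
    # Hypothetical helper: predicted means with 1-sigma error bars,
    # ordered by the true target value for readability
    order = np.argsort(y_true)
    x = np.arange(len(y_true))
    plt.errorbar(x, y_pred.point()[order], yerr=y_pred.std()[order],
                 fmt='.', alpha=0.5, label='prediction')
    plt.plot(x, y_true[order], 'r.', label='true value')
    plt.xlabel('sample (sorted by true value)')
    plt.ylabel('target')
    plt.legend()
    plt.show()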

Example 6

A larger workflow study: six model combinations (three point estimators times
two variance estimators) are cross-validated under two loss functions, then
ranked and sorted in a single results table.

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from skpro.workflow.table import Table, IdModifier, SortModifier, RankModifier
from skpro.workflow.utils import InfoView, InfoController
from skpro.workflow.cross_validation import CrossValidationController, CrossValidationView
from skpro.workflow import Model
from skpro.workflow.manager import DataManager
from skpro.metrics import log_loss, linearized_log_loss
from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant

# Load the dataset
data = DataManager('boston')

tbl = Table()

# Adding controllers displayed as columns
tbl.add(InfoController(), InfoView())

for loss_func in [linearized_log_loss, log_loss]:
    tbl.add(
        controller=CrossValidationController(data, loss_func=loss_func),
        view=CrossValidationView()
    )

# Rank results
tbl.modify(RankModifier())
# Sort by score in the last column, i.e. log_loss
tbl.modify(SortModifier(key=lambda x: x[-1]['data']['score']))
# Use ID modifier to display model numbers
tbl.modify(IdModifier())

# Compose the models displayed as rows
models = []
for point_estimator in [RandomForestRegressor(), LinearRegression(), Constant('mean(y)')]:
    for std_estimator in [Constant('std(y)'), Constant(42)]:
        model = ParametricEstimator(point=point_estimator, std=std_estimator)
        models.append(Model(model))

tbl.print(models)
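
The table mostly automates a comparison loop that could be written by hand.
Stripped of cross-validation, ranking, and rendering, and reusing the imports
and data from above, it boils down to something like this sketch:

for point_estimator in [RandomForestRegressor(), LinearRegression(),
                        Constant('mean(y)')]:
    for std_estimator in [Constant('std(y)'), Constant(42)]:
        estimator = ParametricEstimator(point=point_estimator,
                                        std=std_estimator)
        y_pred = estimator.fit(data.X_train, data.y_train).predict(data.X_test)
        print(estimator, log_loss(data.y_test, y_pred))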

Example 7

Bagging a probabilistic estimator: skpro's BaggingRegressor aggregates an
ensemble of parametric decision-tree models, which is compared here against a
single tree under the log loss.

from sklearn.tree import DecisionTreeRegressor

from skpro.ensemble import BaggingRegressor as SkproBaggingRegressor
from skpro.metrics import log_loss as loss
from skpro.parametric import ParametricEstimator
from skpro.workflow.manager import DataManager


def prediction(model, data):
    return model.fit(data.X_train, data.y_train).predict(data.X_test)


data = DataManager('boston')
clf = DecisionTreeRegressor()

baseline_prediction = prediction(ParametricEstimator(point=clf), data)

skpro_bagging_prediction = prediction(
    SkproBaggingRegressor(ParametricEstimator(point=clf),
                          n_estimators=10,
                          n_jobs=-1), data)

l1, l2 = loss(data.y_test, baseline_prediction), \
         loss(data.y_test, skpro_bagging_prediction)

print('Baseline: ', l1)
print('Bagged model:', l2)
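
Unlike sklearn's BaggingRegressor, which averages point predictions, a
probabilistic ensemble has to combine whole distributions. One natural
aggregation (an assumption here, not taken from the skpro source) is a uniform
mixture of the members' densities:

import numpy as np
from scipy.stats import norm

def mixture_pdf(member_pdfs, x):
    # Uniform mixture: average the member densities at x
    return np.mean([pdf(x) for pdf in member_pdfs])

# Usage sketch with scipy normals standing in for fitted ensemble members
members = [norm(20, 5).pdf, norm(22, 4).pdf, norm(21, 6).pdf]
print(mixture_pdf(members, 21.0))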

Example 8

Hyperparameter optimization with sklearn's standard GridSearchCV
meta-estimator, which works because ParametricEstimator implements the
scikit-learn estimator interface.

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV

from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant

model = ParametricEstimator(point=RandomForestRegressor(),
                            std=Constant('mean(y)'))

# Initiate GridSearch meta-estimator
parameters = {'point__max_depth': [None, 5, 10, 15]}
clf = GridSearchCV(model, parameters)

# Optimize hyperparameters
X, y = load_boston(return_X_y=True)
clf.fit(X, y)

print('Best score is %f for parameter: %s' %
      (clf.best_score_, clf.best_params_))
# >>> Best score is -4.058729 for parameter: {'point__max_depth': 15}
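
GridSearchCV relies on the estimator's own score method here, and the negative
best score suggests that it returns a negated loss, so values closer to zero
are better. The full grid can be inspected through sklearn's standard
cv_results_ attribute:

# Inspect every grid point via the standard sklearn results dict
for params, score in zip(clf.cv_results_['params'],
                         clf.cv_results_['mean_test_score']):
    print(params, score)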