import numpy as np
from scipy.stats import norm
from sklearn.linear_model import LinearRegression

from skpro.metrics import linearized_log_loss
from skpro.parametric import ParametricEstimator
from skpro.parametric.residuals import ResidualEstimator
from skpro.workflow.manager import DataManager

import utils


def test_simple_model():
    data = DataManager('boston')

    # Composite parametric model: the first estimator predicts the point,
    # the second the standard deviation
    model = ParametricEstimator(LinearRegression(), LinearRegression())
    y_pred = model.fit(data.X_train, data.y_train).predict(data.X_test)

    utils.assert_close_prediction(y_pred.point(), data.y_test, within=0.5)
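# utils.assert_close_prediction is a local test helper that is not part of
# skpro and is not reproduced here; a minimal sketch of what such a check
# could look like, assuming (the name and semantics are an assumption, not
# the actual helper) that `within` bounds the mean relative error of the
# point predictions:

def assert_close_prediction_sketch(y_pred, y_true, within=0.5):
    # Mean relative deviation of the point predictions from the targets
    relative_error = np.abs(y_pred - y_true) / np.abs(y_true)
    assert relative_error.mean() < within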
def test_baseline():
    data = DataManager('boston')

    # Without arguments the parametric estimator falls back to a dummy
    # baseline: constant mean and standard deviation of y_train
    model = ParametricEstimator()
    y_pred = model.fit(data.X_train, data.y_train).predict(data.X_test)

    mu = np.mean(data.y_train)
    sigma = np.std(data.y_train)

    # Is the dummy prediction working?
    assert (y_pred.point() == np.ones(len(data.X_test)) * mu).all()
    assert (y_pred.std() == np.ones(len(data.X_test)) * sigma).all()

    # Does subsetting work?
    assert len(y_pred[1:3].point()) == 2
    assert len(y_pred[1:3].lp2()) == 2

    # pdf, cdf?
    x = np.random.randint(0, 10)
    i = np.random.randint(0, len(data.X_test) - 1)
    assert y_pred[i].pdf(x) == norm.pdf(x, mu, sigma)
    assert y_pred[i].cdf(x) == norm.cdf(x, mu, sigma)
def test_residual_prediction():
    data = DataManager('boston')

    # Baseline with the default (constant) standard deviation ...
    baseline_model = ParametricEstimator(LinearRegression())
    # ... versus a model that learns the standard deviation from the
    # training residuals
    model = ParametricEstimator(
        point=LinearRegression(),
        std=ResidualEstimator(LinearRegression())
    )

    baseline = baseline_model.fit(data.X_train, data.y_train).predict(data.X_test)
    y_pred = model.fit(data.X_train, data.y_train).predict(data.X_test)

    baseline_loss = linearized_log_loss(data.y_test, baseline)
    y_pred_loss = linearized_log_loss(data.y_test, y_pred)

    # The residual-based variance estimate should reduce the loss
    assert baseline_loss > y_pred_loss
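# The linearized log loss guards the ordinary log loss against predicted
# densities near zero, where -log(p) diverges; a rough sketch of the idea
# (an illustration of the linearization concept, not skpro's actual
# implementation), assuming -log(p) is replaced by its tangent line below
# a cutoff epsilon:

def linearized_log_loss_sketch(y_true, y_pred, epsilon=1e-10):
    # Density of each predicted distribution at the observed target
    pdf = np.array([y_pred[i].pdf(y) for i, y in enumerate(y_true)])
    # Below epsilon, replace -log(p) with its first-order Taylor
    # expansion at epsilon, yielding a large but finite penalty
    return np.where(
        pdf >= epsilon,
        -np.log(np.maximum(pdf, epsilon)),
        -np.log(epsilon) - (pdf - epsilon) / epsilon,
    ).mean()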
from sklearn.linear_model import LinearRegression

from skpro.workflow import Model
from skpro.workflow.manager import DataManager
from skpro.workflow.table import Table
from skpro.workflow.cross_validation import grid_optimizer
from skpro.metrics import log_loss
from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant
from skpro.baselines import DensityBaseline

tbl = Table()

# Loads and represents the data
data = DataManager('boston')

# Adds a model information column
tbl.info()

# Defines the cross validation using the log_loss metric and
# grid hyperparameter search
tbl.cv(data, log_loss, tune=True, optimizer=grid_optimizer(n_jobs=-1, verbose=0))

# Runs the models against the workflow and prints the results
tbl.print([
    # Baseline ...
    Model(DensityBaseline()),
    # ... and parametric composite model ...
    Model(
        ParametricEstimator(LinearRegression(), Constant('std(y)')),
        # ... specifying which hyperparameters shall be optimized
        tuning={'point__normalize': [True, False]},
    )
])
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant
from skpro.metrics import log_loss

# Define the parametric model: random forest point predictions,
# constant std(y) variance estimate, normal predictive distribution
model = ParametricEstimator(point=RandomForestRegressor(), std=Constant('std(y)'), shape='norm')

# Train and predict on boston housing data
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Obtain the loss
loss = log_loss(y_test, y_pred, sample=True, return_std=True)

print('Loss: %f+-%f' % loss)

# Plot the performance
import sys
sys.path.append('../')
import utils
utils.plot_performance(y_test, y_pred)
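# y_pred is a distribution object rather than a plain array; beyond the
# point estimate it can be queried per sample, using only the interface
# exercised in the tests above (point(), std(), indexing, pdf, cdf):
print(y_pred.point()[:3])   # point predictions
print(y_pred.std()[:3])     # predicted standard deviations
print(y_pred[0].pdf(20.0))  # density of the first test sample at y = 20
print(y_pred[0].cdf(20.0))  # P(y <= 20) for the first test sample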
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from skpro.workflow import Model
from skpro.workflow.manager import DataManager
from skpro.workflow.table import Table, IdModifier, SortModifier, RankModifier
from skpro.workflow.cross_validation import CrossValidationController, CrossValidationView
from skpro.workflow.utils import InfoController, InfoView
from skpro.metrics import log_loss, linearized_log_loss
from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant

# Load the dataset
data = DataManager('boston')

tbl = Table()

# Adding controllers displayed as columns
tbl.add(InfoController(), InfoView())
for loss_func in [linearized_log_loss, log_loss]:
    tbl.add(
        controller=CrossValidationController(data, loss_func=loss_func),
        view=CrossValidationView()
    )

# Rank results
tbl.modify(RankModifier())
# Sort by score in the last column, i.e. log_loss
tbl.modify(SortModifier(key=lambda x: x[-1]['data']['score']))
# Use ID modifier to display model numbers
tbl.modify(IdModifier())

# Compose the models displayed as rows
models = []
for point_estimator in [RandomForestRegressor(), LinearRegression(), Constant('mean(y)')]:
    for std_estimator in [Constant('std(y)'), Constant(42)]:
        model = ParametricEstimator(point=point_estimator, std=std_estimator)
        models.append(Model(model))

tbl.print(models)
from sklearn.tree import DecisionTreeRegressor

from skpro.ensemble import BaggingRegressor as SkproBaggingRegressor
from skpro.metrics import log_loss as loss
from skpro.parametric import ParametricEstimator
from skpro.workflow.manager import DataManager


def prediction(model, data):
    return model.fit(data.X_train, data.y_train).predict(data.X_test)


data = DataManager('boston')
clf = DecisionTreeRegressor()

# Point prediction baseline versus a bagged ensemble of the same estimator
baseline_prediction = prediction(ParametricEstimator(point=clf), data)
skpro_bagging_prediction = prediction(
    SkproBaggingRegressor(ParametricEstimator(point=clf), n_estimators=10, n_jobs=-1),
    data
)

l1, l2 = loss(data.y_test, baseline_prediction), \
         loss(data.y_test, skpro_bagging_prediction)

print('Baseline:    ', l1)
print('Bagged model:', l2)
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV

from skpro.parametric import ParametricEstimator
from skpro.parametric.estimators import Constant

model = ParametricEstimator(point=RandomForestRegressor(), std=Constant('mean(y)'))

# Initiate the GridSearch meta-estimator
parameters = {'point__max_depth': [None, 5, 10, 15]}
clf = GridSearchCV(model, parameters)

# Optimize the hyperparameters
X, y = load_boston(return_X_y=True)
clf.fit(X, y)

print('Best score is %f for parameter: %s' % (clf.best_score_, clf.best_params_))
# >>> Best score is -4.058729 for parameter: {'point__max_depth': 15}
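# GridSearchCV refits the best configuration on the full data by default,
# so the tuned model is ready for probabilistic prediction via the
# standard sklearn attribute:
best_model = clf.best_estimator_
y_pred = best_model.predict(X)
print(y_pred.point()[:3], y_pred.std()[:3])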