def test_Pipegraph__example_1_no_connections(self):
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import LinearRegression
    from pipegraph import PipeGraphRegressor

    X = np.random.rand(100, 1)
    y = 4 * X + 0.5 * np.random.randn(100, 1)

    scaler = MinMaxScaler()
    linear_model = LinearRegression()
    steps = [('scaler', scaler),
             ('linear_model', linear_model)]
    pgraph = PipeGraphRegressor(steps=steps)

    self.assertTrue(pgraph._pipegraph.fit_connections is None)
    self.assertTrue(pgraph._pipegraph.predict_connections is None)

    pgraph.fit(X, y)
    y_pred = pgraph.predict(X)
    self.assertEqual(y_pred.shape[0], y.shape[0])

    self.assertEqual(pgraph._pipegraph.fit_connections,
                     dict(scaler={'X': 'X'},
                          linear_model={'X': ('scaler', 'predict'),
                                        'y': 'y'}))
    self.assertEqual(pgraph._pipegraph.predict_connections,
                     dict(scaler={'X': 'X'},
                          linear_model={'X': ('scaler', 'predict'),
                                        'y': 'y'}))
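# For context, the default connections asserted in the test above are the same
# mapping one would pass explicitly. A minimal sketch (assuming the
# fit_connections keyword used by the other fixtures in this file):
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from pipegraph import PipeGraphRegressor

X = np.random.rand(100, 1)
y = 4 * X + 0.5 * np.random.randn(100, 1)

steps = [('scaler', MinMaxScaler()),
         ('linear_model', LinearRegression())]
connections = {'scaler': {'X': 'X'},
               'linear_model': {'X': ('scaler', 'predict'),
                                'y': 'y'}}

pgraph = PipeGraphRegressor(steps=steps, fit_connections=connections)
pgraph.fit(X, y)
y_pred = pgraph.predict(X)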
def setUp(self):
    self.size = 100
    self.X = pd.DataFrame(dict(X=np.random.rand(self.size, )))
    self.y = pd.DataFrame(dict(y=np.random.rand(self.size, )))
    sc = MinMaxScaler()
    lm = LinearRegression()
    neutral_regressor = NeutralRegressor()

    steps = [('scaler', sc),
             ('model', lm)]
    connections = {'scaler': {'X': 'X'},
                   'model': {'X': ('scaler', 'predict'),
                             'y': 'y'}}
    model = PipeGraphRegressor(steps, connections)

    steps = [('scaler', sc),
             ('model', lm),
             ('neutral', neutral_regressor)]
    connections = {'scaler': {'X': 'X'},
                   'model': {'X': ('scaler', 'predict'),
                             'y': 'y'},
                   'neutral': {'X': 'model'}}
    model_custom = PipeGraphRegressor(steps, connections)

    self.sc = sc
    self.lm = lm
    self.model = model
    self.model_custom = model_custom
def setUp(self):
    X_first = pd.Series(np.random.rand(1000, ))
    y_first = pd.Series(4 * X_first + 0.5 * np.random.randn(1000, ))
    X_second = pd.Series(np.random.rand(1000, ) + 3)
    y_second = pd.Series(-4 * X_second + 0.5 * np.random.randn(1000, ))
    X_third = pd.Series(np.random.rand(1000, ) + 6)
    y_third = pd.Series(2 * X_third + 0.5 * np.random.randn(1000, ))
    self.X = pd.concat([X_first, X_second, X_third], axis=0).to_frame()
    self.y = pd.concat([y_first, y_second, y_third], axis=0).to_frame()

    scaler = MinMaxScaler()
    gaussian_mixture = GaussianMixture(n_components=3)
    models = RegressorsWithDataDependentNumberOfReplicas(
        steps=[('regressor', LinearRegression())])
    neutral_regressor = NeutralRegressor()

    steps = [('scaler', scaler),
             ('classifier', gaussian_mixture),
             ('models', models),
             ('neutral', neutral_regressor)]
    connections = {'scaler': {'X': 'X'},
                   'classifier': {'X': 'scaler'},
                   'models': {'X': 'scaler',
                              'y': 'y',
                              'selection': 'classifier'},
                   'neutral': {'X': 'models'}}
    self.pgraph = PipeGraphRegressor(steps=steps, fit_connections=connections)
    self.pgraph.fit(self.X, self.y)
models = RegressorsWithParametrizedNumberOfReplicas(number_of_replicas=3,
                                                    model_prototype=LinearRegression(),
                                                    model_parameters={})

steps = [('scaler', scaler),
         ('classifier', gaussian_mixture),
         ('models', models)]
connections = {'scaler': {'X': 'X'},
               'classifier': {'X': 'scaler'},
               'models': {'X': 'scaler',
                          'y': 'y',
                          'selection': 'classifier'}}

pgraph = PipeGraphRegressor(steps=steps, fit_connections=connections)
pgraph.fit(X, y)
y_pred = pgraph.predict(X)
plt.scatter(X, y)
plt.scatter(X, y_pred)
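# Since number_of_replicas is an ordinary constructor parameter, it could in
# principle be tuned like any other nested parameter. Hypothetical sketch:
# the name 'models__number_of_replicas' assumes the step exposes
# number_of_replicas through get_params(), which is not verified here.
from sklearn.model_selection import GridSearchCV

param_grid = {'models__number_of_replicas': [2, 3, 4]}
gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
gs.fit(X, y)
print("Best number of replicas:",
      gs.best_estimator_.get_params()['models__number_of_replicas'])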
def test_Pipegraph__ex_3_inject(self):
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV
    from pipegraph.base import PipeGraphRegressor
    from pipegraph.demo_blocks import CustomPower

    X = pd.DataFrame(dict(X=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                          sample_weight=np.array([0.01, 0.95, 0.10, 0.95, 0.95, 0.10,
                                                  0.10, 0.95, 0.95, 0.95, 0.01])))
    y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

    scaler = MinMaxScaler()
    polynomial_features = PolynomialFeatures()
    linear_model = LinearRegression()
    custom_power = CustomPower()
    selector = ColumnSelector(mapping={'X': slice(0, 1),
                                       'sample_weight': slice(1, 2)})

    steps = [('selector', selector),
             ('custom_power', custom_power),
             ('scaler', scaler),
             ('polynomial_features', polynomial_features),
             ('linear_model', linear_model)]

    pgraph = PipeGraphRegressor(steps=steps)
    self.assertTrue(pgraph._pipegraph.fit_connections is None)
    self.assertTrue(pgraph._pipegraph.predict_connections is None)

    (pgraph.inject(sink='selector', sink_var='X', source='_External', source_var='X')
           .inject('custom_power', 'X', 'selector', 'sample_weight')
           .inject('scaler', 'X', 'selector', 'X')
           .inject('polynomial_features', 'X', 'scaler')
           .inject('linear_model', 'X', 'polynomial_features')
           .inject('linear_model', 'y', source_var='y')
           .inject('linear_model', 'sample_weight', 'custom_power'))

    self.assertTrue(pgraph._pipegraph.fit_connections is not None)
    self.assertTrue(pgraph._pipegraph.predict_connections is not None)
    pgraph.fit(X, y)

    self.assertEqual(pgraph._pipegraph.fit_connections,
                     {'selector': {'X': ('_External', 'X')},
                      'custom_power': {'X': ('selector', 'sample_weight')},
                      'scaler': {'X': ('selector', 'X')},
                      'polynomial_features': {'X': ('scaler', 'predict')},
                      'linear_model': {'X': ('polynomial_features', 'predict'),
                                       'y': ('_External', 'y'),
                                       'sample_weight': ('custom_power', 'predict')}})

    self.assertEqual(pgraph._pipegraph.predict_connections,
                     {'selector': {'X': ('_External', 'X')},
                      'custom_power': {'X': ('selector', 'sample_weight')},
                      'scaler': {'X': ('selector', 'X')},
                      'polynomial_features': {'X': ('scaler', 'predict')},
                      'linear_model': {'X': ('polynomial_features', 'predict'),
                                       'y': ('_External', 'y'),
                                       'sample_weight': ('custom_power', 'predict')}})
               'X': 'X'},
               'classifier': {'X': 'scaler'},
               'models': {'X': 'scaler',
                          'y': 'y',
                          'selection': 'classifier'},
               'neutral': {'X': 'models'}}

pgraph = PipeGraphRegressor(steps=steps, fit_connections=connections)

##############################################################################################################
# Using GridSearchCV to find the best number of clusters and the best regressors
#
from sklearn.model_selection import GridSearchCV

param_grid = {'classifier__n_components': range(2, 10)}
gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_train)

plt.scatter(X_train, y_train)
plt.scatter(X_train, y_pred)

print("Score:", gs.score(X_test, y_test))
print("classifier__n_components:",
      gs.best_estimator_.get_params()['classifier__n_components'])
###############################################################################
# Secondly, we define the steps and a ``param_grid`` dictionary as specified by
# :class:`GridSearchCV`. In this case we just want to explore a few possibilities,
# varying the degree of the polynomials and whether or not to fit an intercept
# in the linear model.

steps = [('scaler', scaler),
         ('polynomial_features', polynomial_features),
         ('linear_model', linear_model)]

param_grid = {'polynomial_features__degree': range(1, 11),
              'linear_model__fit_intercept': [True, False]}

###############################################################################
# Now we use ``PipeGraphRegressor`` as the estimator for :class:`GridSearchCV`
# and perform the ``fit`` and ``predict`` operations.

pgraph = PipeGraphRegressor(steps=steps)
grid_search_regressor = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
grid_search_regressor.fit(X, y)
y_pred = grid_search_regressor.predict(X)

plt.scatter(X, y)
plt.scatter(X, y_pred)
plt.show()

coef = grid_search_regressor.best_estimator_.get_params()['linear_model'].coef_
degree = grid_search_regressor.best_estimator_.get_params()['polynomial_features'].degree
print(
class TestModelsWithDataDependentNumberOfReplicas(unittest.TestCase):
    def setUp(self):
        X_first = pd.Series(np.random.rand(1000, ))
        y_first = pd.Series(4 * X_first + 0.5 * np.random.randn(1000, ))
        X_second = pd.Series(np.random.rand(1000, ) + 3)
        y_second = pd.Series(-4 * X_second + 0.5 * np.random.randn(1000, ))
        X_third = pd.Series(np.random.rand(1000, ) + 6)
        y_third = pd.Series(2 * X_third + 0.5 * np.random.randn(1000, ))
        self.X = pd.concat([X_first, X_second, X_third], axis=0).to_frame()
        self.y = pd.concat([y_first, y_second, y_third], axis=0).to_frame()

        scaler = MinMaxScaler()
        gaussian_mixture = GaussianMixture(n_components=3)
        models = RegressorsWithDataDependentNumberOfReplicas(
            steps=[('regressor', LinearRegression())])
        neutral_regressor = NeutralRegressor()

        steps = [('scaler', scaler),
                 ('classifier', gaussian_mixture),
                 ('models', models),
                 ('neutral', neutral_regressor)]
        connections = {'scaler': {'X': 'X'},
                       'classifier': {'X': 'scaler'},
                       'models': {'X': 'scaler',
                                  'y': 'y',
                                  'selection': 'classifier'},
                       'neutral': {'X': 'models'}}
        self.pgraph = PipeGraphRegressor(steps=steps, fit_connections=connections)
        self.pgraph.fit(self.X, self.y)

    def test_ModelsWithDataDependentNumberOfReplicas__connections(self):
        X = self.X
        y = self.y
        pgraph = self.pgraph
        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)

        self.assertTrue(isinstance(pgraph.named_steps['models'],
                                   RegressorsWithDataDependentNumberOfReplicas))

        result_connections = pgraph.named_steps['models']._pipegraph.fit_connections
        expected_connections = {'regressorsBundle': {'X': 'X',
                                                     'selection': 'selection',
                                                     'y': 'y'}}
        self.assertEqual(result_connections, expected_connections)

        result_steps = sorted(list(pgraph.named_steps.keys()))
        expected_steps = sorted(['scaler', 'classifier', 'models', 'neutral'])
        self.assertEqual(result_steps, expected_steps)
        self.assertEqual(y_pred.shape[0], y.shape[0])

    def test_ModelsWithDataDependentNumberOfReplicas__predict(self):
        X = self.X
        y = self.y
        pgraph = self.pgraph
        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)
        self.assertEqual(y_pred.shape[0], y.shape[0])

    def test_ModelsWithDataDependentNumberOfReplicas__score(self):
        X = self.X
        y = self.y
        pgraph = self.pgraph
        pgraph.fit(X, y)
        result = pgraph.score(X, y)
        self.assertTrue(result > -42)

    def test_ModelsWithDataDependentNumberOfReplicas__GridSearchCV(self):
        X = self.X
        y = self.y
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        pgraph = self.pgraph
        param_grid = {'classifier__n_components': range(2, 10)}
        gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
        gs.fit(X_train, y_train)
        result = gs.score(X_test, y_test)
        self.assertTrue(result > -42)
# Next, we define the steps and use :class:`PipeGraphRegressor` as the
# estimator for :class:`GridSearchCV`.

scaler = MinMaxScaler()
polynomial_features = PolynomialFeatures()
linear_model = LinearRegression()
custom_power = CustomPower()
selector = ColumnSelector(mapping={'X': slice(0, 1),
                                   'sample_weight': slice(1, 2)})

steps = [('selector', selector),
         ('custom_power', custom_power),
         ('scaler', scaler),
         ('polynomial_features', polynomial_features),
         ('linear_model', linear_model)]

pgraph = PipeGraphRegressor(steps=steps)

(pgraph.inject(sink='selector', sink_var='X', source='_External', source_var='X')
       .inject('custom_power', 'X', 'selector', 'sample_weight')
       .inject('scaler', 'X', 'selector', 'X')
       .inject('polynomial_features', 'X', 'scaler')
       .inject('linear_model', 'X', 'polynomial_features')
       .inject('linear_model', 'y', source_var='y')
       .inject('linear_model', 'sample_weight', 'custom_power'))

###############################################################################
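# The grid search itself is not shown in this excerpt. A minimal sketch of how
# it might continue, reusing the ``param_grid`` pattern from the earlier
# example (the parameter values below are illustrative, not necessarily those
# of the full example):
from sklearn.model_selection import GridSearchCV

param_grid = {'polynomial_features__degree': range(1, 11),
              'linear_model__fit_intercept': [True, False]}

grid_search_regressor = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
grid_search_regressor.fit(X, y)
y_pred = grid_search_regressor.predict(X)

###############################################################################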