def setUp(self):
    """Prepare random data and a scaler -> GaussianNB PipeGraph fixture.

    Stores ``size``, the ``X``/``y`` DataFrames, both estimators (``sc``,
    ``nb``) and the assembled graph (``model``) on the test instance.
    """
    self.size = 100

    # Draw the features first and the labels second so the random stream is
    # consumed in a fixed order.
    features = np.random.rand(self.size, )
    self.X = pd.DataFrame({'X': features})
    labels = (np.random.rand(self.size, ) > 0.5).astype(int)
    self.y = pd.DataFrame({'y': labels})

    self.sc = MinMaxScaler()
    self.nb = GaussianNB()

    pipeline_steps = [('scaler', self.sc), ('model', self.nb)]
    wiring = dict(
        scaler=dict(X='X'),
        model=dict(X=('scaler', 'predict'), y='y'),
    )
    self.model = PipeGraph(pipeline_steps, wiring)
def test_Pipegraph__example_1_no_connections(self):
    """Exercise a PipeGraph that is built from steps only.

    The test asserts that both connection attributes are ``None`` right
    after construction and that, once ``fit`` has run, both equal the
    linear chain ``scaler -> linear_model``.
    """
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import LinearRegression
    from pipegraph import PipeGraph

    X = np.random.rand(100, 1)
    y = 4 * X + 0.5 * np.random.randn(100, 1)

    scaler = MinMaxScaler()
    linear_model = LinearRegression()
    steps = [('scaler', scaler), ('linear_model', linear_model)]
    pgraph = PipeGraph(steps=steps)

    # assertIsNone reports the offending value on failure, unlike
    # assertTrue(x is None) which only reports "False is not true".
    self.assertIsNone(pgraph.fit_connections)
    self.assertIsNone(pgraph.predict_connections)

    pgraph.fit(X, y)
    y_pred = pgraph.predict(X)
    self.assertEqual(y_pred.shape[0], y.shape[0])

    # The same mapping is expected for both fit and predict connections.
    expected_connections = {
        'scaler': {'X': 'X'},
        'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
    }
    self.assertEqual(pgraph.fit_connections, expected_connections)
    self.assertEqual(pgraph.predict_connections, expected_connections)
def setUp(self):
    """Create three-segment data and a four-step PipeGraph.

    ``X`` is the concatenation of three 100-sample segments offset by 0, 3
    and 6; ``y`` follows slopes 4, -4 and 2 on the respective segment plus
    Gaussian noise scaled by 0.5. The graph (``self.pgraph``) wires
    scaler, classifier, replicated regressors and a neutral regressor.
    """
    x_parts = []
    y_parts = []
    # (offset added to X, slope applied to X) per segment. Each iteration
    # draws rand() then randn(), matching the original draw order.
    for offset, slope in ((0, 4), (3, -4), (6, 2)):
        x_segment = pd.Series(np.random.rand(100, ) + offset)
        y_segment = pd.Series(slope * x_segment + 0.5 * np.random.randn(100, ))
        x_parts.append(x_segment)
        y_parts.append(y_segment)

    self.X = pd.concat(x_parts, axis=0).to_frame()
    self.y = pd.concat(y_parts, axis=0).to_frame()

    steps = [
        ('scaler', MinMaxScaler()),
        ('classifier', GaussianMixture(n_components=3)),
        ('models', RegressorsWithParametrizedNumberOfReplicas(
            number_of_replicas=3, regressor=LinearRegression())),
        ('neutral', NeutralRegressor()),
    ]
    connections = dict(
        scaler=dict(X='X'),
        classifier=dict(X='scaler'),
        models=dict(X='scaler', y='y', selection='classifier'),
        neutral=dict(X='models'),
    )
    self.pgraph = PipeGraph(steps=steps, fit_connections=connections)
def setUp(self):
    """Build a single-step PipeGraph around LinearRegression.

    ``y`` is exactly ``2 * X``; no connections are passed to PipeGraph.
    """
    self.size = 100
    self.X = np.random.rand(self.size, 1)
    self.y = 2 * self.X

    self.lm = LinearRegression()
    self.steps = [('linear_model', self.lm)]
    self.pgraph = PipeGraph(steps=self.steps)
def setUp(self):
    """Assemble and fit the clustering + regression graph used by the tests.

    Besides the regular ``steps``/``connections`` pair, a second pair
    (``steps_external``/``connections_external``) is stored in which the
    first step is named ``'_External'`` instead of ``'Concatenate_Xy'``.
    ``self.pgraph`` is fitted on ``X``/``y`` before the method returns.
    """
    self.size = 1000
    self.X = pd.DataFrame({'X': np.random.rand(self.size, )})
    self.y = pd.DataFrame({'y': np.random.rand(self.size, )})

    concat = Concatenator()
    gmm = GaussianMixture(n_components=3)
    density = DBSCAN(eps=0.5)
    combiner = CustomCombination()
    regressor = LinearRegression()

    self.steps = [
        ('Concatenate_Xy', concat),
        ('Gaussian_Mixture', gmm),
        ('Dbscan', density),
        ('Combine_Clustering', combiner),
        ('Regressor', regressor),
    ]
    self.connections = {
        'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
        'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
        'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
        'Combine_Clustering': {
            'dominant': ('Dbscan', 'predict'),
            'other': ('Gaussian_Mixture', 'predict'),
        },
        'Regressor': {'X': 'X', 'y': 'y'},
    }

    # Same estimators, but the first step carries the name '_External'.
    self.steps_external = [
        ('_External', concat),
        ('Gaussian_Mixture', gmm),
        ('Dbscan', density),
        ('Combine_Clustering', combiner),
        ('Regressor', regressor),
    ]
    # NOTE(review): the downstream entries still point at 'Concatenate_Xy',
    # which is not a step name in steps_external; kept exactly as written.
    self.connections_external = {
        '_External': {'df1': 'X', 'df2': 'y'},
        'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
        'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
        'Combine_Clustering': {
            'dominant': ('Dbscan', 'predict'),
            'other': ('Gaussian_Mixture', 'predict'),
        },
        'Regressor': {'X': 'X', 'y': 'y'},
    }

    self.pgraph = PipeGraph(steps=self.steps, fit_connections=self.connections)
    self.pgraph.fit(self.X, self.y)
def test_Pipegraph__filter_nodes_predict(self):
    """With predict connections for 'Regressor' only, expect just that node."""
    regressor_only = {'Regressor': {'X': 'X', 'y': 'y'}}
    graph = PipeGraph(
        steps=self.steps,
        fit_connections=self.connections,
        predict_connections=regressor_only,
    )
    graph.fit(self.X, self.y)

    self.assertEqual(list(graph._filter_predict_nodes()), ['Regressor'])
def test_compositable__isinstance(self):
    """Wrap the fixture graph as the only step of another PipeGraph.

    Checks ``named_steps`` of the wrapper and that the wrapper's
    ``'predict'`` output has as many rows as the inner graph's.
    """
    wrapper = PipeGraph(steps=[('pgraph', self.pgraph)])
    self.assertEqual(wrapper.named_steps, {'pgraph': self.pgraph})

    wrapper.fit(self.X, self.y)
    outer_prediction = wrapper.predict(self.X)['predict']
    inner_prediction = self.pgraph.predict(self.X)['predict']

    self.assertEqual(outer_prediction.shape[0], inner_prediction.shape[0])
def setUp(self):
    """Create a scaler -> LinearRegression graph and a variant with a neutral tail.

    ``self.model`` has two steps; ``self.model_custom`` reuses the same
    scaler and regressor instances and appends a ``NeutralRegressor`` fed
    from ``'model'``.
    """
    self.size = 100
    self.X = pd.DataFrame({'X': np.random.rand(self.size, )})
    self.y = pd.DataFrame({'y': np.random.rand(self.size, )})

    self.sc = MinMaxScaler()
    self.lm = LinearRegression()
    neutral = NeutralRegressor()

    base_steps = [('scaler', self.sc), ('model', self.lm)]
    base_connections = dict(
        scaler=dict(X='X'),
        model=dict(X=('scaler', 'predict'), y='y'),
    )
    self.model = PipeGraph(base_steps, base_connections)

    # Fresh list and dicts for the second graph so nothing is shared with
    # the first one apart from the estimator instances.
    custom_steps = [('scaler', self.sc), ('model', self.lm), ('neutral', neutral)]
    custom_connections = dict(
        scaler=dict(X='X'),
        model=dict(X=('scaler', 'predict'), y='y'),
        neutral=dict(X='model'),
    )
    self.model_custom = PipeGraph(custom_steps, custom_connections)
def test_Pipegraph__predict_connections(self):
    """Without explicit predict connections, every step is a predict node."""
    expected_nodes = [
        'Concatenate_Xy',
        'Gaussian_Mixture',
        'Dbscan',
        'Combine_Clustering',
        'Regressor',
    ]

    graph = PipeGraph(self.steps, self.connections)
    graph.fit(self.X, self.y)
    found_nodes = list(graph._filter_predict_nodes())

    # Compared after sorting: only membership matters here, not order.
    self.assertEqual(sorted(found_nodes), sorted(expected_nodes))
def setUp(self):
    """Build a one-step LinearRegression graph plus a parameter grid.

    The grid toggles ``fit_intercept`` and ``normalize`` of the
    ``'linear_model'`` step.
    """
    self.size = 100
    self.X = np.random.rand(self.size, 1)
    self.y = 2 * self.X

    self.lm = LinearRegression()
    self.steps = [('linear_model', self.lm)]
    self.connections = {'linear_model': {'X': 'X', 'y': 'y'}}
    self.pgraph = PipeGraph(steps=self.steps, fit_connections=self.connections)

    self.param_grid = {
        'linear_model__fit_intercept': [False, True],
        'linear_model__normalize': [True, False],
    }
def setUp(self):
    """Assemble the six-step graph that ends in a Paella-weighted regressor.

    The ``'Paella'`` step receives ``X``, ``y`` and the combined clustering
    as ``classification``; its ``'predict'`` output is wired into the
    final regressor as ``sample_weight``. The graph is built but not fitted.
    """
    self.size = 100
    self.X = pd.DataFrame({'X': np.random.rand(self.size, )})
    self.y = pd.DataFrame({'y': np.random.rand(self.size, )})

    concat = Concatenator()
    gmm = GaussianMixture(n_components=3)
    density = DBSCAN(eps=0.5)
    combiner = CustomCombination()
    # The regressor is passed as the class itself, not as an instance.
    paella = Paella(
        regressor=LinearRegression,
        noise_label=None,
        max_it=10,
        regular_size=100,
        minimum_size=30,
        width_r=0.95,
        power=10,
        random_state=42,
    )
    final_regressor = LinearRegression()

    self.steps = [
        ('Concatenate_Xy', concat),
        ('Gaussian_Mixture', gmm),
        ('Dbscan', density),
        ('Combine_Clustering', combiner),
        ('Paella', paella),
        ('Regressor', final_regressor),
    ]
    self.connections = {
        'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
        'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
        'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
        'Combine_Clustering': {
            'dominant': ('Dbscan', 'predict'),
            'other': ('Gaussian_Mixture', 'predict'),
        },
        'Paella': {
            'X': 'X',
            'y': 'y',
            'classification': ('Combine_Clustering', 'predict'),
        },
        'Regressor': {
            'X': 'X',
            'y': 'y',
            'sample_weight': ('Paella', 'predict'),
        },
    }
    self.pgraph = PipeGraph(steps=self.steps, fit_connections=self.connections)
def test_Pipegraph__some_predict_connections(self):
    """Predict connections for three steps should yield exactly those nodes."""
    partial_connections = {
        'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
        'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
        'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
    }
    expected_nodes = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']

    graph = PipeGraph(
        steps=self.steps,
        fit_connections=self.connections,
        predict_connections=partial_connections,
    )
    graph.fit(self.X, self.y)
    found_nodes = list(graph._filter_predict_nodes())

    # Order-insensitive comparison.
    self.assertEqual(sorted(found_nodes), sorted(expected_nodes))
def setUp(self):
    """Build a scaler -> linear_model graph and a grid over the regressor.

    ``y`` is exactly ``X * 2``. The grid toggles ``fit_intercept`` and
    ``normalize`` of the ``'linear_model'`` step.
    """
    self.size = 1000
    self.X = np.random.rand(self.size, 1)
    self.y = self.X * 2

    self.sc = MinMaxScaler(feature_range=(0, 1))
    self.lm = LinearRegression()
    self.steps = [('scaler', self.sc), ('linear_model', self.lm)]
    self.connections = {
        'scaler': {'X': 'X'},
        'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
    }
    self.pgraph = PipeGraph(steps=self.steps, fit_connections=self.connections)

    self.param_grid = {
        'linear_model__fit_intercept': [False, True],
        'linear_model__normalize': [True, False],
    }
def setUp(self):
    """Prepare a three-step graph and a grid over two connection layouts.

    ``connections_1`` names ``'scaler'`` as the source for ``'gaussian'``;
    ``connections_2`` names it as the source for ``'kmeans'``. The graph is
    built with the first layout and ``param_grid`` offers both as values
    of ``fit_connections``.
    """
    self.size = 100
    self.X = np.random.rand(self.size, 1)
    self.y = 2 * self.X

    self.sc = MinMaxScaler()
    self.gm = GaussianMixture(n_components=3)
    self.km = KMeans(n_clusters=4)
    self.steps = [
        ('scaler', self.sc),
        ('gaussian', self.gm),
        ('kmeans', self.km),
    ]

    self.connections_1 = {'scaler': {'X': 'X'}, 'gaussian': 'scaler'}
    self.connections_2 = {'scaler': {'X': 'X'}, 'kmeans': 'scaler'}

    self.pgraph = PipeGraph(steps=self.steps, fit_connections=self.connections_1)
    self.param_grid = {
        'fit_connections': [self.connections_1, self.connections_2],
    }
def setUp(self):
    """Build a clustering -> classification graph wired through ``inject``.

    Both steps read the external ``X``; the classifier's ``y`` is taken
    from the ``'predict'`` output of the clustering step.
    """
    self.X, self.y = datasets.make_blobs(n_samples=10000, n_features=5, centers=10)

    graph = PipeGraph(steps=[
        ('clustering', KMeans(n_clusters=10)),
        ('classification', LinearDiscriminantAnalysis()),
    ])

    # (sink, sink_var, source, source_var), injected in this order.
    wiring = (
        ('clustering', 'X', '_External', 'X'),
        ('classification', 'X', '_External', 'X'),
        ('classification', 'y', 'clustering', 'predict'),
    )
    for sink, sink_var, source, source_var in wiring:
        graph.inject(sink=sink, sink_var=sink_var,
                     source=source, source_var=source_var)

    self.pgraph = graph
# Wire the steps: scaler reads the external X, 'bundle' consumes the scaler
# output together with y, and 'neutral' consumes the bundle output and y.
connections = dict(
    scaler=dict(X='X'),
    bundle=dict(X='scaler', y='y'),
    neutral=dict(X='bundle', y='y'),
)
pgraph = PipeGraph(steps=steps, fit_connections=connections)

##############################################################################################################
# Using GridSearchCV to find the best number of clusters and the best regressors

from sklearn.model_selection import GridSearchCV

# Search over the number of mixture components of the classifier nested
# inside the 'bundle' step.
param_grid = {'bundle__classifier__n_components': range(3, 10)}
gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_train)

# Plot training targets and the fitted predictions on the same axes.
plt.scatter(X_train, y_train)
plt.scatter(X_train, y_pred)

print("Score:", gs.score(X_test, y_test))
print("bundle__classifier__n_components:",
      gs.best_estimator_.get_params()['bundle__classifier__n_components'])
'y': ('demux', 'y_1') }, 'lm_2': { 'X': ('demux', 'X_2'), 'y': ('demux', 'y_2') }, 'mux': { '0': 'lm_0', '1': 'lm_1', '2': 'lm_2', 'selection': 'selection' } } three_multiplexed_models = PipeGraph( steps=three_multiplexed_models_steps, fit_connections=three_multiplexed_models_connections) ######################################################################################################### # Now we can treat this PipeGraph as a reusable component and use it as a unitary step in another PipeGraph: scaler = MinMaxScaler() gaussian_mixture = GaussianMixture(n_components=3) models = three_multiplexed_models steps = [ ('scaler', scaler), ('classifier', gaussian_mixture), ('models', three_multiplexed_models), ] connections = {
def test_Pipegraph__External_step_name(self):
    """Fitting a graph that has a step named '_External' must raise ValueError."""
    # Construction stays outside the guarded block: only fit() is expected
    # to raise.
    graph = PipeGraph(steps=self.steps_external,
                      fit_connections=self.connections_external)
    with self.assertRaises(ValueError):
        graph.fit(self.X, self.y)
def test_Pipegraph__ex_3_inject(self):
    """Wire a five-step graph with chained ``inject`` calls and verify it.

    The test asserts that both connection attributes are ``None`` before
    any injection, not ``None`` afterwards, and that after ``fit`` both
    equal the fully expanded ``(source, variable)`` mapping.
    """
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
    from pipegraph.base import PipeGraph
    from pipegraph.demo_blocks import CustomPower

    X = pd.DataFrame(
        dict(X=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
             sample_weight=np.array([
                 0.01, 0.95, 0.10, 0.95, 0.95, 0.10, 0.10, 0.95, 0.95,
                 0.95, 0.01
             ])))
    y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

    scaler = MinMaxScaler()
    polynomial_features = PolynomialFeatures()
    linear_model = LinearRegression()
    custom_power = CustomPower()
    # ColumnSelector is not imported locally; it must already be in scope
    # from the enclosing module.
    selector = ColumnSelector(mapping={
        'X': slice(0, 1),
        'sample_weight': slice(1, 2)
    })
    steps = [('selector', selector),
             ('custom_power', custom_power),
             ('scaler', scaler),
             ('polynomial_features', polynomial_features),
             ('linear_model', linear_model)]

    pgraph = PipeGraph(steps=steps)  # PipeGraphRegressor

    # assertIsNone / assertIsNotNone give a meaningful failure message,
    # unlike assertTrue(x is None).
    self.assertIsNone(pgraph.fit_connections)
    self.assertIsNone(pgraph.predict_connections)

    (pgraph.inject(sink='selector',
                   sink_var='X',
                   source='_External',
                   source_var='X')
     .inject('custom_power', 'X', 'selector', 'sample_weight')
     .inject('scaler', 'X', 'selector', 'X')
     .inject('polynomial_features', 'X', 'scaler')
     .inject('linear_model', 'X', 'polynomial_features')
     .inject('linear_model', 'y', source_var='y')
     .inject('linear_model', 'sample_weight', 'custom_power'))

    self.assertIsNotNone(pgraph.fit_connections)
    self.assertIsNotNone(pgraph.predict_connections)

    pgraph.fit(X, y)

    # Fit and predict connections are expected to be identical, so the
    # mapping is spelled out once.
    expected_connections = {
        'selector': {
            'X': ('_External', 'X')
        },
        'custom_power': {
            'X': ('selector', 'sample_weight')
        },
        'scaler': {
            'X': ('selector', 'X')
        },
        'polynomial_features': {
            'X': ('scaler', 'predict')
        },
        'linear_model': {
            'X': ('polynomial_features', 'predict'),
            'y': ('_External', 'y'),
            'sample_weight': ('custom_power', 'predict')
        }
    }
    self.assertEqual(pgraph.fit_connections, expected_connections)
    self.assertEqual(pgraph.predict_connections, expected_connections)
# Next we define the steps and we use :class:`PipeGraphRegressor` as estimator for :class:`GridSearchCV`.

scaler = MinMaxScaler()
polynomial_features = PolynomialFeatures()
linear_model = LinearRegression()
custom_power = CustomPower()
selector = ColumnSelector(
    mapping=dict(X=slice(0, 1), sample_weight=slice(1, 2)))

steps = [
    ('selector', selector),
    ('custom_power', custom_power),
    ('scaler', scaler),
    ('polynomial_features', polynomial_features),
    ('linear_model', linear_model),
]

pgraph = PipeGraph(steps=steps)

(pgraph
 .inject(sink='selector', sink_var='X', source='_External', source_var='X')
 .inject(sink='custom_power', sink_var='X', source='selector', source_var='sample_weight')
 .inject(sink='scaler', sink_var='X', source='selector', source_var='X')
 .inject(sink='polynomial_features', sink_var='X', source='scaler')
 .inject(sink='linear_model', sink_var='X', source='polynomial_features')
 .inject(sink='linear_model', sink_var='y', source_var='y')
 .inject(sink='linear_model', sink_var='sample_weight', source='custom_power'))

###############################################################################