def test_Pipegraph__example_1_no_connections(self):
    """Check connections of a two-step graph built without explicit wiring.

    Both ``fit_connections`` and ``predict_connections`` must be ``None``
    right after construction, and after ``fit`` both must equal the
    scaler -> linear_model chain asserted below.
    """
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import LinearRegression
    from pipegraph import PipeGraph

    X = np.random.rand(100, 1)
    y = 4 * X + 0.5 * np.random.randn(100, 1)
    scaler = MinMaxScaler()
    linear_model = LinearRegression()
    steps = [('scaler', scaler), ('linear_model', linear_model)]
    pgraph = PipeGraph(steps=steps)

    # assertIsNone reports the offending value on failure, unlike
    # assertTrue(x is None).
    self.assertIsNone(pgraph.fit_connections)
    self.assertIsNone(pgraph.predict_connections)

    pgraph.fit(X, y)
    y_pred = pgraph.predict(X)
    self.assertEqual(y_pred.shape[0], y.shape[0])

    # The same wiring is expected for both the fit and the predict graph.
    expected_connections = dict(
        scaler={'X': 'X'},
        linear_model={
            'X': ('scaler', 'predict'),
            'y': 'y',
        },
    )
    self.assertEqual(pgraph.fit_connections, expected_connections)
    self.assertEqual(pgraph.predict_connections, expected_connections)
def test_Pipegraph__filter_nodes_predict(self):
    """Only the step named in ``predict_connections`` is a predict node."""
    regressor_only = {'Regressor': {'X': 'X', 'y': 'y'}}
    graph = PipeGraph(
        steps=self.steps,
        fit_connections=self.connections,
        predict_connections=regressor_only,
    )
    graph.fit(self.X, self.y)

    self.assertEqual(list(graph._filter_predict_nodes()), ['Regressor'])
def test_compositable__isinstance(self):
    """A fitted PipeGraph can be used as the single step of another one.

    The outer graph must expose the inner graph through ``named_steps`` and
    produce a ``'predict'`` output with as many rows as the inner graph's.
    """
    outer_graph = PipeGraph(steps=[('pgraph', self.pgraph)])
    self.assertEqual(outer_graph.named_steps, {'pgraph': self.pgraph})

    outer_graph.fit(self.X, self.y)
    nested_output = outer_graph.predict(self.X)['predict']
    direct_output = self.pgraph.predict(self.X)['predict']

    self.assertEqual(nested_output.shape[0], direct_output.shape[0])
def test_Pipegraph__predict_connections(self):
    """Every step is a predict node when steps and connections are passed positionally."""
    graph = PipeGraph(self.steps, self.connections)
    graph.fit(self.X, self.y)

    expected_nodes = [
        'Concatenate_Xy',
        'Gaussian_Mixture',
        'Dbscan',
        'Combine_Clustering',
        'Regressor',
    ]
    # Node order is not part of the contract, so compare sorted names.
    self.assertEqual(
        sorted(graph._filter_predict_nodes()),
        sorted(expected_nodes),
    )
def test_Pipegraph__some_predict_connections(self):
    """Predict nodes are restricted to the steps listed in ``predict_connections``."""
    partial_connections = {
        'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
        'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
        'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
    }
    graph = PipeGraph(
        steps=self.steps,
        fit_connections=self.connections,
        predict_connections=partial_connections,
    )
    graph.fit(self.X, self.y)

    expected_nodes = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']
    # Node order is not part of the contract, so compare sorted names.
    self.assertEqual(
        sorted(graph._filter_predict_nodes()),
        sorted(expected_nodes),
    )
class TestTwoNodes(unittest.TestCase):
    """Fit/predict checks for a two-step graph: MinMaxScaler -> LinearRegression."""

    def setUp(self):
        """Build the two-step graph on ``y = 2 * X`` data and fit it once."""
        self.size = 1000
        self.X = np.random.rand(self.size, 1)
        self.y = self.X * 2

        self.sc = MinMaxScaler(feature_range=(0, 1))
        self.lm = LinearRegression()
        self.steps = [('scaler', self.sc), ('linear_model', self.lm)]
        self.connections = {
            'scaler': {'X': 'X'},
            'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
        }
        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
        self.param_grid = {
            'linear_model__fit_intercept': [False, True],
            'linear_model__normalize': [True, False],
        }
        self.pgraph.fit(self.X, self.y)

    def test_TwoNodes_fit(self):
        """Refit with keyword arguments and check the coefficient and output sizes."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)

        fitted_model = graph._steps_dict['linear_model']
        self.assertTrue(hasattr(fitted_model, 'coef_'))
        self.assertTrue(1.9 < fitted_model.coef_[0][0] < 2.1)

        fit_key = ('linear_model', 'predict')
        self.assertEqual(graph._fit_data[fit_key].shape[0], self.size)

        prediction = graph.predict(X=self.X)['predict']
        # The stored fit output is checked again after predict has run.
        self.assertEqual(graph._fit_data[fit_key].shape[0], self.size)
        self.assertEqual(prediction.shape[0], self.size)
class TestPipeGraphSingleNodeLinearModel(unittest.TestCase):
    """Tests for a PipeGraph whose only step is a LinearRegression."""

    def setUp(self):
        """Build the single-step graph on ``y = 2 * X`` data and fit it once."""
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X
        lm = LinearRegression()
        steps = [('linear_model', lm)]
        connections = {'linear_model': dict(X='X', y='y')}
        self.lm = lm
        self.steps = steps
        self.connections = connections
        self.pgraph = PipeGraph(steps=steps, fit_connections=connections)
        self.param_grid = dict(linear_model__fit_intercept=[False, True],
                               linear_model__normalize=[True, False])
        self.pgraph.fit(self.X, self.y)

    def _expected_params(self, copy_X):
        """Return the parameter mapping the graph is expected to expose."""
        return {
            'linear_model': self.lm,
            'linear_model__copy_X': copy_X,
            'linear_model__fit_intercept': True,
            'linear_model__n_jobs': 1,
            'linear_model__normalize': False,
            'steps': self.steps,
        }

    def _assert_param_names_present(self, expected, params):
        """Assert every key of ``expected`` is a key of ``params``.

        Only names are compared; the values in ``expected`` are not checked.
        """
        for name in expected:
            self.assertIn(name, params)

    def test_single_node_fit(self):
        """Refit with keyword arguments and check the coefficient and output sizes."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        self.assertTrue(hasattr(pgraph._steps_dict['linear_model'], 'coef_'))
        self.assertAlmostEqual(
            pgraph._steps_dict['linear_model'].coef_[0][0], 2)
        self.assertEqual(
            pgraph._fit_data['linear_model', 'predict'].shape[0], self.size)
        result = pgraph.predict(X=self.X)['predict']
        self.assertEqual(
            pgraph._fit_data['linear_model', 'predict'].shape[0], self.size)
        self.assertEqual(result.shape[0], self.size)

    def test_get_params(self):
        """``get_params`` exposes the step, its nested parameters and ``steps``."""
        result = self.pgraph.get_params()
        self._assert_param_names_present(
            self._expected_params(copy_X=True), result)

    def test_set_params(self):
        """``set_params`` keeps all parameter names and updates the given one."""
        pgraph = self.pgraph
        result_pre = pgraph.get_params()
        self._assert_param_names_present(
            self._expected_params(copy_X=True), result_pre)

        result_post = pgraph.set_params(
            linear_model__copy_X=False).get_params()
        self._assert_param_names_present(
            self._expected_params(copy_X=False), result_post)
        # Name presence alone would also pass if set_params did nothing, so
        # verify the new value is actually reported back.
        self.assertFalse(result_post['linear_model__copy_X'])
class TestPipegraph(unittest.TestCase):
    """Tests for a five-step PipeGraph: concatenation, two clusterings,
    a clustering combiner and a regressor."""

    # Names of every step declared in setUp.
    _ALL_NODES = [
        'Concatenate_Xy',
        'Gaussian_Mixture',
        'Dbscan',
        'Combine_Clustering',
        'Regressor',
    ]
    # Steps covered by the partial connection dictionaries used below.
    _PARTIAL_NODES = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']

    def setUp(self):
        """Create random X/y frames, the step/connection fixtures and a fitted graph."""
        self.size = 1000
        self.X = pd.DataFrame(dict(X=np.random.rand(self.size, )))
        self.y = pd.DataFrame(dict(y=(np.random.rand(self.size, ))))
        concatenator = Concatenator()
        gaussian_clustering = GaussianMixture(n_components=3)
        dbscan = DBSCAN(eps=0.5)
        mixer = CustomCombination()
        linear_model = LinearRegression()
        steps = [
            ('Concatenate_Xy', concatenator),
            ('Gaussian_Mixture', gaussian_clustering),
            ('Dbscan', dbscan),
            ('Combine_Clustering', mixer),
            ('Regressor', linear_model),
        ]
        connections = {
            'Concatenate_Xy': dict(df1='X', df2='y'),
            'Gaussian_Mixture': dict(X=('Concatenate_Xy', 'predict')),
            'Dbscan': dict(X=('Concatenate_Xy', 'predict')),
            'Combine_Clustering': dict(dominant=('Dbscan', 'predict'),
                                       other=('Gaussian_Mixture', 'predict')),
            'Regressor': dict(X='X', y='y'),
        }
        # Same steps, but the first one is named '_External'.
        # NOTE(review): these connections still reference 'Concatenate_Xy',
        # which is not a step name here, so the ValueError checked in
        # test_Pipegraph__External_step_name could come from either cause.
        self.steps_external = [
            ('_External', concatenator),
            ('Gaussian_Mixture', gaussian_clustering),
            ('Dbscan', dbscan),
            ('Combine_Clustering', mixer),
            ('Regressor', linear_model),
        ]
        self.connections_external = {
            '_External': dict(df1='X', df2='y'),
            'Gaussian_Mixture': dict(X=('Concatenate_Xy', 'predict')),
            'Dbscan': dict(X=('Concatenate_Xy', 'predict')),
            'Combine_Clustering': dict(dominant=('Dbscan', 'predict'),
                                       other=('Gaussian_Mixture', 'predict')),
            'Regressor': dict(X='X', y='y'),
        }
        self.steps = steps
        self.connections = connections
        self.pgraph = PipeGraph(steps=steps, fit_connections=connections)
        self.pgraph.fit(self.X, self.y)

    # ------------------------------------------------------------------
    # Shared assertion helpers (not collected as tests: no "test" prefix)
    # ------------------------------------------------------------------
    def _assert_fit_data(self, pgraph):
        """Check ``pgraph._fit_data`` after a full fit on ``self.X``/``self.y``."""
        fit_data = pgraph._fit_data
        n_rows = self.y.shape[0]
        assert_frame_equal(fit_data['_External', 'X'], self.X)
        assert_frame_equal(fit_data['_External', 'y'], self.y)
        self.assertEqual(fit_data['Dbscan', 'predict'].shape[0], n_rows)
        self.assertEqual(fit_data['Dbscan', 'predict'].min(), 0)
        self.assertEqual(
            fit_data['Gaussian_Mixture', 'predict'].shape[0], n_rows)
        self.assertEqual(fit_data['Gaussian_Mixture', 'predict'].min(), 0)
        self.assertEqual(
            fit_data['Combine_Clustering', 'predict'].shape[0], n_rows)
        self.assertEqual(fit_data['Combine_Clustering', 'predict'].min(), 0)
        self.assertEqual(
            fit_data['Combine_Clustering', 'predict'].max(),
            fit_data['Gaussian_Mixture', 'predict'].max())
        self.assertEqual(fit_data['Regressor', 'predict'].shape[0], n_rows)

    def _assert_predict_data(self, pgraph):
        """Check ``pgraph._predict_data`` after predicting on ``self.X``/``self.y``."""
        predict_data = pgraph._predict_data
        assert_frame_equal(predict_data['_External', 'X'], self.X)
        assert_frame_equal(predict_data['_External', 'y'], self.y)
        self.assertEqual(predict_data['Regressor', 'predict'].shape[0],
                         self.y.shape[0])

    def _external_and_dbscan_data(self):
        """Return a minimal graph-data dict with external X/y and a Dbscan output."""
        return {
            ('_External', 'X'): self.X,
            ('_External', 'y'): self.y,
            ('Dbscan', 'predict'): self.y * 4,
        }

    # ------------------------------------------------------------------
    # Construction and connection handling
    # ------------------------------------------------------------------
    def test_Pipegraph__External_step_name(self):
        """Fitting the '_External' fixture graph must raise ValueError."""
        pgraph = PipeGraph(steps=self.steps_external,
                           fit_connections=self.connections_external)
        self.assertRaises(ValueError, pgraph.fit, self.X, self.y)

    def test_Pipegraph__example_1_no_connections(self):
        """Connections of a two-step PipeGraphRegressor built without wiring.

        Both connection attributes of the wrapped graph are ``None`` before
        fitting and equal the scaler -> linear_model chain afterwards.
        """
        import numpy as np
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.linear_model import LinearRegression
        from pipegraph import PipeGraphRegressor

        X = np.random.rand(100, 1)
        y = 4 * X + 0.5 * np.random.randn(100, 1)
        scaler = MinMaxScaler()
        linear_model = LinearRegression()
        steps = [('scaler', scaler), ('linear_model', linear_model)]
        pgraph = PipeGraphRegressor(steps=steps)

        self.assertIsNone(pgraph._pipegraph.fit_connections)
        self.assertIsNone(pgraph._pipegraph.predict_connections)

        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)
        self.assertEqual(y_pred.shape[0], y.shape[0])

        expected_connections = dict(
            scaler={'X': 'X'},
            linear_model={
                'X': ('scaler', 'predict'),
                'y': 'y',
            },
        )
        self.assertEqual(pgraph._pipegraph.fit_connections,
                         expected_connections)
        self.assertEqual(pgraph._pipegraph.predict_connections,
                         expected_connections)

    def test_Pipegraph__ex_3_inject(self):
        """Connections built through chained ``inject`` calls.

        Before any ``inject`` both connection attributes are ``None``; after
        the chain and a fit they equal the dictionary asserted below.
        """
        import numpy as np
        import pandas as pd
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import PolynomialFeatures
        from sklearn.linear_model import LinearRegression
        from pipegraph.base import PipeGraphRegressor
        from pipegraph.demo_blocks import CustomPower

        X = pd.DataFrame(
            dict(X=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                 sample_weight=np.array([
                     0.01, 0.95, 0.10, 0.95, 0.95, 0.10, 0.10, 0.95, 0.95,
                     0.95, 0.01
                 ])))
        y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

        scaler = MinMaxScaler()
        polynomial_features = PolynomialFeatures()
        linear_model = LinearRegression()
        custom_power = CustomPower()
        selector = ColumnSelector(mapping={
            'X': slice(0, 1),
            'sample_weight': slice(1, 2)
        })
        steps = [('selector', selector),
                 ('custom_power', custom_power),
                 ('scaler', scaler),
                 ('polynomial_features', polynomial_features),
                 ('linear_model', linear_model)]
        pgraph = PipeGraphRegressor(steps=steps)

        self.assertIsNone(pgraph._pipegraph.fit_connections)
        self.assertIsNone(pgraph._pipegraph.predict_connections)

        (pgraph
         .inject(sink='selector', sink_var='X',
                 source='_External', source_var='X')
         .inject('custom_power', 'X', 'selector', 'sample_weight')
         .inject('scaler', 'X', 'selector', 'X')
         .inject('polynomial_features', 'X', 'scaler')
         .inject('linear_model', 'X', 'polynomial_features')
         .inject('linear_model', 'y', source_var='y')
         .inject('linear_model', 'sample_weight', 'custom_power'))

        self.assertIsNotNone(pgraph._pipegraph.fit_connections)
        self.assertIsNotNone(pgraph._pipegraph.predict_connections)

        pgraph.fit(X, y)

        # The same wiring is expected for both the fit and the predict graph.
        expected_connections = {
            'selector': {
                'X': ('_External', 'X')
            },
            'custom_power': {
                'X': ('selector', 'sample_weight')
            },
            'scaler': {
                'X': ('selector', 'X')
            },
            'polynomial_features': {
                'X': ('scaler', 'predict')
            },
            'linear_model': {
                'X': ('polynomial_features', 'predict'),
                'y': ('_External', 'y'),
                'sample_weight': ('custom_power', 'predict')
            }
        }
        self.assertEqual(pgraph._pipegraph.fit_connections,
                         expected_connections)
        self.assertEqual(pgraph._pipegraph.predict_connections,
                         expected_connections)

    def test_Pipegraph__fit_connections(self):
        """All steps are fit nodes when the full connections are supplied."""
        pgraph = PipeGraph(self.steps, self.connections)
        pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._filter_fit_nodes()),
                         sorted(self._ALL_NODES))

    def test_Pipegraph__some_fit_connections(self):
        """Fit nodes are restricted to the steps listed in ``fit_connections``."""
        some_connections = {
            'Concatenate_Xy': dict(df1='X', df2='y'),
            'Gaussian_Mixture': dict(X=('Concatenate_Xy', 'predict')),
            'Dbscan': dict(X=('Concatenate_Xy', 'predict')),
        }
        pgraph = PipeGraph(steps=self.steps,
                           fit_connections=some_connections,
                           predict_connections=self.connections)
        pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._filter_fit_nodes()),
                         sorted(self._PARTIAL_NODES))

    def test_Pipegraph__predict_connections(self):
        """All steps are predict nodes when the full connections are supplied."""
        pgraph = PipeGraph(self.steps, self.connections)
        pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._filter_predict_nodes()),
                         sorted(self._ALL_NODES))

    def test_Pipegraph__some_predict_connections(self):
        """Predict nodes are restricted to the steps in ``predict_connections``."""
        some_connections = {
            'Concatenate_Xy': dict(df1='X', df2='y'),
            'Gaussian_Mixture': dict(X=('Concatenate_Xy', 'predict')),
            'Dbscan': dict(X=('Concatenate_Xy', 'predict')),
        }
        pgraph = PipeGraph(steps=self.steps,
                           fit_connections=self.connections,
                           predict_connections=some_connections)
        pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._filter_predict_nodes()),
                         sorted(self._PARTIAL_NODES))

    # ------------------------------------------------------------------
    # Reading step inputs from graph data
    # ------------------------------------------------------------------
    def test_Pipegraph__read_step_inputs_from_fit_data(self):
        """The Regressor's fit inputs are exactly the external X and y."""
        pgraph = self.pgraph
        pgraph._fit_data = self._external_and_dbscan_data()
        result = pgraph._read_fit_signature_variables_from_graph_data(
            graph_data=pgraph._fit_data, step_name='Regressor')
        assert_array_equal(result['X'], self.X)
        assert_array_equal(result['y'], self.y)
        self.assertEqual(len(result), 2)

    def test_Pipegraph__read_predict_signature_variables_from_graph_data(self):
        """The Regressor's predict inputs contain only the external X."""
        pgraph = self.pgraph
        pgraph._predict_data = self._external_and_dbscan_data()
        result = pgraph._read_predict_signature_variables_from_graph_data(
            graph_data=pgraph._predict_data, step_name='Regressor')
        assert_array_equal(result['X'], self.X)
        self.assertEqual(len(result), 1)

    # ------------------------------------------------------------------
    # Individual step behaviour
    # ------------------------------------------------------------------
    def test_Pipegraph__step__predict_lm(self):
        """A wrapped LinearRegression predicts the same as the bare estimator."""
        X = self.X
        y = self.y
        lm = LinearRegression()
        lm_step = add_mixins_to_step(lm)
        lm_step.pg_fit(X=X, y=y)
        assert_array_equal(lm.predict(X), lm_step.pg_predict(X=X)['predict'])

    def test_Pipegraph__under_fit__concatenate_Xy(self):
        """``_fit('Concatenate_Xy')`` stores X and y side by side in the fit data."""
        pgraph = self.pgraph
        pgraph._fit_data = self._external_and_dbscan_data()
        expected = pd.concat([self.X, self.y], axis=1)
        pgraph._fit('Concatenate_Xy')
        concatenated = pgraph._fit_data['Concatenate_Xy', 'predict']
        self.assertEqual(expected.shape, concatenated.shape)
        assert_frame_equal(self.X, concatenated.loc[:, ['X']])
        assert_frame_equal(self.y, concatenated.loc[:, ['y']])

    def test_Pipegraph__predict__concatenate_Xy(self):
        """The concatenation step's predict output equals ``pd.concat([X, y], axis=1)``."""
        X = self.X
        y = self.y
        expected = pd.concat([X, y], axis=1)
        current_step = self.pgraph._steps_dict['Concatenate_Xy']
        current_step.pg_fit()
        result = current_step.pg_predict(df1=X, df2=y)['predict']
        self.assertEqual(expected.shape, result.shape)
        assert_frame_equal(self.X, result.loc[:, ['X']])
        assert_frame_equal(self.y, result.loc[:, ['y']])
        assert_frame_equal(expected, result)

    def test_Pipegraph__predict__gaussian_mixture(self):
        """``pg_predict`` of the Gaussian mixture step matches its ``predict``."""
        X = self.X
        current_step = self.pgraph._steps_dict['Gaussian_Mixture']
        current_step.pg_fit(X=X)
        expected = current_step.predict(X=X)
        result = current_step.pg_predict(X=X)['predict']
        assert_array_equal(expected, result)

    def test_Pipegraph__predict__dbscan(self):
        """``pg_predict`` of the DBSCAN step matches its ``fit_predict``."""
        X = self.X
        current_step = self.pgraph._steps_dict['Dbscan']
        current_step.pg_fit(X=X)
        expected = current_step.fit_predict(X=X)
        result = current_step.pg_predict(X=X)['predict']
        assert_array_equal(expected, result)

    def test_Pipegraph__combine_clustering_predict(self):
        """The combiner's ``pg_predict`` matches its ``predict`` on real clusterings."""
        steps_dict = self.pgraph._steps_dict
        Xy = pd.concat([self.X, self.y], axis=1)

        gaussian_step = steps_dict['Gaussian_Mixture']
        gaussian_step.pg_fit(X=Xy)
        result_gaussian = gaussian_step.pg_predict(X=Xy)['predict']

        # The Dbscan step is not refitted here; only pg_predict is called.
        result_dbscan = steps_dict['Dbscan'].pg_predict(X=Xy)['predict']
        self.assertEqual(result_dbscan.min(), 0)

        combiner = steps_dict['Combine_Clustering']
        combiner.pg_fit(dominant=result_dbscan, other=result_gaussian)
        expected = combiner.predict(dominant=result_dbscan,
                                    other=result_gaussian)
        result = combiner.pg_predict(dominant=result_dbscan,
                                     other=result_gaussian)['predict']
        assert_array_equal(expected, result)
        self.assertEqual(result.min(), 0)

    # ------------------------------------------------------------------
    # Output keys of pg_predict
    # ------------------------------------------------------------------
    def test_Pipegraph__strategy__dict_key(self):
        """The concatenation step outputs only a ``'predict'`` entry."""
        current_step = self.pgraph._steps_dict['Concatenate_Xy']
        current_step.pg_fit()
        result = current_step.pg_predict(df1=self.X, df2=self.y)
        self.assertEqual(list(result.keys()), ['predict'])

    def test_Pipegraph__dbscan__dict_key(self):
        """The DBSCAN step outputs only a ``'predict'`` entry."""
        X = self.X
        current_step = self.pgraph._steps_dict['Dbscan']
        current_step.pg_fit(X=X)
        result = current_step.pg_predict(X=X)
        self.assertEqual(list(result.keys()), ['predict'])

    def test_Pipegraph__combine_clustering__dict_key(self):
        """The combiner step outputs only a ``'predict'`` entry."""
        X = self.X
        y = self.y
        current_step = self.pgraph._steps_dict['Combine_Clustering']
        current_step.pg_fit(dominant=X, other=y)
        result = current_step.pg_predict(dominant=X, other=y)
        self.assertEqual(list(result.keys()), ['predict'])

    def test_Pipegraph__gaussian_mixture__dict_key(self):
        """The Gaussian mixture step outputs ``'predict'`` and ``'predict_proba'``."""
        X = self.X
        current_step = self.pgraph._steps_dict['Gaussian_Mixture']
        current_step.pg_fit(X=X)
        result = current_step.pg_predict(X=X)
        self.assertEqual(sorted(result.keys()),
                         sorted(['predict', 'predict_proba']))

    def test_Pipegraph__regressor__dict_key(self):
        """The regressor step outputs only a ``'predict'`` entry."""
        X = self.X
        y = self.y
        current_step = self.pgraph._steps_dict['Regressor']
        current_step.pg_fit(X=X, y=y)
        result = current_step.pg_predict(X=X)
        self.assertEqual(list(result.keys()), ['predict'])

    # ------------------------------------------------------------------
    # Graph nodes
    # ------------------------------------------------------------------
    def test_Pipegraph__fit_node_names(self):
        """The fit graph contains one node per step."""
        pgraph = self.pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._fit_graph.nodes()),
                         sorted(self._ALL_NODES))

    def test_Pipegraph__predict_node_names(self):
        """The predict graph contains one node per step."""
        pgraph = self.pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._predict_graph.nodes()),
                         sorted(self._ALL_NODES))

    def test_Pipegraph__filter_nodes_fit(self):
        """``_filter_fit_nodes`` yields every step of the fixture graph."""
        pgraph = self.pgraph.fit(self.X, self.y)
        self.assertEqual(sorted(pgraph._filter_fit_nodes()),
                         sorted(self._ALL_NODES))

    def test_Pipegraph__filter_nodes_predict(self):
        """Only the step named in ``predict_connections`` is a predict node."""
        alternative_connections = {'Regressor': dict(X='X', y='y')}
        pgraph = PipeGraph(steps=self.steps,
                           fit_connections=self.connections,
                           predict_connections=alternative_connections)
        pgraph.fit(self.X, self.y)
        predict_nodes = list(pgraph._filter_predict_nodes())
        self.assertEqual(predict_nodes, ['Regressor'])

    # ------------------------------------------------------------------
    # fit() calling conventions
    # ------------------------------------------------------------------
    def test_Pipegraph__graph_fit_using_keywords(self):
        """``fit(X=..., y=...)`` populates the fit data."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        self._assert_fit_data(pgraph)

    def test_Pipegraph__graph_fit_three_positional(self):
        """``fit`` with three positional arguments raises ValueError."""
        pgraph = self.pgraph
        self.assertRaises(ValueError, pgraph.fit, self.X, self.y, self.y)

    def test_Pipegraph__graph_fit_two_positional(self):
        """``fit(X, y)`` populates the fit data."""
        pgraph = self.pgraph
        pgraph.fit(self.X, self.y)
        self._assert_fit_data(pgraph)

    def test_Pipegraph__graph_fit_one_positional(self):
        """``fit(X, y=...)`` populates the fit data."""
        pgraph = self.pgraph
        pgraph.fit(self.X, y=self.y)
        self._assert_fit_data(pgraph)

    # ------------------------------------------------------------------
    # predict() calling conventions
    # ------------------------------------------------------------------
    def test_Pipegraph__graph_predict_using_keywords(self):
        """``predict(X=..., y=...)`` populates the predict data."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        pgraph.predict(X=self.X, y=self.y)
        self._assert_predict_data(pgraph)

    def test_Pipegraph__graph_predict_using_three_positional(self):
        """``predict`` with three positional arguments raises ValueError."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        self.assertRaises(ValueError, pgraph.predict, self.X, self.y, self.y)

    def test_Pipegraph__graph_predict_using_two_positional(self):
        """``predict(X, y)`` populates the predict data."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        pgraph.predict(self.X, self.y)
        self._assert_predict_data(pgraph)

    def test_Pipegraph__graph_predict_using_one_positional(self):
        """``predict(X, y=...)`` populates the predict data."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        pgraph.predict(self.X, y=self.y)
        self._assert_predict_data(pgraph)

    # ------------------------------------------------------------------
    # Parameters and step registry
    # ------------------------------------------------------------------
    def test_Pipegraph__step_get_params_multiple(self):
        """Each step reports its own constructor parameters."""
        steps_dict = self.pgraph._steps_dict
        self.assertEqual(steps_dict['Concatenate_Xy'].get_params(), {})
        self.assertEqual(
            steps_dict['Gaussian_Mixture'].get_params()['n_components'], 3)
        self.assertEqual(steps_dict['Dbscan'].get_params()['eps'], 0.5)
        self.assertEqual(steps_dict['Combine_Clustering'].get_params(), {})

    def test_Pipegraph__step_set_params_multiple(self):
        """``set_params`` on a step rejects unknown names and applies known ones."""
        steps_dict = self.pgraph._steps_dict
        self.assertRaises(ValueError,
                          steps_dict['Concatenate_Xy'].set_params,
                          ham=2,
                          spam=9)
        self.assertEqual(
            steps_dict['Gaussian_Mixture'].set_params(
                n_components=5).get_params()['n_components'], 5)
        self.assertEqual(
            steps_dict['Dbscan'].set_params(eps=10.2).get_params()['eps'],
            10.2)
        self.assertEqual(
            steps_dict['Regressor'].set_params(copy_X=False).get_params(), {
                'copy_X': False,
                'fit_intercept': True,
                'n_jobs': 1,
                'normalize': False
            })
        # copy_X stays False from the previous call on the same step.
        self.assertEqual(
            steps_dict['Regressor'].set_params(n_jobs=13).get_params(), {
                'copy_X': False,
                'fit_intercept': True,
                'n_jobs': 13,
                'normalize': False
            })

    def test_Pipegraph__named_steps(self):
        """``named_steps`` maps each of the five step names to its estimator."""
        pgraph = self.pgraph
        self.assertEqual(pgraph.named_steps, dict(self.steps))
        self.assertEqual(len(pgraph.named_steps), 5)
'X': 'scaler', 'y': 'y', 'selection': 'classifier' }, 'lm_0': { 'X': ('demux', 'X_0'), 'y': ('demux', 'y_0') }, 'lm_1': { 'X': ('demux', 'X_1'), 'y': ('demux', 'y_1') }, 'lm_2': { 'X': ('demux', 'X_2'), 'y': ('demux', 'y_2') }, 'mux': { '0': 'lm_0', '1': 'lm_1', '2': 'lm_2', 'selection': 'classifier' } } pgraph = PipeGraph(steps=steps, fit_connections=connections) pgraph.fit(X, y) #%% y_pred = pgraph.predict(X) plt.scatter(X, y) plt.scatter(X, y_pred)
def test_Pipegraph__ex_3_inject(self):
    """Connections built through chained ``inject`` calls on a PipeGraph.

    Before any ``inject`` both connection attributes are ``None``; after the
    chain and a fit they equal the dictionary asserted below.
    """
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
    from pipegraph.base import PipeGraph
    from pipegraph.demo_blocks import CustomPower

    X = pd.DataFrame(
        dict(X=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
             sample_weight=np.array([
                 0.01, 0.95, 0.10, 0.95, 0.95, 0.10, 0.10, 0.95, 0.95, 0.95,
                 0.01
             ])))
    y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

    scaler = MinMaxScaler()
    polynomial_features = PolynomialFeatures()
    linear_model = LinearRegression()
    custom_power = CustomPower()
    selector = ColumnSelector(mapping={
        'X': slice(0, 1),
        'sample_weight': slice(1, 2)
    })
    steps = [('selector', selector),
             ('custom_power', custom_power),
             ('scaler', scaler),
             ('polynomial_features', polynomial_features),
             ('linear_model', linear_model)]
    pgraph = PipeGraph(steps=steps)  # PipeGraphRegressor

    # assertIsNone / assertIsNotNone report the offending value on failure.
    self.assertIsNone(pgraph.fit_connections)
    self.assertIsNone(pgraph.predict_connections)

    (pgraph
     .inject(sink='selector', sink_var='X',
             source='_External', source_var='X')
     .inject('custom_power', 'X', 'selector', 'sample_weight')
     .inject('scaler', 'X', 'selector', 'X')
     .inject('polynomial_features', 'X', 'scaler')
     .inject('linear_model', 'X', 'polynomial_features')
     .inject('linear_model', 'y', source_var='y')
     .inject('linear_model', 'sample_weight', 'custom_power'))

    self.assertIsNotNone(pgraph.fit_connections)
    self.assertIsNotNone(pgraph.predict_connections)

    pgraph.fit(X, y)

    # The same wiring is expected for both the fit and the predict graph.
    expected_connections = {
        'selector': {
            'X': ('_External', 'X')
        },
        'custom_power': {
            'X': ('selector', 'sample_weight')
        },
        'scaler': {
            'X': ('selector', 'X')
        },
        'polynomial_features': {
            'X': ('scaler', 'predict')
        },
        'linear_model': {
            'X': ('polynomial_features', 'predict'),
            'y': ('_External', 'y'),
            'sample_weight': ('custom_power', 'predict')
        }
    }
    self.assertEqual(pgraph.fit_connections, expected_connections)
    self.assertEqual(pgraph.predict_connections, expected_connections)
class TestModelsWithDataDependentNumberOfReplicas(unittest.TestCase):
    """Tests for a graph whose 'models' step is a
    RegressorsWithDataDependentNumberOfReplicas driven by a Gaussian mixture."""

    def setUp(self):
        """Generate three linear data segments and fit the four-step graph."""
        # (offset added to X, slope of y over X) for each of the three segments.
        segments = [(0, 4), (3, -4), (6, 2)]
        X_parts = []
        y_parts = []
        for offset, slope in segments:
            X_part = pd.Series(np.random.rand(1000, ) + offset)
            y_part = pd.Series(slope * X_part + 0.5 * np.random.randn(1000, ))
            X_parts.append(X_part)
            y_parts.append(y_part)

        self.X = pd.concat(X_parts, axis=0).to_frame()
        self.y = pd.concat(y_parts, axis=0).to_frame()

        scaler = MinMaxScaler()
        gaussian_mixture = GaussianMixture(n_components=3)
        models = RegressorsWithDataDependentNumberOfReplicas(
            steps=[('regressor', LinearRegression())])
        neutral_regressor = NeutralRegressor()

        steps = [('scaler', scaler),
                 ('classifier', gaussian_mixture),
                 ('models', models),
                 ('neutral', neutral_regressor)]

        connections = {
            'scaler': {'X': 'X'},
            'classifier': {'X': 'scaler'},
            'models': {'X': 'scaler',
                       'y': 'y',
                       'selection': 'classifier'},
            'neutral': {'X': 'models'},
        }
        self.pgraph = PipeGraph(steps=steps, fit_connections=connections)
        self.pgraph.fit(self.X, self.y)

    def test_ModelsWithDataDependentNumberOfReplicas__connections(self):
        """Check the inner connections of 'models' and the outer step names."""
        X = self.X
        y = self.y
        pgraph = self.pgraph
        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)

        self.assertIsInstance(pgraph.named_steps['models'],
                              RegressorsWithDataDependentNumberOfReplicas)

        result_connections = (
            pgraph.named_steps['models']._pipegraph.fit_connections)
        expected_connections = {
            'regressorsBundle': {'X': 'X',
                                 'selection': 'selection',
                                 'y': 'y'}
        }
        self.assertEqual(result_connections, expected_connections)

        result_steps = sorted(pgraph.named_steps.keys())
        expected_steps = sorted(['scaler', 'classifier', 'models', 'neutral'])
        self.assertEqual(result_steps, expected_steps)
        self.assertEqual(y_pred.shape[0], y.shape[0])

    def test_ModelsWithDataDependentNumberOfReplicas__predict(self):
        """``predict`` returns one row per input row."""
        X = self.X
        y = self.y
        pgraph = self.pgraph
        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)
        self.assertEqual(y_pred.shape[0], y.shape[0])

    def test_ModelsWithDataDependentNumberOfReplicas__score(self):
        """``score`` runs and returns a value above the loose bound of -42."""
        X = self.X
        y = self.y
        pgraph = self.pgraph
        pgraph.fit(X, y)
        result = pgraph.score(X, y)
        self.assertGreater(result, -42)

    def test_ModelsWithDataDependentNumberOfReplicas__GridSearchCV(self):
        """The graph works as a GridSearchCV estimator over ``n_components``."""
        X = self.X
        y = self.y
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        pgraph = self.pgraph
        param_grid = {'classifier__n_components': range(2, 10)}
        gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
        gs.fit(X_train, y_train)
        result = gs.score(X_test, y_test)
        self.assertGreater(result, -42)