Beispiel #1
0
    def test_Pipegraph__some_predict_connections(self):
        """Predict nodes are limited to the steps named in predict_connections."""
        # Only three steps are wired for prediction; fitting uses self.connections.
        partial_predict_connections = {
            'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
            'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
            'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
        }
        expected_nodes = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']

        graph = PipeGraph(steps=self.steps,
                          fit_connections=self.connections,
                          predict_connections=partial_predict_connections)
        graph.fit(self.X, self.y)

        observed_nodes = list(graph._filter_predict_nodes())
        self.assertEqual(sorted(observed_nodes), sorted(expected_nodes))
Beispiel #2
0
    def setUp(self):
        """Prepare random data plus a two-step and a three-step PipeGraph.

        Both graphs are built from the same scaler and linear-model
        instances; the second one appends a NeutralRegressor step.
        """
        self.size = 100
        # X is drawn before y, keeping the order of the random draws.
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        self.y = pd.DataFrame({'y': np.random.rand(self.size)})

        self.sc = MinMaxScaler()
        self.lm = LinearRegression()
        neutral = NeutralRegressor()

        two_step_connections = {
            'scaler': {'X': 'X'},
            'model': {'X': ('scaler', 'predict'), 'y': 'y'},
        }
        self.model = PipeGraph(
            [('scaler', self.sc), ('model', self.lm)],
            two_step_connections,
        )

        # Separate literal (not a copy) so the two graphs share no dict objects.
        three_step_connections = {
            'scaler': {'X': 'X'},
            'model': {'X': ('scaler', 'predict'), 'y': 'y'},
            'neutral': {'X': 'model'},
        }
        self.model_custom = PipeGraph(
            [('scaler', self.sc), ('model', self.lm), ('neutral', neutral)],
            three_step_connections,
        )
Beispiel #3
0
 def setUp(self):
     """Create and fit a scaler -> linear-model PipeGraph on y = 2 * X data."""
     self.size = 1000
     self.X = np.random.rand(self.size, 1)
     self.y = self.X * 2

     self.sc = MinMaxScaler(feature_range=(0, 1))
     self.lm = LinearRegression()
     self.steps = [('scaler', self.sc), ('linear_model', self.lm)]
     self.connections = {
         'scaler': {'X': 'X'},
         'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
     }
     self.pgraph = PipeGraph(steps=self.steps,
                             fit_connections=self.connections)
     # Stored for tests that need a parameter grid; not used in this method.
     self.param_grid = {
         'linear_model__fit_intercept': [False, True],
         'linear_model__normalize': [True, False],
     }
     self.pgraph.fit(self.X, self.y)
Beispiel #4
0
    def setUp(self):
        """Create (without fitting) a single-step linear-model PipeGraph on y = 2 * X data."""
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X

        self.lm = LinearRegression()
        self.steps = [('linear_model', self.lm)]
        self.connections = {'linear_model': {'X': 'X', 'y': 'y'}}
        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
        # Stored for tests that need a parameter grid; not used in this method.
        self.param_grid = {
            'linear_model__fit_intercept': [False, True],
            'linear_model__normalize': [True, False],
        }
Beispiel #5
0
class TestTwoNodes(unittest.TestCase):
    """Tests for a two-step PipeGraph: a MinMaxScaler feeding a LinearRegression."""

    def setUp(self):
        """Create and fit the scaler -> linear-model graph on y = 2 * X data."""
        self.size = 1000
        self.X = np.random.rand(self.size, 1)
        self.y = self.X * 2

        self.sc = MinMaxScaler(feature_range=(0, 1))
        self.lm = LinearRegression()
        self.steps = [('scaler', self.sc), ('linear_model', self.lm)]
        self.connections = {
            'scaler': {'X': 'X'},
            'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
        }
        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
        # Stored for tests that need a parameter grid; not used in this method.
        self.param_grid = {
            'linear_model__fit_intercept': [False, True],
            'linear_model__normalize': [True, False],
        }
        self.pgraph.fit(self.X, self.y)

    def test_TwoNodes_fit(self):
        """Refit the graph and check the learned slope and the output row counts."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)

        def fit_rows():
            # Looked up afresh on every call so the post-predict check sees
            # whatever _fit_data holds at that moment.
            return graph._fit_data['linear_model', 'predict'].shape[0]

        self.assertTrue(hasattr(graph._steps_dict['linear_model'], 'coef_'))
        slope = graph._steps_dict['linear_model'].coef_[0][0]
        self.assertTrue(1.9 < slope < 2.1)

        self.assertEqual(fit_rows(), self.size)
        prediction = graph.predict(X=self.X)['predict']
        # The fit-time output is checked again after predict() has run.
        self.assertEqual(fit_rows(), self.size)
        self.assertEqual(prediction.shape[0], self.size)
Beispiel #6
0
    def setUp(self):
        """Assemble (without fitting) a six-step PipeGraph.

        The graph concatenates X and y, clusters the result with two
        estimators, combines both clusterings, feeds that to Paella and
        passes Paella's output to the final regressor as sample_weight.
        """
        self.size = 100
        # X is drawn before y, keeping the order of the random draws.
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        self.y = pd.DataFrame({'y': np.random.rand(self.size)})

        concatenator = Concatenator()
        gaussian_clustering = GaussianMixture(n_components=3)
        dbscan = DBSCAN(eps=0.5)
        mixer = CustomCombination()
        paella_settings = {
            'regressor': LinearRegression,  # the class itself, not an instance
            'noise_label': None,
            'max_it': 10,
            'regular_size': 100,
            'minimum_size': 30,
            'width_r': 0.95,
            'power': 10,
            'random_state': 42,
        }
        paella_model = Paella(**paella_settings)
        linear_model = LinearRegression()

        self.steps = [
            ('Concatenate_Xy', concatenator),
            ('Gaussian_Mixture', gaussian_clustering),
            ('Dbscan', dbscan),
            ('Combine_Clustering', mixer),
            ('Paella', paella_model),
            ('Regressor', linear_model),
        ]
        self.connections = {
            'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
            'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
            'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
            'Combine_Clustering': {
                'dominant': ('Dbscan', 'predict'),
                'other': ('Gaussian_Mixture', 'predict'),
            },
            'Paella': {
                'X': 'X',
                'y': 'y',
                'classification': ('Combine_Clustering', 'predict'),
            },
            'Regressor': {
                'X': 'X',
                'y': 'y',
                'sample_weight': ('Paella', 'predict'),
            },
        }
        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
Beispiel #7
0
class TestPipeGraphCompositable(unittest.TestCase):
    """Checks that a PipeGraph can be used as a step of another PipeGraph."""

    def setUp(self):
        """Create (without fitting) a single-step PipeGraph around a LinearRegression."""
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X
        self.lm = LinearRegression()
        self.steps = [('linear_model', self.lm)]
        self.pgraph = PipeGraph(steps=self.steps)

    def test_compositable__isinstance(self):
        """Nest the graph inside an outer graph, fit it, and compare prediction lengths."""
        inner = self.pgraph
        outer = PipeGraph(steps=[('pgraph', inner)])
        self.assertEqual(outer.named_steps, {'pgraph': inner})

        outer.fit(self.X, self.y)
        outer_prediction = outer.predict(self.X)['predict']
        inner_prediction = inner.predict(self.X)['predict']
        self.assertEqual(outer_prediction.shape[0], inner_prediction.shape[0])
Beispiel #8
0
    def setUp(self):
        """Build a clustering -> classification PipeGraph wired through inject()."""
        self.X, self.y = datasets.make_blobs(n_samples=10000,
                                             n_features=5,
                                             centers=10)

        graph = PipeGraph(steps=[
            ('clustering', KMeans(n_clusters=10)),
            ('classification', LinearDiscriminantAnalysis()),
        ])

        # (sink, sink_var, source, source_var), injected in this order.
        # The classifier's y comes from the clustering step's 'predict' output.
        wiring = (
            ('clustering', 'X', '_External', 'X'),
            ('classification', 'X', '_External', 'X'),
            ('classification', 'y', 'clustering', 'predict'),
        )
        for sink, sink_var, source, source_var in wiring:
            graph.inject(sink=sink,
                         sink_var=sink_var,
                         source=source,
                         source_var=source_var)

        self.pgraph = graph
Beispiel #9
0
    def setUp(self):
        """Prepare a three-step graph and two alternative fit-connection maps.

        The graph is created with the first map (scaler -> gaussian); the
        parameter grid offers both maps as values for ``fit_connections``.
        """
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X

        self.sc = MinMaxScaler()
        self.gm = GaussianMixture(n_components=3)
        self.km = KMeans(n_clusters=4)
        self.steps = [
            ('scaler', self.sc),
            ('gaussian', self.gm),
            ('kmeans', self.km),
        ]

        self.connections_1 = {'scaler': {'X': 'X'}, 'gaussian': 'scaler'}
        self.connections_2 = {'scaler': {'X': 'X'}, 'kmeans': 'scaler'}

        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections_1)
        self.param_grid = {
            'fit_connections': [self.connections_1, self.connections_2],
        }
Beispiel #10
0
class TestPipeGraphSingleNodeLinearModel(unittest.TestCase):
    """Tests for a PipeGraph that holds a single LinearRegression step."""

    def setUp(self):
        """Create and fit a single-step linear-model graph on y = 2 * X data."""
        self.size = 100
        self.X = np.random.rand(self.size, 1)
        self.y = 2 * self.X

        lm = LinearRegression()
        steps = [('linear_model', lm)]
        connections = {'linear_model': dict(X='X', y='y')}
        self.lm = lm
        self.steps = steps
        self.connections = connections
        self.pgraph = PipeGraph(steps=steps, fit_connections=connections)
        # Stored for tests that need a parameter grid; not used in this method.
        self.param_grid = dict(linear_model__fit_intercept=[False, True],
                               linear_model__normalize=[True, False])
        self.pgraph.fit(self.X, self.y)

    def _assert_expected_param_names(self, params):
        """Assert that ``params`` exposes every parameter name these tests rely on."""
        expected_names = (
            'linear_model',
            'linear_model__copy_X',
            'linear_model__fit_intercept',
            'linear_model__n_jobs',
            'linear_model__normalize',
            'steps',
        )
        for name in expected_names:
            # assertIn reports the missing name and the container on failure.
            self.assertIn(name, params)

    def test_single_node_fit(self):
        """Refit the graph and check the learned slope and the output row counts."""
        pgraph = self.pgraph
        pgraph.fit(X=self.X, y=self.y)
        self.assertTrue(hasattr(pgraph._steps_dict['linear_model'], 'coef_'))
        self.assertAlmostEqual(pgraph._steps_dict['linear_model'].coef_[0][0],
                               2)

        self.assertEqual(pgraph._fit_data['linear_model', 'predict'].shape[0],
                         self.size)
        result = pgraph.predict(X=self.X)['predict']
        # The fit-time output is checked again after predict() has run.
        self.assertEqual(pgraph._fit_data['linear_model', 'predict'].shape[0],
                         self.size)
        self.assertEqual(result.shape[0], self.size)

    def test_get_params(self):
        """get_params() exposes the step, its nested parameters and 'steps'."""
        self._assert_expected_param_names(self.pgraph.get_params())

    def test_set_params(self):
        """set_params() on a nested parameter is reflected by get_params().

        Only the parameter being changed is compared by value; the other
        entries are checked by name only.
        """
        pgraph = self.pgraph

        params_before = pgraph.get_params()
        self._assert_expected_param_names(params_before)
        # setUp builds LinearRegression() with no arguments, so copy_X is
        # expected to still be truthy here.
        self.assertTrue(params_before['linear_model__copy_X'])

        params_after = pgraph.set_params(
            linear_model__copy_X=False).get_params()
        self._assert_expected_param_names(params_after)
        # Previously only key presence was checked, so a set_params() that
        # did nothing would have passed.
        self.assertFalse(params_after['linear_model__copy_X'])
Beispiel #11
0
 def test_Pipegraph__External_step_name(self):
     """Fitting a graph built from the '_External' steps/connections raises ValueError."""
     graph = PipeGraph(steps=self.steps_external,
                       fit_connections=self.connections_external)
     with self.assertRaises(ValueError):
         graph.fit(self.X, self.y)
Beispiel #12
0
class TestPipegraph(unittest.TestCase):
    def setUp(self):
        """Build and fit the five-step reference graph.

        Also stores a second steps/connections pair whose first step is
        named '_External', for the test that expects fitting to fail.
        """
        self.size = 1000
        # X is drawn before y, keeping the order of the random draws.
        self.X = pd.DataFrame({'X': np.random.rand(self.size)})
        self.y = pd.DataFrame({'y': np.random.rand(self.size)})

        concatenator = Concatenator()
        gaussian_clustering = GaussianMixture(n_components=3)
        dbscan = DBSCAN(eps=0.5)
        mixer = CustomCombination()
        linear_model = LinearRegression()

        # Steps that follow the first one in both step lists.
        downstream_steps = [
            ('Gaussian_Mixture', gaussian_clustering),
            ('Dbscan', dbscan),
            ('Combine_Clustering', mixer),
            ('Regressor', linear_model),
        ]

        def downstream_connections():
            # Fresh dict objects on every call so the two maps share nothing.
            return {
                'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
                'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
                'Combine_Clustering': {
                    'dominant': ('Dbscan', 'predict'),
                    'other': ('Gaussian_Mixture', 'predict'),
                },
                'Regressor': {'X': 'X', 'y': 'y'},
            }

        self.steps_external = [('_External', concatenator)] + downstream_steps
        self.connections_external = {'_External': {'df1': 'X', 'df2': 'y'}}
        self.connections_external.update(downstream_connections())

        self.steps = [('Concatenate_Xy', concatenator)] + downstream_steps
        self.connections = {'Concatenate_Xy': {'df1': 'X', 'df2': 'y'}}
        self.connections.update(downstream_connections())

        self.pgraph = PipeGraph(steps=self.steps,
                                fit_connections=self.connections)
        self.pgraph.fit(self.X, self.y)

    def test_Pipegraph__External_step_name(self):
        """Fitting a graph built from the '_External' steps/connections raises ValueError."""
        graph = PipeGraph(steps=self.steps_external,
                          fit_connections=self.connections_external)
        with self.assertRaises(ValueError):
            graph.fit(self.X, self.y)

    def test_Pipegraph__example_1_no_connections(self):
        """A regressor built without connections has them filled in by fit().

        Before fitting, both connection maps are None; after fit/predict
        both equal the linear scaler -> linear_model wiring.
        """
        import numpy as np
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.linear_model import LinearRegression
        from pipegraph import PipeGraphRegressor

        X = np.random.rand(100, 1)
        y = 4 * X + 0.5 * np.random.randn(100, 1)

        regressor = PipeGraphRegressor(steps=[
            ('scaler', MinMaxScaler()),
            ('linear_model', LinearRegression()),
        ])
        self.assertIsNone(regressor._pipegraph.fit_connections)
        self.assertIsNone(regressor._pipegraph.predict_connections)

        regressor.fit(X, y)
        y_pred = regressor.predict(X)
        self.assertEqual(y_pred.shape[0], y.shape[0])

        expected_connections = {
            'scaler': {'X': 'X'},
            'linear_model': {'X': ('scaler', 'predict'), 'y': 'y'},
        }
        self.assertEqual(regressor._pipegraph.fit_connections,
                         expected_connections)
        self.assertEqual(regressor._pipegraph.predict_connections,
                         expected_connections)

    def test_Pipegraph__ex_3_inject(self):
        """Wire a five-step regressor via chained inject() calls and check the result.

        The connection maps start as None, are non-None once the injections
        are done, and after fit() both equal the expected wiring.
        """
        import numpy as np
        import pandas as pd
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import PolynomialFeatures
        from sklearn.linear_model import LinearRegression
        from sklearn.model_selection import GridSearchCV
        from pipegraph.base import PipeGraphRegressor
        from pipegraph.demo_blocks import CustomPower

        # Column order matters: the selector below slices columns by position.
        X = pd.DataFrame({
            'X': np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
            'sample_weight': np.array([
                0.01, 0.95, 0.10, 0.95, 0.95, 0.10, 0.10, 0.95, 0.95,
                0.95, 0.01
            ]),
        })
        y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

        scaler = MinMaxScaler()
        polynomial_features = PolynomialFeatures()
        linear_model = LinearRegression()
        custom_power = CustomPower()
        selector = ColumnSelector(mapping={
            'X': slice(0, 1),
            'sample_weight': slice(1, 2)
        })

        regressor = PipeGraphRegressor(steps=[
            ('selector', selector),
            ('custom_power', custom_power),
            ('scaler', scaler),
            ('polynomial_features', polynomial_features),
            ('linear_model', linear_model),
        ])

        self.assertIsNone(regressor._pipegraph.fit_connections)
        self.assertIsNone(regressor._pipegraph.predict_connections)

        # Each call is made on the previous call's return value, so the
        # chaining behaviour of inject() is still exercised.
        chained = regressor.inject(sink='selector',
                                   sink_var='X',
                                   source='_External',
                                   source_var='X')
        chained = chained.inject('custom_power', 'X', 'selector',
                                 'sample_weight')
        chained = chained.inject('scaler', 'X', 'selector', 'X')
        chained = chained.inject('polynomial_features', 'X', 'scaler')
        chained = chained.inject('linear_model', 'X', 'polynomial_features')
        chained = chained.inject('linear_model', 'y', source_var='y')
        chained.inject('linear_model', 'sample_weight', 'custom_power')

        self.assertIsNotNone(regressor._pipegraph.fit_connections)
        self.assertIsNotNone(regressor._pipegraph.predict_connections)
        regressor.fit(X, y)

        expected_connections = {
            'selector': {'X': ('_External', 'X')},
            'custom_power': {'X': ('selector', 'sample_weight')},
            'scaler': {'X': ('selector', 'X')},
            'polynomial_features': {'X': ('scaler', 'predict')},
            'linear_model': {
                'X': ('polynomial_features', 'predict'),
                'y': ('_External', 'y'),
                'sample_weight': ('custom_power', 'predict'),
            },
        }
        self.assertEqual(regressor._pipegraph.fit_connections,
                         expected_connections)
        self.assertEqual(regressor._pipegraph.predict_connections,
                         expected_connections)

    def test_Pipegraph__fit_connections(self):
        """With the full connection map, all five steps are fit nodes."""
        graph = PipeGraph(self.steps, self.connections)
        graph.fit(self.X, self.y)

        expected_nodes = [
            'Concatenate_Xy',
            'Gaussian_Mixture',
            'Dbscan',
            'Combine_Clustering',
            'Regressor',
        ]
        self.assertEqual(sorted(graph._filter_fit_nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__some_fit_connections(self):
        """Fit nodes are limited to the steps named in fit_connections."""
        # Only three steps are wired for fitting; prediction uses the full map.
        partial_fit_connections = {
            'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
            'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
            'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
        }
        expected_nodes = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']

        graph = PipeGraph(steps=self.steps,
                          fit_connections=partial_fit_connections,
                          predict_connections=self.connections)
        graph.fit(self.X, self.y)

        self.assertEqual(sorted(graph._filter_fit_nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__predict_connections(self):
        """With only fit connections given, all five steps are predict nodes."""
        graph = PipeGraph(self.steps, self.connections)
        graph.fit(self.X, self.y)

        expected_nodes = [
            'Concatenate_Xy',
            'Gaussian_Mixture',
            'Dbscan',
            'Combine_Clustering',
            'Regressor',
        ]
        self.assertEqual(sorted(graph._filter_predict_nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__some_predict_connections(self):
        """Predict nodes are limited to the steps named in predict_connections."""
        # Only three steps are wired for prediction; fitting uses the full map.
        partial_predict_connections = {
            'Concatenate_Xy': {'df1': 'X', 'df2': 'y'},
            'Gaussian_Mixture': {'X': ('Concatenate_Xy', 'predict')},
            'Dbscan': {'X': ('Concatenate_Xy', 'predict')},
        }
        expected_nodes = ['Concatenate_Xy', 'Gaussian_Mixture', 'Dbscan']

        graph = PipeGraph(steps=self.steps,
                          fit_connections=self.connections,
                          predict_connections=partial_predict_connections)
        graph.fit(self.X, self.y)

        self.assertEqual(sorted(graph._filter_predict_nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__read_step_inputs_from_fit_data(self):
        """The Regressor's fit inputs read from graph data are exactly X and y."""
        graph = self.pgraph
        # Replace the fit-time data with a hand-made mapping; the Dbscan
        # entry is extra data that the Regressor is not connected to.
        graph._fit_data = {
            ('_External', 'X'): self.X,
            ('_External', 'y'): self.y,
            ('Dbscan', 'predict'): self.y * 4,
        }

        inputs = graph._read_fit_signature_variables_from_graph_data(
            graph_data=graph._fit_data, step_name='Regressor')

        assert_array_equal(inputs['X'], self.X)
        assert_array_equal(inputs['y'], self.y)
        self.assertEqual(len(inputs), 2)

    def test_Pipegraph__read_predict_signature_variables_from_graph_data(self):
        """The Regressor's predict inputs read from graph data contain only X."""
        graph = self.pgraph
        # Hand-made predict-time data; y and the Dbscan output are present
        # but must not show up among the Regressor's predict inputs.
        graph._predict_data = {
            ('_External', 'X'): self.X,
            ('_External', 'y'): self.y,
            ('Dbscan', 'predict'): self.y * 4,
        }

        inputs = graph._read_predict_signature_variables_from_graph_data(
            graph_data=graph._predict_data, step_name='Regressor')

        assert_array_equal(inputs['X'], self.X)
        self.assertEqual(len(inputs), 1)

    def test_Pipegraph__step__predict_lm(self):
        """pg_predict of a wrapped LinearRegression matches the estimator's own predict."""
        estimator = LinearRegression()
        wrapped = add_mixins_to_step(estimator)
        wrapped.pg_fit(X=self.X, y=self.y)

        # Argument order matches the original call: plain predict first.
        assert_array_equal(estimator.predict(self.X),
                           wrapped.pg_predict(X=self.X)['predict'])

    def test_Pipegraph__under_fit__concatenate_Xy(self):
        """Running _fit on 'Concatenate_Xy' stores X and y side by side."""
        graph = self.pgraph
        graph._fit_data = {
            ('_External', 'X'): self.X,
            ('_External', 'y'): self.y,
            ('Dbscan', 'predict'): self.y * 4,
        }
        expected = pd.concat([self.X, self.y], axis=1)

        graph._fit('Concatenate_Xy')

        def concatenated():
            # Looked up for every check, as the original assertions did.
            return graph._fit_data['Concatenate_Xy', 'predict']

        self.assertEqual(expected.shape, concatenated().shape)
        assert_frame_equal(self.X, concatenated().loc[:, ['X']])
        assert_frame_equal(self.y, concatenated().loc[:, ['y']])

    def test_Pipegraph__predict__concatenate_Xy(self):
        """pg_predict on the concatenation step returns X and y joined column-wise."""
        expected = pd.concat([self.X, self.y], axis=1)

        concat_step = self.pgraph._steps_dict['Concatenate_Xy']
        concat_step.pg_fit()
        joined = concat_step.pg_predict(df1=self.X, df2=self.y)['predict']

        self.assertEqual(expected.shape, joined.shape)
        assert_frame_equal(self.X, joined.loc[:, ['X']])
        assert_frame_equal(self.y, joined.loc[:, ['y']])
        assert_frame_equal(expected, joined)

    def test_Pipegraph__predict__gaussian_mixture(self):
        """pg_predict of the Gaussian-mixture step matches the step's own predict."""
        X = self.X
        current_step = self.pgraph._steps_dict['Gaussian_Mixture']
        # The step is refitted on X alone, then both prediction paths are
        # evaluated on that same X.
        current_step.pg_fit(X=X)
        expected = current_step.predict(X=X)
        result = current_step.pg_predict(X=X)['predict']
        assert_array_equal(expected, result)

    def test_Pipegraph__predict__dbscan(self):
        """pg_predict of the DBSCAN step matches the step's fit_predict on the same X."""
        X = self.X
        current_step = self.pgraph._steps_dict['Dbscan']
        current_step.pg_fit(X=X)
        # The reference value comes from fit_predict, not predict.
        expected = current_step.fit_predict(X=X)
        result = current_step.pg_predict(X=X)['predict']
        assert_array_equal(expected, result)

    def test_Pipegraph__combine_clustering_predict(self):
        """Combining DBSCAN and Gaussian-mixture labels yields labels whose minimum is 0."""
        steps_by_name = self.pgraph._steps_dict

        def joined_xy():
            # A new concatenated frame per call, as in the original code.
            return pd.concat([self.X, self.y], axis=1)

        gaussian_step = steps_by_name['Gaussian_Mixture']
        gaussian_step.pg_fit(X=joined_xy())
        gaussian_labels = gaussian_step.pg_predict(X=joined_xy())['predict']

        # The Dbscan step is not refitted here; only pg_predict is called.
        dbscan_labels = steps_by_name['Dbscan'].pg_predict(
            X=joined_xy())['predict']
        self.assertEqual(dbscan_labels.min(), 0)

        combiner = steps_by_name['Combine_Clustering']
        combiner.pg_fit(dominant=dbscan_labels, other=gaussian_labels)
        expected = combiner.predict(dominant=dbscan_labels,
                                    other=gaussian_labels)
        combined = combiner.pg_predict(dominant=dbscan_labels,
                                       other=gaussian_labels)['predict']
        assert_array_equal(expected, combined)
        self.assertEqual(combined.min(), 0)

    def test_Pipegraph__strategy__dict_key(self):
        """pg_predict of the concatenation step returns a single 'predict' entry."""
        concat_step = self.pgraph._steps_dict['Concatenate_Xy']
        concat_step.pg_fit()
        outputs = concat_step.pg_predict(df1=self.X, df2=self.y)
        self.assertEqual(list(outputs.keys()), ['predict'])

    def test_Pipegraph__dbscan__dict_key(self):
        """pg_predict of the DBSCAN step returns a single 'predict' entry."""
        dbscan_step = self.pgraph._steps_dict['Dbscan']
        dbscan_step.pg_fit(X=self.X)
        outputs = dbscan_step.pg_predict(X=self.X)
        self.assertEqual(list(outputs.keys()), ['predict'])

    def test_Pipegraph__combine_clustering__dict_key(self):
        """pg_predict of the combination step returns a single 'predict' entry."""
        combiner = self.pgraph._steps_dict['Combine_Clustering']
        # X and y stand in for the two label inputs; only the output keys
        # are inspected.
        combiner.pg_fit(dominant=self.X, other=self.y)
        outputs = combiner.pg_predict(dominant=self.X, other=self.y)
        self.assertEqual(list(outputs.keys()), ['predict'])

    def test_Pipegraph__gaussian_mixture__dict_key(self):
        """pg_predict of the Gaussian-mixture step returns 'predict' and 'predict_proba'."""
        X = self.X
        current_step = self.pgraph._steps_dict['Gaussian_Mixture']
        current_step.pg_fit(X=X)
        result = current_step.pg_predict(X=X)
        # Key order is not part of the check, hence the sort on both sides.
        self.assertEqual(sorted(result.keys()),
                         sorted(['predict', 'predict_proba']))

    def test_Pipegraph__regressor__dict_key(self):
        """pg_predict of the regressor step returns a single 'predict' entry."""
        regressor_step = self.pgraph._steps_dict['Regressor']
        regressor_step.pg_fit(X=self.X, y=self.y)
        outputs = regressor_step.pg_predict(X=self.X)
        self.assertEqual(list(outputs.keys()), ['predict'])

    def test_Pipegraph__fit_node_names(self):
        """After fit(), the fit graph holds one node per step."""
        fitted = self.pgraph.fit(self.X, self.y)
        expected_nodes = [
            'Concatenate_Xy',
            'Gaussian_Mixture',
            'Dbscan',
            'Combine_Clustering',
            'Regressor',
        ]
        self.assertEqual(sorted(fitted._fit_graph.nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__predict_node_names(self):
        """After fit(), the predict graph holds one node per step."""
        fitted = self.pgraph.fit(self.X, self.y)
        expected_nodes = [
            'Concatenate_Xy',
            'Gaussian_Mixture',
            'Dbscan',
            'Combine_Clustering',
            'Regressor',
        ]
        self.assertEqual(sorted(fitted._predict_graph.nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__filter_nodes_fit(self):
        """_filter_fit_nodes() yields all five steps of the fitted reference graph."""
        fitted = self.pgraph.fit(self.X, self.y)
        expected_nodes = [
            'Concatenate_Xy',
            'Dbscan',
            'Gaussian_Mixture',
            'Combine_Clustering',
            'Regressor',
        ]
        self.assertEqual(sorted(fitted._filter_fit_nodes()),
                         sorted(expected_nodes))

    def test_Pipegraph__filter_nodes_predict(self):
        """With predict connections for 'Regressor' only, it is the sole predict node."""
        regressor_only = {'Regressor': {'X': 'X', 'y': 'y'}}

        graph = PipeGraph(steps=self.steps,
                          fit_connections=self.connections,
                          predict_connections=regressor_only)
        graph.fit(self.X, self.y)

        # Compared unsorted: exactly one node is expected.
        self.assertEqual(list(graph._filter_predict_nodes()), ['Regressor'])

    def test_Pipegraph__graph_fit_using_keywords(self):
        """fit(X=..., y=...) stores the inputs and one output row per sample for each step."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)
        fit_data = graph._fit_data
        n_rows = self.y.shape[0]

        assert_frame_equal(fit_data['_External', 'X'], self.X)
        assert_frame_equal(fit_data['_External', 'y'], self.y)

        # Each clustering-related output has one label per row, starting at 0.
        for step_name in ('Dbscan', 'Gaussian_Mixture', 'Combine_Clustering'):
            labels = fit_data[step_name, 'predict']
            self.assertEqual(labels.shape[0], n_rows)
            self.assertEqual(labels.min(), 0)

        self.assertEqual(fit_data['Combine_Clustering', 'predict'].max(),
                         fit_data['Gaussian_Mixture', 'predict'].max())

        self.assertEqual(fit_data['Regressor', 'predict'].shape[0], n_rows)

    def test_Pipegraph__graph_fit_three_positional(self):
        """fit() called with three positional arguments raises ValueError."""
        graph = self.pgraph
        with self.assertRaises(ValueError):
            graph.fit(self.X, self.y, self.y)

    def test_Pipegraph__graph_fit_two_positional(self):
        """fit(X, y) with two positional arguments stores inputs and per-step outputs."""
        graph = self.pgraph
        graph.fit(self.X, self.y)
        fit_data = graph._fit_data
        n_rows = self.y.shape[0]

        assert_frame_equal(fit_data['_External', 'X'], self.X)
        assert_frame_equal(fit_data['_External', 'y'], self.y)

        # Each clustering-related output has one label per row, starting at 0.
        for step_name in ('Dbscan', 'Gaussian_Mixture', 'Combine_Clustering'):
            labels = fit_data[step_name, 'predict']
            self.assertEqual(labels.shape[0], n_rows)
            self.assertEqual(labels.min(), 0)

        self.assertEqual(fit_data['Combine_Clustering', 'predict'].max(),
                         fit_data['Gaussian_Mixture', 'predict'].max())

        self.assertEqual(fit_data['Regressor', 'predict'].shape[0], n_rows)

    def test_Pipegraph__graph_fit_one_positional(self):
        """Fit with ``X`` positional and ``y`` as keyword; inspect ``_fit_data``.

        The external inputs must be stored unchanged under the
        ``('_External', name)`` keys, and every checked node must expose a
        ``'predict'`` output with one row per sample.
        """
        graph = self.pgraph
        graph.fit(self.X, y=self.y)
        fit_data = graph._fit_data

        assert_frame_equal(fit_data['_External', 'X'], self.X)
        assert_frame_equal(fit_data['_External', 'y'], self.y)

        # Both clusterers: one output row per sample, smallest value 0.
        dbscan_out = fit_data['Dbscan', 'predict']
        self.assertEqual(dbscan_out.shape[0], self.y.shape[0])
        self.assertEqual(dbscan_out.min(), 0)

        mixture_out = fit_data['Gaussian_Mixture', 'predict']
        self.assertEqual(mixture_out.shape[0], self.y.shape[0])
        self.assertEqual(mixture_out.min(), 0)

        # The combined clustering must span the same range as the mixture.
        combined_out = fit_data['Combine_Clustering', 'predict']
        self.assertEqual(combined_out.shape[0], self.y.shape[0])
        self.assertEqual(combined_out.min(), 0)
        self.assertEqual(combined_out.max(),
                         fit_data['Gaussian_Mixture', 'predict'].max())

        regressor_out = fit_data['Regressor', 'predict']
        self.assertEqual(regressor_out.shape[0], self.y.shape[0])

    def test_Pipegraph__graph_predict_using_keywords(self):
        """Predict with ``X`` and ``y`` as keywords and inspect ``_predict_data``."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)
        graph.predict(X=self.X, y=self.y)

        stored = graph._predict_data
        assert_frame_equal(stored['_External', 'X'], self.X)
        assert_frame_equal(stored['_External', 'y'], self.y)

        # The regressor must emit one prediction row per sample.
        regressor_rows = stored['Regressor', 'predict'].shape[0]
        self.assertEqual(regressor_rows, self.y.shape[0])

    def test_Pipegraph__graph_predict_using_three_positional(self):
        """``predict`` called with three positional arguments raises ValueError."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)
        predict = graph.predict
        first, second, third = self.X, self.y, self.y
        with self.assertRaises(ValueError):
            predict(first, second, third)

    def test_Pipegraph__graph_predict_using_two_positional(self):
        """Predict with ``X`` and ``y`` both positional; inspect ``_predict_data``."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)
        graph.predict(self.X, self.y)

        stored = graph._predict_data
        assert_frame_equal(stored['_External', 'X'], self.X)
        assert_frame_equal(stored['_External', 'y'], self.y)

        # The regressor must emit one prediction row per sample.
        regressor_rows = stored['Regressor', 'predict'].shape[0]
        self.assertEqual(regressor_rows, self.y.shape[0])

    def test_Pipegraph__graph_predict_using_one_positional(self):
        """Predict with ``X`` positional and ``y`` keyword; inspect ``_predict_data``."""
        graph = self.pgraph
        graph.fit(X=self.X, y=self.y)
        graph.predict(self.X, y=self.y)

        stored = graph._predict_data
        assert_frame_equal(stored['_External', 'X'], self.X)
        assert_frame_equal(stored['_External', 'y'], self.y)

        # The regressor must emit one prediction row per sample.
        regressor_rows = stored['Regressor', 'predict'].shape[0]
        self.assertEqual(regressor_rows, self.y.shape[0])

    def test_Pipegraph__step_get_params_multiple(self):
        """Check ``get_params`` of several steps held in ``_steps_dict``."""
        steps_dict = self.pgraph._steps_dict

        # The concatenation step exposes no parameters.
        concatenate_params = steps_dict['Concatenate_Xy'].get_params()
        self.assertEqual(concatenate_params, {})

        mixture_params = steps_dict['Gaussian_Mixture'].get_params()
        self.assertEqual(mixture_params['n_components'], 3)

        dbscan_params = steps_dict['Dbscan'].get_params()
        self.assertEqual(dbscan_params['eps'], 0.5)

        # Neither does the step that combines the clusterings.
        combine_params = steps_dict['Combine_Clustering'].get_params()
        self.assertEqual(combine_params, {})

    def test_Pipegraph__step_set_params_multiple(self):
        """Check ``set_params`` on individual steps of the graph.

        Unknown parameters must raise ``ValueError``. Valid parameters must
        be reflected by a subsequent ``get_params`` call and must persist
        across later ``set_params`` calls on the same step.
        """
        steps_dict = self.pgraph._steps_dict

        self.assertRaises(ValueError,
                          steps_dict['Concatenate_Xy'].set_params,
                          ham=2,
                          spam=9)
        self.assertEqual(
            steps_dict['Gaussian_Mixture'].set_params(
                n_components=5).get_params()['n_components'], 5)
        self.assertEqual(
            steps_dict['Dbscan'].set_params(eps=10.2).get_params()['eps'],
            10.2)

        # Only the parameters this test controls are compared. Pinning the
        # regressor's complete parameter dict (e.g. the defaults of
        # ``n_jobs`` and ``normalize``) ties the test to one scikit-learn
        # release, because those defaults differ between versions.
        regressor = steps_dict['Regressor']

        params = regressor.set_params(copy_X=False).get_params()
        self.assertEqual(params['copy_X'], False)
        self.assertEqual(params['fit_intercept'], True)

        # A later set_params call must not reset the earlier change.
        params = regressor.set_params(n_jobs=13).get_params()
        self.assertEqual(params['n_jobs'], 13)
        self.assertEqual(params['copy_X'], False)
        self.assertEqual(params['fit_intercept'], True)

    def test_Pipegraph__named_steps(self):
        """``named_steps`` maps each step name to its step; five steps expected."""
        graph = self.pgraph
        expected_mapping = dict(self.steps)
        self.assertEqual(graph.named_steps, expected_mapping)
        step_count = len(graph.named_steps)
        self.assertEqual(step_count, 5)
###############################################################################
# Secondly, we define the steps and a ``param_grid`` dictionary as specified by :class:`GridSearchCV`.
# In this case we just want to explore a few possibilities, varying the degree of the polynomials
# and whether or not to fit an intercept in the linear model.

steps = [('scaler', scaler), ('polynomial_features', polynomial_features),
         ('linear_model', linear_model)]

# Keys follow the ``<step name>__<parameter>`` convention: degrees 1..10 and
# both settings of ``fit_intercept`` are explored.
param_grid = {
    'polynomial_features__degree': range(1, 11),
    'linear_model__fit_intercept': [True, False]
}

###############################################################################
# Now, we use a ``PipeGraph`` as estimator for :class:`GridSearchCV` and perform the ``fit`` and ``predict`` operations.
# NOTE(review): the original text named ``PipeGraphRegressor`` here, but the code instantiates ``PipeGraph`` — confirm which is intended.

pgraph = PipeGraph(steps=steps)
grid_search_regressor = GridSearchCV(estimator=pgraph,
                                     param_grid=param_grid,
                                     refit=True)
grid_search_regressor.fit(X, y)
y_pred = grid_search_regressor.predict(X)

# Plot the data and the predictions on the same axes.
plt.scatter(X, y)
plt.scatter(X, y_pred)
plt.show()

# Read the fitted coefficients and the selected polynomial degree from the
# best estimator found by the search.
coef = grid_search_regressor.best_estimator_.get_params()['linear_model'].coef_
degree = grid_search_regressor.best_estimator_.get_params(
)['polynomial_features'].degree

print(
        'y': ('demux', 'y_1')
    },
    'lm_2': {
        'X': ('demux', 'X_2'),
        'y': ('demux', 'y_2')
    },
    'mux': {
        '0': 'lm_0',
        '1': 'lm_1',
        '2': 'lm_2',
        'selection': 'selection'
    }
}

# Build the inner graph from the steps and fit connections defined earlier.
three_multiplexed_models = PipeGraph(
    steps=three_multiplexed_models_steps,
    fit_connections=three_multiplexed_models_connections)

#########################################################################################################
# Now we can treat this PipeGraph as a reusable component and use it as a single step in another PipeGraph:
scaler = MinMaxScaler()
gaussian_mixture = GaussianMixture(n_components=3)
# Alias for the inner graph; the steps list below refers to the graph by its
# original name.
models = three_multiplexed_models

# Outer graph: scaler, a three-component mixture named 'classifier', and the
# inner graph as the 'models' step.
steps = [
    ('scaler', scaler),
    ('classifier', gaussian_mixture),
    ('models', three_multiplexed_models),
]

connections = {
Beispiel #15
0
# Features and class labels taken from the ``iris`` object defined earlier.
X = iris.data
y = iris.target

scaler = MinMaxScaler()
gaussian_nb = GaussianNB()
svc = SVC()
mlp = MLPClassifier()
concatenator = Concatenator()

steps = [('scaler', scaler), ('gaussian_nb', gaussian_nb), ('svc', svc),
         ('concat', concatenator), ('mlp', mlp)]

###############################################################################
# In this example the result is a classification. The graph is built below as a plain :class:`PipeGraph`.
# NOTE(review): the original text said a :class:`PipeGraphClassifier` is used, to take advantage of
# Scikit-Learn's default scoring method for classifiers — confirm which class is intended.
#
# The chained ``inject`` calls wire the graph as follows:
#   * ``scaler`` receives the external ``X``;
#   * ``gaussian_nb`` and ``svc`` each receive ``X`` from ``scaler`` and ``y`` via ``source_var='y'``;
#   * ``concat`` receives ``X1``, ``X2`` and ``X3`` from ``scaler``, ``gaussian_nb`` and ``svc``;
#   * ``mlp`` receives ``X`` from ``concat`` and ``y`` via ``source_var='y'``.

pgraph = PipeGraph(steps=steps)
(pgraph.inject(sink='scaler', sink_var='X', source='_External',
               source_var='X').inject('gaussian_nb', 'X', 'scaler').inject(
                   'gaussian_nb', 'y',
                   source_var='y').inject('svc', 'X', 'scaler').inject(
                       'svc', 'y',
                       source_var='y').inject('concat', 'X1', 'scaler').inject(
                           'concat', 'X2',
                           'gaussian_nb').inject('concat', 'X3', 'svc').inject(
                               'mlp', 'X', 'concat').inject('mlp',
                                                            'y',
                                                            source_var='y'))

param_grid = {
    'svc__C': [0.1, 0.5, 1.0],
    'mlp__hidden_layer_sizes': [
Beispiel #16
0
class TestModelsWithDataDependentNumberOfReplicas(unittest.TestCase):
    """Tests for ``RegressorsWithDataDependentNumberOfReplicas`` as a graph step.

    The fixture is a PipeGraph made of a scaler, a Gaussian-mixture
    classifier, the replicated-regressors step and a neutral regressor.
    """

    def setUp(self):
        """Create three noisy linear segments and fit the graph on them.

        The segments have slopes 4, -4 and 2, and their ``X`` values are
        shifted by 0, 3 and 6 respectively. The data is random and unseeded.
        """
        X_first = pd.Series(np.random.rand(1000, ))
        y_first = pd.Series(4 * X_first + 0.5 * np.random.randn(1000, ))

        X_second = pd.Series(np.random.rand(1000, ) + 3)
        y_second = pd.Series(-4 * X_second + 0.5 * np.random.randn(1000, ))

        X_third = pd.Series(np.random.rand(1000, ) + 6)
        y_third = pd.Series(2 * X_third + 0.5 * np.random.randn(1000, ))

        self.X = pd.concat([X_first, X_second, X_third], axis=0).to_frame()
        self.y = pd.concat([y_first, y_second, y_third], axis=0).to_frame()
        scaler = MinMaxScaler()
        gaussian_mixture = GaussianMixture(n_components=3)
        models = RegressorsWithDataDependentNumberOfReplicas(
            steps=[('regressor', LinearRegression())])
        neutral_regressor = NeutralRegressor()

        steps = [('scaler', scaler),
                 ('classifier', gaussian_mixture),
                 ('models', models),
                 ('neutral', neutral_regressor)]

        # The classifier output is fed to the 'models' step as 'selection'.
        connections = {'scaler': {'X': 'X'},
                       'classifier': {'X': 'scaler'},
                       'models': {'X': 'scaler',
                                  'y': 'y',
                                  'selection': 'classifier'},
                       'neutral': {'X': 'models'},
                       }

        self.pgraph = PipeGraph(steps=steps, fit_connections=connections)
        self.pgraph.fit(self.X, self.y)

    def test_ModelsWithDataDependentNumberOfReplicas__connections(self):
        """The 'models' step keeps its type, inner connections and step names."""
        X = self.X
        y = self.y
        pgraph = self.pgraph

        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)

        self.assertIsInstance(pgraph.named_steps['models'],
                              RegressorsWithDataDependentNumberOfReplicas)
        result_connections = pgraph.named_steps['models']._pipegraph.fit_connections
        expected_connections = {'regressorsBundle': {'X': 'X', 'selection': 'selection', 'y': 'y'}}
        self.assertEqual(result_connections, expected_connections)
        result_steps = sorted(pgraph.named_steps.keys())
        expected_steps = sorted(['scaler', 'classifier', 'models', 'neutral'])
        self.assertEqual(result_steps, expected_steps)
        self.assertEqual(y_pred.shape[0], y.shape[0])

    def test_ModelsWithDataDependentNumberOfReplicas__predict(self):
        """``predict`` returns one row per input sample."""
        X = self.X
        y = self.y
        pgraph = self.pgraph

        pgraph.fit(X, y)
        y_pred = pgraph.predict(X)
        self.assertEqual(y_pred.shape[0], y.shape[0])

    def test_ModelsWithDataDependentNumberOfReplicas__score(self):
        """``score`` runs and returns a value above the loose bound -42."""
        X = self.X
        y = self.y
        pgraph = self.pgraph

        pgraph.fit(X, y)
        result = pgraph.score(X, y)
        self.assertGreater(result, -42)

    def test_ModelsWithDataDependentNumberOfReplicas__GridSearchCV(self):
        """The graph works as a ``GridSearchCV`` estimator over the cluster count."""
        X = self.X
        y = self.y

        X_train, X_test, y_train, y_test = train_test_split(X, y)

        pgraph = self.pgraph
        param_grid = {'classifier__n_components': range(2, 10)}
        gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
        gs.fit(X_train, y_train)
        result = gs.score(X_test, y_test)
        self.assertGreater(result, -42)
Beispiel #17
0
    def test_Pipegraph__ex_3_inject(self):
        """Build a graph with chained ``inject`` calls and check its connections.

        A PipeGraph created from steps alone has no fit or predict
        connections. After the ``inject`` chain both must be set, and after
        ``fit`` both must equal the expected mapping, in which an omitted
        source resolves to ``'_External'`` and an omitted source variable
        resolves to ``'predict'``.
        """
        import numpy as np
        import pandas as pd
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import PolynomialFeatures
        from sklearn.linear_model import LinearRegression
        from pipegraph.base import PipeGraph
        from pipegraph.demo_blocks import CustomPower

        X = pd.DataFrame(
            dict(X=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                 sample_weight=np.array([
                     0.01, 0.95, 0.10, 0.95, 0.95, 0.10, 0.10, 0.95, 0.95,
                     0.95, 0.01
                 ])))
        y = np.array([10, 4, 20, 16, 25, -60, 85, 64, 81, 100, 150])

        scaler = MinMaxScaler()
        polynomial_features = PolynomialFeatures()
        linear_model = LinearRegression()
        custom_power = CustomPower()
        # NOTE(review): ColumnSelector is not imported inside this test;
        # presumably it comes from the module-level imports — confirm.
        selector = ColumnSelector(mapping={
            'X': slice(0, 1),
            'sample_weight': slice(1, 2)
        })

        steps = [('selector', selector), ('custom_power', custom_power),
                 ('scaler', scaler),
                 ('polynomial_features', polynomial_features),
                 ('linear_model', linear_model)]

        pgraph = PipeGraph(steps=steps)

        # No connections exist until something is injected.
        self.assertIsNone(pgraph.fit_connections)
        self.assertIsNone(pgraph.predict_connections)

        # One inject call per edge of the graph.
        (pgraph
         .inject(sink='selector', sink_var='X',
                 source='_External', source_var='X')
         .inject('custom_power', 'X', 'selector', 'sample_weight')
         .inject('scaler', 'X', 'selector', 'X')
         .inject('polynomial_features', 'X', 'scaler')
         .inject('linear_model', 'X', 'polynomial_features')
         .inject('linear_model', 'y', source_var='y')
         .inject('linear_model', 'sample_weight', 'custom_power'))

        self.assertIsNotNone(pgraph.fit_connections)
        self.assertIsNotNone(pgraph.predict_connections)
        pgraph.fit(X, y)

        # Fit and predict connections are expected to be identical.
        expected_connections = {
            'selector': {
                'X': ('_External', 'X')
            },
            'custom_power': {
                'X': ('selector', 'sample_weight')
            },
            'scaler': {
                'X': ('selector', 'X')
            },
            'polynomial_features': {
                'X': ('scaler', 'predict')
            },
            'linear_model': {
                'X': ('polynomial_features', 'predict'),
                'y': ('_External', 'y'),
                'sample_weight': ('custom_power', 'predict')
            }
        }
        self.assertEqual(pgraph.fit_connections, expected_connections)
        self.assertEqual(pgraph.predict_connections, expected_connections)
        'X': 'scaler',
        'y': 'y',
        'selection': 'classifier'
    },
    'lm_0': {
        'X': ('demux', 'X_0'),
        'y': ('demux', 'y_0')
    },
    'lm_1': {
        'X': ('demux', 'X_1'),
        'y': ('demux', 'y_1')
    },
    'lm_2': {
        'X': ('demux', 'X_2'),
        'y': ('demux', 'y_2')
    },
    'mux': {
        '0': 'lm_0',
        '1': 'lm_1',
        '2': 'lm_2',
        'selection': 'classifier'
    }
}

# Build the graph from the steps and fit connections defined above, then fit it.
pgraph = PipeGraph(steps=steps, fit_connections=connections)
pgraph.fit(X, y)
#%%
# Predict on the training inputs and plot data and predictions on the same axes.
y_pred = pgraph.predict(X)
plt.scatter(X, y)
plt.scatter(X, y_pred)
# Next we define the steps and build the :class:`PipeGraph` to be used as estimator for :class:`GridSearchCV`.
# NOTE(review): the original text named :class:`PipeGraphRegressor`, but the code instantiates :class:`PipeGraph` — confirm which is intended.

scaler = MinMaxScaler()
polynomial_features = PolynomialFeatures()
linear_model = LinearRegression()
custom_power = CustomPower()
# Split the input by column position: column 0 becomes 'X', column 1 becomes 'sample_weight'.
selector = ColumnSelector(mapping={
    'X': slice(0, 1),
    'sample_weight': slice(1, 2)
})

steps = [('selector', selector), ('custom_power', custom_power),
         ('scaler', scaler), ('polynomial_features', polynomial_features),
         ('linear_model', linear_model)]

pgraph = PipeGraph(steps=steps)

# The chained ``inject`` calls wire the graph as follows:
#   * ``selector`` receives the external ``X``;
#   * ``custom_power`` receives the selector's ``sample_weight`` output;
#   * ``scaler`` receives the selector's ``X`` output;
#   * ``polynomial_features`` receives the output of ``scaler``;
#   * ``linear_model`` receives ``X`` from ``polynomial_features``, ``y`` via
#     ``source_var='y'``, and ``sample_weight`` from ``custom_power``.
(pgraph.inject(
    sink='selector', sink_var='X', source='_External', source_var='X').inject(
        'custom_power', 'X', 'selector',
        'sample_weight').inject('scaler', 'X', 'selector', 'X').inject(
            'polynomial_features', 'X', 'scaler').inject(
                'linear_model', 'X',
                'polynomial_features').inject('linear_model',
                                              'y',
                                              source_var='y').inject(
                                                  'linear_model',
                                                  'sample_weight',
                                                  'custom_power'))

###############################################################################
Beispiel #20
0
# Fit connections: the scaler receives the external X, the bundle receives the
# scaled X plus y, and the neutral step receives the bundle output plus y.
connections = {
    'scaler': {
        'X': 'X'
    },
    'bundle': {
        'X': 'scaler',
        'y': 'y'
    },
    'neutral': {
        'X': 'bundle',
        'y': 'y'
    }
}

pgraph = PipeGraph(steps=steps, fit_connections=connections)

##############################################################################################################
# Using GridSearchCV to find the best number of clusters and the best regressors

from sklearn.model_selection import GridSearchCV

# Explore 3..9 mixture components for the classifier nested inside the bundle step.
param_grid = {'bundle__classifier__n_components': range(3, 10)}
gs = GridSearchCV(estimator=pgraph, param_grid=param_grid, refit=True)
gs.fit(X_train, y_train)
# Predict on the training inputs and plot them against the training targets.
y_pred = gs.predict(X_train)
plt.scatter(X_train, y_train)
plt.scatter(X_train, y_pred)
# Report the held-out score and the selected number of components.
print("Score:", gs.score(X_test, y_test))
print("bundle__classifier__n_components:",
      gs.best_estimator_.get_params()['bundle__classifier__n_components'])
Beispiel #21
0
# Fit connections of the inner graph: 'demux' receives X, y and selection;
# each 'lm_i' receives the matching ('demux', 'X_i') / ('demux', 'y_i') pair;
# 'mux' receives the three 'lm_i' outputs together with the same selection.
three_multiplexed_models_connections = {
                'demux': {'X': 'X',
                          'y': 'y',
                          'selection': 'selection'},
                'lm_0': {'X': ('demux', 'X_0'),
                         'y': ('demux', 'y_0')},
                'lm_1': {'X': ('demux', 'X_1'),
                         'y': ('demux', 'y_1')},
                'lm_2': {'X': ('demux', 'X_2'),
                         'y': ('demux', 'y_2')},
                'mux': {'0': 'lm_0',
                        '1': 'lm_1',
                        '2': 'lm_2',
                        'selection': 'selection'}}

three_multiplexed_models = PipeGraph(steps=three_multiplexed_models_steps,
                                     fit_connections=three_multiplexed_models_connections )

#########################################################################################################
# Now we can treat this PipeGraph as a reusable component and use it as a single step in another PipeGraph:
scaler = MinMaxScaler()
gaussian_mixture = GaussianMixture(n_components=3)
# Alias for the inner graph; the steps list below refers to the graph by its
# original name.
models = three_multiplexed_models

# Outer graph: scaler, a three-component mixture named 'classifier', and the
# inner graph as the 'models' step.
steps = [('scaler', scaler),
         ('classifier', gaussian_mixture),
         ('models', three_multiplexed_models), ]

connections = {'scaler': {'X': 'X'},
               'classifier': {'X': 'scaler'},
               'models': {'X': 'scaler',
                          'y': 'y',