def test_pipeline_same_results(self):
        X, y, Z = self.make_classification(2, 10000, 2000)

        loc_clf = LogisticRegression()
        loc_filter = VarianceThreshold()
        loc_pipe = Pipeline([
            ('threshold', loc_filter),
            ('logistic', loc_clf)
        ])

        dist_clf = SparkLogisticRegression()
        dist_filter = SparkVarianceThreshold()
        dist_pipe = SparkPipeline([
            ('threshold', dist_filter),
            ('logistic', dist_clf)
        ])

        dist_filter.fit(Z)
        loc_pipe.fit(X, y)
        dist_pipe.fit(Z, logistic__classes=np.unique(y))

        assert_true(np.mean(np.abs(
            loc_pipe.predict(X) -
            np.concatenate(dist_pipe.predict(Z[:, 'X']).collect())
        )) < 0.1)
    def test_pipeline_same_results(self):
        X, y, Z = self.make_classification(2, 10000, 2000)

        loc_clf = LogisticRegression()
        loc_filter = VarianceThreshold()
        loc_pipe = Pipeline([
            ('threshold', loc_filter),
            ('logistic', loc_clf)
        ])

        dist_clf = SparkLogisticRegression()
        dist_filter = SparkVarianceThreshold()
        dist_pipe = SparkPipeline([
            ('threshold', dist_filter),
            ('logistic', dist_clf)
        ])

        dist_filter.fit(Z)
        loc_pipe.fit(X, y)
        dist_pipe.fit(Z, logistic__classes=np.unique(y))

        assert_true(np.mean(np.abs(
            loc_pipe.predict(X) -
            np.concatenate(dist_pipe.predict(Z[:, 'X']).collect())
        )) < 0.1)
    def test_same_transform_with_treshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X, X_rdd = self.generate_dataset()
        result_local = local.fit_transform(X)
        result_dist = np.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_almost_equal(result_local, result_dist)

        X, X_rdd = self.generate_sparse_dataset()
        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
    def test_pipeline_init(self):
        # Test the various init parameters of the pipeline.
        assert_raises(TypeError, SparkPipeline)
        # Check that we can't instantiate pipelines with objects without fit
        # method
        pipe = assert_raises(TypeError, SparkPipeline, [('svc', IncorrectT)])
        # Smoke test with only an estimator
        clf = T()
        pipe = SparkPipeline([('svc', clf)])
        assert_equal(pipe.get_params(deep=True),
                     dict(svc__a=None, svc__b=None, svc=clf,
                          **pipe.get_params(deep=False)
                          ))

        # Check that params are set
        pipe.set_params(svc__a=0.1)
        assert_equal(clf.a, 0.1)
        assert_equal(clf.b, None)
        # Smoke test the repr:
        repr(pipe)

        # Test with two objects
        vect = SparkCountVectorizer()
        filter = SparkVarianceThreshold()
        pipe = SparkPipeline([('vect', vect), ('filter', filter)])

        # Check that we can't use the same stage name twice
        assert_raises(ValueError, SparkPipeline,
                      [('vect', vect), ('vect', vect)])

        # Check that params are set
        pipe.set_params(vect__min_df=0.1)
        assert_equal(vect.min_df, 0.1)
        # Smoke test the repr:
        repr(pipe)

        # Check that params are not set when naming them wrong
        assert_raises(ValueError, pipe.set_params, filter__min_df=0.1)

        # Test clone
        pipe2 = clone(pipe)
        assert_false(pipe.named_steps['vect'] is pipe2.named_steps['vect'])

        # Check that apart from estimators, the parameters are the same
        params = pipe.get_params(deep=True)
        params2 = pipe2.get_params(deep=True)

        for x in pipe.get_params(deep=False):
            params.pop(x)

        for x in pipe2.get_params(deep=False):
            params2.pop(x)

        # Remove estimators that where copied
        params.pop('vect')
        params.pop('filter')
        params2.pop('vect')
        params2.pop('filter')
        assert_equal(params, params2)
    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None),
                  ((1e3, 20), None),
                  ((1e3, 20), 100),
                  ((1e4, 100), None),
                  ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X_dense, X_dense_rdd = self.make_dense_rdd()
            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

            local.fit(X_dense)
            dist.fit(X_dense_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            local.fit(X_sparse)
            dist.fit(X_sparse_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            dist.fit(Z)
            assert_array_almost_equal(local.variances_, dist.variances_)
    def test_same_transform_with_treshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        result_local = local.fit_transform(X_dense)
        result_dist = np.vstack(dist.fit_transform(X_dense_rdd).collect())
        assert_array_almost_equal(result_local, result_dist)

        result_local = local.fit_transform(X_sparse)
        result_dist = sp.vstack(dist.fit_transform(X_sparse_rdd).collect())
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
        result_dist = sp.vstack(dist.fit_transform(Z_rdd)[:, 'X'].collect())
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None),
                  ((1e3, 20), None),
                  ((1e3, 20), 100),
                  ((1e4, 100), None),
                  ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X, X_rdd = self.generate_dataset(shape, block_size)
            local.fit(X)
            dist.fit(X_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            X, X_rdd = self.generate_sparse_dataset()
            local.fit(X)
            dist.fit(X_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)
    def test_same_transform_with_treshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        result_local = local.fit_transform(X_dense)
        result_dist = dist.fit_transform(X_dense_rdd)
        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
        assert_array_almost_equal(result_local, result_dist.toarray())

        result_local = local.fit_transform(X_sparse)
        result_dist = dist.fit_transform(X_sparse_rdd)
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())

        result_dist = dist.fit_transform(Z_rdd)[:, 'X']
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
Exemple #9
0
    def test_same_transform_with_treshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        result_local = local.fit_transform(X_dense)
        result_dist = dist.fit_transform(X_dense_rdd)
        assert_true(check_rdd_dtype(result_dist, (np.ndarray, )))
        assert_array_almost_equal(result_local, result_dist.toarray())

        result_local = local.fit_transform(X_sparse)
        result_dist = dist.fit_transform(X_sparse_rdd)
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix, )))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())

        result_dist = dist.fit_transform(Z_rdd)[:, 'X']
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix, )))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
Exemple #10
0
    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None), ((1e3, 20), None), ((1e3, 20), 100),
                  ((1e4, 100), None), ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X_dense, X_dense_rdd = self.make_dense_rdd()
            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

            local.fit(X_dense)
            dist.fit(X_dense_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            local.fit(X_sparse)
            dist.fit(X_sparse_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            dist.fit(Z)
            assert_array_almost_equal(local.variances_, dist.variances_)