Example #1
    def test_limit_features(self):
        X, X_rdd = self.make_text_rdd()

        params = [{
            'min_df': .5
        }, {
            'min_df': 2,
            'max_df': .9
        }, {
            'min_df': 1,
            'max_df': .6
        }, {
            'min_df': 2,
            'max_features': 3
        }]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X).toarray()
            result_dist = dist.fit_transform(X_rdd).toarray()

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local, result_dist)

            result_dist = dist.transform(X_rdd).toarray()
            assert_array_equal(result_local, result_dist)
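A note on the fixture: every test on this page builds its data through a helper on the suite's base class (make_text_rdd here, generate_text_dataset in older variants), which is not shown on this page. A minimal sketch of what it plausibly looks like, assuming splearn's ArrayRDD, a SparkContext on self.sc, and a small corpus named ALL_FOOD_DOCS (the corpus name is an assumption):

    from splearn.rdd import ArrayRDD

    def make_text_rdd(self, blocks=-1):
        # Hypothetical reconstruction: return the corpus twice, once as a
        # plain list and once as an ArrayRDD, so the local and distributed
        # estimators are fed identical documents.
        X = ALL_FOOD_DOCS  # assumed module-level list of short documents
        X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=blocks)
        return X, X_rdd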
Example #2
    def test_same_output(self):
        X, X_rdd = self.generate_text_dataset()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local.toarray(), result_dist.toarray())
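Note the collection pattern above: the distributed fit_transform yields an RDD of per-block scipy.sparse matrices, so the test collects the blocks and stitches them back together before comparing against the local result. A tiny standalone illustration of that last step (the blocks here are made up):

    import scipy.sparse as sp

    # Each Spark partition contributes one sparse block; vstack restores
    # the single matrix a local estimator would have produced.
    blocks = [sp.csr_matrix([[1, 0]]), sp.csr_matrix([[0, 2]])]
    full = sp.vstack(blocks)
    assert full.shape == (2, 2)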
Example #4
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X).toarray()
        result_dist = dist.fit_transform(X_rdd).toarray()

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local, result_dist)
Example #5
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
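Example #5 unions two very different analyzers. For readers unfamiliar with char_wb: it emits character n-grams padded to word boundaries, so the combined feature space is character trigrams plus whole words. A quick local illustration with plain scikit-learn (made-up input):

    from sklearn.feature_extraction.text import CountVectorizer

    # 'char_wb' pads each word with spaces before slicing n-grams, so the
    # trigrams of "spam ham" include boundary grams like ' sp' and 'am '.
    v = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    v.fit(["spam ham"])
    print(sorted(v.vocabulary_))  # e.g. [' ha', ' sp', 'am ', 'ham', 'pam', 'spa']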
Example #6
    def test_pipeline_init(self):
        # Test the various init parameters of the pipeline.
        assert_raises(TypeError, SparkPipeline)
        # Check that we can't instantiate pipelines with objects that lack a
        # fit method
        assert_raises(TypeError, SparkPipeline, [('svc', IncorrectT)])
        # Smoke test with only an estimator
        clf = T()
        pipe = SparkPipeline([('svc', clf)])
        assert_equal(pipe.get_params(deep=True),
                     dict(svc__a=None, svc__b=None, svc=clf,
                          **pipe.get_params(deep=False)
                          ))

        # Check that params are set
        pipe.set_params(svc__a=0.1)
        assert_equal(clf.a, 0.1)
        assert_equal(clf.b, None)
        # Smoke test the repr:
        repr(pipe)

        # Test with two objects
        vect = SparkCountVectorizer()
        filter = SparkVarianceThreshold()
        pipe = SparkPipeline([('vect', vect), ('filter', filter)])

        # Check that we can't use the same stage name twice
        assert_raises(ValueError, SparkPipeline,
                      [('vect', vect), ('vect', vect)])

        # Check that params are set
        pipe.set_params(vect__min_df=0.1)
        assert_equal(vect.min_df, 0.1)
        # Smoke test the repr:
        repr(pipe)

        # Check that params are not set when naming them wrong
        assert_raises(ValueError, pipe.set_params, filter__min_df=0.1)

        # Test clone
        pipe2 = clone(pipe)
        assert_false(pipe.named_steps['vect'] is pipe2.named_steps['vect'])

        # Check that apart from estimators, the parameters are the same
        params = pipe.get_params(deep=True)
        params2 = pipe2.get_params(deep=True)

        for x in pipe.get_params(deep=False):
            params.pop(x)

        for x in pipe2.get_params(deep=False):
            params2.pop(x)

        # Remove estimators that were copied
        params.pop('vect')
        params.pop('filter')
        params2.pop('vect')
        params2.pop('filter')
        assert_equal(params, params2)
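The dummies T and IncorrectT above are defined elsewhere in the test module. A minimal sketch consistent with the assertions, where only T gains a fit method (which is why the IncorrectT pipeline must raise TypeError):

    class IncorrectT(object):
        # Takes parameters but has no fit method, so SparkPipeline
        # construction with it must fail.
        def __init__(self, a=None, b=None):
            self.a = a
            self.b = b

    class T(IncorrectT):
        # A do-nothing estimator: just enough interface for the
        # get_params, set_params and clone checks in the test above.
        def fit(self, X, y=None):
            return self

        def get_params(self, deep=False):
            return {'a': self.a, 'b': self.b}

        def set_params(self, **params):
            for key, value in params.items():
                setattr(self, key, value)
            return self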
Example #7
    def test_limit_features(self):
        X, X_rdd = self.generate_text_dataset()

        params = [{'min_df': .5},
                  {'min_df': 2, 'max_df': .9},
                  {'min_df': 1, 'max_df': .6},
                  {'min_df': 2, 'max_features': 3}]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X)
            result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local.toarray(), result_dist.toarray())

            result_dist = sp.vstack(dist.transform(X_rdd).collect())
            assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #8
    def test_limit_features(self):
        X, X_rdd = self.make_text_rdd()

        params = [{'min_df': .5},
                  {'min_df': 2, 'max_df': .9},
                  {'min_df': 1, 'max_df': .6},
                  {'min_df': 2, 'max_features': 3}]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X).toarray()
            result_dist = dist.fit_transform(X_rdd).toarray()

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local, result_dist)

            result_dist = dist.transform(X_rdd).toarray()
            assert_array_equal(result_local, result_dist)
Example #9
    def test_same_result_weight(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word)],
                                 transformer_weights={"words": 10})
        dist_union = SparkFeatureUnion([("chars", dist_char),
                                        ("words", dist_word)],
                                       transformer_weights={"words": 10})

        loc_union.fit(X)
        dist_union.fit(Z)

        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
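transformer_weights simply scales each named transformer's output block before the horizontal stack, so the "words" columns above are multiplied by 10 on both the local and the distributed side. A local illustration of the scaling (toy input):

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import FeatureUnion

    # The weight multiplies the transformer's block: the raw counts
    # [1, 2] for "one two two" come out as [10, 20].
    union = FeatureUnion([("words", CountVectorizer())],
                         transformer_weights={"words": 10})
    print(union.fit_transform(["one two two"]).toarray())  # [[10 20]]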
Example #10
    def test_same_result_withdictrdd(self):
        X, X_rdd = self.make_text_rdd(2)
        Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
        Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        loc_word_2 = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")
        dist_word_2 = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word),
                                  ("words2", loc_word_2)])
        dist_union = SparkFeatureUnion([("chars", dist_char),
                                        ("words", dist_word),
                                        ("words2", dist_word_2)])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        converted_union = dist_union.to_scikit()

        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        assert_equal(
            loc_union.get_feature_names(),
            converted_union.get_feature_names()
        )

        # test same results
        Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
        assert_array_equal(
            loc_union.transform(X).toarray(), Z_transformed.toarray())
        assert_array_equal(
            loc_union.transform(X).toarray(),
            converted_union.transform(X).toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        X_converted_transformed = converted_union.fit_transform(X)
        Z_transformed = sp.vstack(
            dist_union.fit_transform(Z)[:, 'X'].collect())

        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           X_converted_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([("chars", loc_char),
                                      ("words", loc_word)],
                                     n_jobs=2)
        dist_union_par = SparkFeatureUnion([("chars", dist_char),
                                            ("words", dist_word)],
                                           n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        converted_union = dist_union_par.to_scikit()
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(
            dist_union_par.transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           converted_union.transform(X).toarray())
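A closing note on to_scikit(): as the assertions above show, it converts the fitted distributed union into a plain scikit-learn FeatureUnion that retains the learned vocabularies, so the converted estimator can transform local, non-RDD inputs and still match the Spark results exactly.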