Beispiel #1
0
    def test_limit_features(self):
        X, X_rdd = self.make_text_rdd()

        params = [{
            'min_df': .5
        }, {
            'min_df': 2,
            'max_df': .9
        }, {
            'min_df': 1,
            'max_df': .6
        }, {
            'min_df': 2,
            'max_features': 3
        }]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X).toarray()
            result_dist = dist.fit_transform(X_rdd).toarray()

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local, result_dist)

            result_dist = dist.transform(X_rdd).toarray()
            assert_array_equal(result_local, result_dist)
    def test_same_output(self):
        X, X_rdd = self.generate_text_dataset()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Beispiel #3
0
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X).toarray()
        result_dist = dist.fit_transform(X_rdd).toarray()

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local, result_dist)
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X).toarray()
        result_dist = dist.fit_transform(X_rdd).toarray()

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local, result_dist)
    def test_limit_features(self):
        X, X_rdd = self.generate_text_dataset()

        params = [{'min_df': .5},
                  {'min_df': 2, 'max_df': .9},
                  {'min_df': 1, 'max_df': .6},
                  {'min_df': 2, 'max_features': 3}]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X)
            result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local.toarray(), result_dist.toarray())

            result_dist = sp.vstack(dist.transform(X_rdd).collect())
            assert_array_equal(result_local.toarray(), result_dist.toarray())
    def test_limit_features(self):
        X, X_rdd = self.make_text_rdd()

        params = [{'min_df': .5},
                  {'min_df': 2, 'max_df': .9},
                  {'min_df': 1, 'max_df': .6},
                  {'min_df': 2, 'max_features': 3}]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X).toarray()
            result_dist = dist.fit_transform(X_rdd).toarray()

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local, result_dist)

            result_dist = dist.transform(X_rdd).toarray()
            assert_array_equal(result_local, result_dist)