Example #1
    def test_params_are_forwarded(self):
        transformer1 = T()
        transformer2 = T()
        pipe = SparkFeatureUnion([('t1', transformer1), ('t2', transformer2)])

        print(pipe.get_params(deep=True))
        expected = dict(t1__a=None,
                        t1__b=None,
                        t2__a=None,
                        t2__b=None,
                        t1=transformer1,
                        t2=transformer2,
                        **pipe.get_params(deep=False))
        print(expected)
        assert_equal(pipe.get_params(deep=True), expected)

        # Check that params are set
        pipe.set_params(t1__a=0.1)
        assert_equal(transformer1.a, 0.1)
        assert_equal(transformer1.b, None)
        assert_equal(transformer2.a, None)
        assert_equal(transformer2.b, None)

        # Check that fit params are forwarded to the transformers
        _, _, Z = self.make_classification(2, 10000, 2000)
        pipe.fit(Z, t1__a=0.2, t2__a=0.3)
        assert_equal(transformer1.fit_params, {'a': 0.2})
        assert_equal(transformer2.fit_params, {'a': 0.3})
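The dummy transformer T used in this example (and the next one) is defined elsewhere in the test module and is not shown on this page. A minimal sketch that would satisfy the assertions above, with the behaviour inferred from the test rather than copied from the original helper:

    from sklearn.base import BaseEstimator, TransformerMixin

    class T(BaseEstimator, TransformerMixin):
        def __init__(self, a=None, b=None):
            self.a = a          # surfaced as t1__a / t2__a via get_params(deep=True)
            self.b = b          # surfaced as t1__b / t2__b

        def fit(self, Z, **fit_params):
            self.fit_params = fit_params   # record what fit() forwarded
            return self

        def transform(self, Z):
            return Z                       # identity is enough for this test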
Example #2
    def test_params_are_forwarded(self):
        transformer1 = T()
        transformer2 = T()
        pipe = SparkFeatureUnion([('t1', transformer1),
                                  ('t2', transformer2)])

        print(pipe.get_params(deep=True))
        expected = dict(t1__a=None, t1__b=None,
                        t2__a=None, t2__b=None,
                        t1=transformer1, t2=transformer2,
                        **pipe.get_params(deep=False))
        print(expected)
        assert_equal(pipe.get_params(deep=True), expected)

        # Check that params are set
        pipe.set_params(t1__a=0.1)
        assert_equal(transformer1.a, 0.1)
        assert_equal(transformer1.b, None)
        assert_equal(transformer2.a, None)
        assert_equal(transformer2.b, None)

        # Check that fit params are forwarded to the transformers
        _, _, Z = self.make_classification(2, 10000, 2000)
        pipe.fit(Z, t1__a=0.2, t2__a=0.3)
        assert_equal(transformer1.fit_params, {'a': 0.2})
        assert_equal(transformer2.fit_params, {'a': 0.3})
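The t1__a / t2__a names used above follow scikit-learn's standard step__param convention, which SparkFeatureUnion inherits. A small plain-scikit-learn illustration (the estimator name "scale" is made up for the example):

    from sklearn.pipeline import FeatureUnion
    from sklearn.preprocessing import StandardScaler

    union = FeatureUnion([("scale", StandardScaler())])
    print(sorted(union.get_params(deep=True)))   # includes 'scale', 'scale__with_mean', ...
    union.set_params(scale__with_mean=False)     # routed to the 'scale' step
    assert union.get_params()["scale"].with_mean is False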
Example #3
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
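The snippets on this page omit their import blocks. Under the usual sparkit-learn (splearn) layout they would look roughly like the following; the exact module paths and the origin of assert_equal are assumptions, not copied from the original test files:

    import scipy.sparse as sp
    from numpy.testing import assert_array_equal
    from nose.tools import assert_equal           # or the test base class' helper

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import FeatureUnion

    from splearn.feature_extraction.text import SparkCountVectorizer
    from splearn.pipeline import SparkFeatureUnion
    from splearn.rdd import ArrayRDD, DictRDD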
Example #4
    def test_same_result_weight(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word)],
                                 transformer_weights={"words": 10})
        dist_union = SparkFeatureUnion([("chars", dist_char),
                                        ("words", dist_word)],
                                       transformer_weights={"words": 10})

        loc_union.fit(X)
        dist_union.fit(Z)

        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example #5
    def test_same_result_weight(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], transformer_weights={"words": 10})
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], transformer_weights={"words": 10})

        loc_union.fit(X)
        dist_union.fit(Z)

        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
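Both weight examples rely on the fact that transformer_weights simply scales the corresponding transformer's output block before the horizontal stack. A standalone plain-scikit-learn sketch of that behaviour (documents and names invented for the illustration):

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import FeatureUnion

    docs = ["spark makes feature unions distributed",
            "feature unions stack transformer outputs"]
    union = FeatureUnion([("words", CountVectorizer(analyzer="word"))],
                         transformer_weights={"words": 10})
    weighted = union.fit_transform(docs).toarray()
    plain = CountVectorizer(analyzer="word").fit_transform(docs).toarray()
    assert (weighted == 10 * plain).all()   # every count is scaled by its weight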
Example #6
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word)])
        dist_union = SparkFeatureUnion([("chars", dist_char),
                                        ("words", dist_word)])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(loc_union.get_feature_names(),
                     dist_union.get_feature_names())
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([("chars", loc_char),
                                      ("words", loc_word)],
                                     n_jobs=2)
        dist_union_par = SparkFeatureUnion([("chars", dist_char),
                                            ("words", dist_word)],
                                           n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
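Outside the test harness, the same pipeline can be driven from any text RDD blocked into an ArrayRDD. A hedged usage sketch, assuming an existing SparkContext sc and the splearn imports shown earlier; the documents and bsize are illustrative:

    import scipy.sparse as sp
    from splearn.feature_extraction.text import SparkCountVectorizer
    from splearn.pipeline import SparkFeatureUnion
    from splearn.rdd import ArrayRDD

    docs = ["first document", "second document", "third one here", "and a fourth"]
    Z = ArrayRDD(sc.parallelize(docs, 2), bsize=2)   # block the raw RDD

    dist_union = SparkFeatureUnion([
        ("chars", SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))),
        ("words", SparkCountVectorizer(analyzer="word")),
    ])
    dist_union.fit(Z)
    blocks = dist_union.transform(Z).collect()       # one sparse block per partition
    features = sp.vstack(blocks)                     # matches a local FeatureUnion's output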
Example #7
    def test_same_result_withdictrdd(self):
        X, X_rdd = self.make_text_rdd(2)
        Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
        Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb",
                                         ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        loc_word_2 = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")
        dist_word_2 = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word),
                                  ("words2", loc_word_2)])
        dist_union = SparkFeatureUnion([("chars", dist_char),
                                        ("words", dist_word),
                                        ("words2", dist_word_2)])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        converted_union = dist_union.to_scikit()

        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names(),
            converted_union.get_feature_names(),
        )

        # test same results
        Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
        assert_array_equal(
            loc_union.transform(X).toarray(), Z_transformed.toarray())
        assert_array_equal(
            loc_union.transform(X).toarray(),
            converted_union.transform(X).toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        X_converted_transformed = converted_union.fit_transform(X)
        Z_transformed = sp.vstack(
            dist_union.fit_transform(Z)[:, 'X'].collect())

        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           X_converted_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([("chars", loc_char),
                                      ("words", loc_word)],
                                     n_jobs=2)
        dist_union_par = SparkFeatureUnion([("chars", dist_char),
                                            ("words", dist_word)],
                                           n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        converted_union = dist_union_par.to_scikit()
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(
            dist_union_par.transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           converted_union.transform(X).toarray())
Example #8
    def test_same_result_withdictrdd(self):
        X, X_rdd = self.make_text_rdd(2)
        Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
        Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        loc_word_2 = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")
        dist_word_2 = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word),
            ("words2", loc_word_2)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word),
            ("words2", dist_word_2)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        converted_union = dist_union.to_scikit()

        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names(),
            converted_union.get_feature_names(),
        )

        # test same results
        Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
        assert_array_equal(loc_union.transform(X).toarray(), Z_transformed.toarray())
        assert_array_equal(loc_union.transform(X).toarray(),
                           converted_union.transform(X).toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        X_converted_transformed = converted_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z)[:, 'X'].collect())

        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           X_converted_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        converted_union = dist_union_par.to_scikit()
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           converted_union.transform(X).toarray())
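For completeness, a hedged sketch of the DictRDD pattern exercised in the last two examples: pair the text blocks with a (here empty) label column, fit on the DictRDD, select the transformed 'X' column, and export a fitted plain scikit-learn union with to_scikit(). The SparkContext sc is assumed; everything else mirrors the tests above:

    import scipy.sparse as sp
    from splearn.feature_extraction.text import SparkCountVectorizer
    from splearn.pipeline import SparkFeatureUnion
    from splearn.rdd import ArrayRDD, DictRDD

    docs = ["first document", "second document", "third one here", "and a fourth"]
    X_rdd = ArrayRDD(sc.parallelize(docs, 2), bsize=2)
    y_rdd = ArrayRDD(sc.parallelize([None] * len(docs), 2), bsize=2)
    Z = DictRDD([X_rdd, y_rdd], columns=("X", "y"), bsize=2)

    dist_union = SparkFeatureUnion([("words", SparkCountVectorizer(analyzer="word"))])
    dist_union.fit(Z)                                             # accepts the DictRDD directly
    Z_out = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())  # transformed 'X' column
    local_union = dist_union.to_scikit()                          # plain sklearn FeatureUnion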