コード例 #1
0
    def test_get_multiple_tuples(self):
        """Select several blocks of a DictRDD by slice and by index list.

        Every selection is collected and compared block by block. Selections
        written with and without a trailing ``:`` must give the same result,
        and an index list must dictate the order of the returned blocks.
        """
        features = np.arange(80).reshape((40, 2))
        labels = np.arange(40)
        features_rdd = self.sc.parallelize(features, 2)
        labels_rdd = self.sc.parallelize(labels, 2)
        z = DictRDD(features_rdd.zip(labels_rdd), bsize=5)

        # Blocks 0 and 1.
        head = [(np.arange(0, 10).reshape((5, 2)), np.arange(0, 5)),
                (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))]
        # np.s_[...] builds exactly the key object that z[...] would receive.
        for key in (np.s_[:2], np.s_[:2, :],
                    np.s_[[0, 1]], np.s_[[0, 1], :]):
            assert_multiple_tuples_equal(z[key].collect(), head)
        assert_multiple_tuples_equal(z[[1, 0]].collect(), head[::-1])

        # Blocks 5, 6 and 7, also addressed below as the last three blocks.
        tail = [(np.arange(50, 60).reshape((5, 2)), np.arange(25, 30)),
                (np.arange(60, 70).reshape((5, 2)), np.arange(30, 35)),
                (np.arange(70, 80).reshape((5, 2)), np.arange(35, 40))]
        for key in (np.s_[-3:], np.s_[-3:, :],
                    np.s_[[5, 6, 7]], np.s_[[5, 6, 7], :]):
            assert_multiple_tuples_equal(z[key].collect(), tail)
        for key in (np.s_[[7, 6, 5]], np.s_[[7, 6, 5], :]):
            assert_multiple_tuples_equal(z[key].collect(), tail[::-1])
        assert_multiple_tuples_equal(z[[5, 7, 6]].collect(),
                                     [tail[0], tail[2], tail[1]])
コード例 #2
0
    def test_check_rdd_dtype(self):
        """Exercise ``check_rdd_dtype`` on plain and dictionary RDDs.

        Covers type specifications that must match, specifications that must
        not match, and arguments that are expected to be rejected with
        ``TypeError``.
        """
        array = np.ndarray
        spmat = sp.spmatrix

        # Only the distributed halves of the fixtures are needed here.
        _, dense_rdd = self.make_dense_rdd(block_size=5)
        _, sparse_rdd = self.make_sparse_rdd(block_size=5)

        dict_rdd = DictRDD(
            (dense_rdd, sparse_rdd),
            columns=('X', 'y'),
            bsize=5
        )

        assert_true(check_rdd_dtype(dense_rdd, array))
        assert_true(check_rdd_dtype(dense_rdd, (array, spmat)))
        assert_true(check_rdd_dtype(sparse_rdd, spmat))
        assert_true(check_rdd_dtype(dict_rdd, {'X': array}))
        assert_true(check_rdd_dtype(dict_rdd, {'y': spmat}))
        assert_true(check_rdd_dtype(dict_rdd, {'X': array, 'y': spmat}))
        assert_true(
            check_rdd_dtype(dict_rdd, {'X': (array, spmat), 'y': spmat}))

        assert_false(check_rdd_dtype(dense_rdd, spmat))
        assert_false(check_rdd_dtype(sparse_rdd, (array,)))
        assert_false(check_rdd_dtype(dict_rdd, {'X': spmat}))

        # The arguments must be forwarded positionally. Wrapping them in one
        # tuple would call check_rdd_dtype with a single argument, and the
        # resulting "missing argument" TypeError would satisfy the assertion
        # without the argument validation ever running.
        assert_raises(TypeError, check_rdd_dtype, dict_rdd, (tuple,))
        assert_raises(TypeError, check_rdd_dtype, np.arange(20), (array,))
コード例 #3
0
    def test_transform_with_dtype(self):
        """Check the dtype reported after ``transform`` with a ``dtype``."""
        wide = np.arange(400).reshape((100, 4))
        narrow = np.arange(200).reshape((100, 2))
        wide_rdd = self.sc.parallelize(wide, 4)
        narrow_rdd = self.sc.parallelize(narrow, 4)

        X = DictRDD(wide_rdd.zip(narrow_rdd), bsize=5)

        # Without an explicit dtype both columns still report ndarray.
        squared = X.transform(lambda x: x**2, column=0)
        assert_equal(squared.dtype, (np.ndarray, np.ndarray))

        # Column 0 turned into tuples.
        tupled = X.transform(lambda x: tuple((x**2).tolist()),
                             column=0, dtype=tuple)
        assert_equal(tupled.dtype, (tuple, np.ndarray))
        assert_true(check_rdd_dtype(tupled, {0: tuple, 1: np.ndarray}))

        # Column 1 declared as list.
        listed = X.transform(lambda x: x**2, column=1, dtype=list)
        assert_equal(listed.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(listed, {0: np.ndarray, 1: list}))

        # Both columns at once, in natural order.
        both = X.transform(lambda a, b: (a**2, (b**0.5).tolist()),
                           column=[0, 1], dtype=(np.ndarray, list))
        assert_true(check_rdd_dtype(both, {0: np.ndarray, 1: list}))

        # Both columns at once, listed in reverse; the reported dtype is
        # still expected in column order.
        swapped = X.transform(lambda b, a: ((b**0.5).tolist(), a**2),
                              column=[1, 0], dtype=(list, np.ndarray))
        assert_equal(swapped.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(swapped, {0: np.ndarray, 1: list}))
コード例 #4
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_get_single_tuple(self):
        """Fetch single blocks of a DictRDD by positive and negative index.

        Each expected block is compared against every distinct way of
        addressing it. The original listed ``z[0]``, ``z[3]`` and ``z[7]``
        twice each, which only repeated the same Spark job.
        """
        x, y = np.arange(80).reshape((40, 2)), np.arange(40)
        x_rdd = self.sc.parallelize(x, 2)
        y_rdd = self.sc.parallelize(y, 2)
        z = DictRDD(x_rdd.zip(y_rdd), bsize=5)

        # First block: reachable through first() on the RDD itself and z[0].
        expected = np.arange(0, 10).reshape((5, 2)), np.arange(5)
        for selection in (z, z[0]):
            assert_tuple_equal(selection.first(), expected)

        # Block 3 is also addressed as z[-5].
        expected = np.arange(30, 40).reshape((5, 2)), np.arange(15, 20)
        for selection in (z[3], z[-5]):
            assert_tuple_equal(selection.first(), expected)

        # Block 7 is also addressed as z[-1].
        expected = np.arange(70, 80).reshape((5, 2)), np.arange(35, 40)
        for selection in (z[7], z[-1]):
            assert_tuple_equal(selection.first(), expected)
コード例 #5
0
    def test_get_single_tuple(self):
        """Fetch single blocks of a DictRDD by positive and negative index.

        Duplicate lookups from the original (``z[0]``, ``z[3]`` and ``z[7]``
        were each evaluated twice) are removed; every remaining entry
        addresses the expected block in a different way.
        """
        x, y = np.arange(80).reshape((40, 2)), np.arange(40)
        z_rdd = self.sc.parallelize(x, 2).zip(self.sc.parallelize(y, 2))
        z = DictRDD(z_rdd, bsize=5)

        cases = [
            # (expected block, equivalent ways of selecting it)
            ((np.arange(0, 10).reshape((5, 2)), np.arange(5)),
             lambda: (z, z[0])),
            ((np.arange(30, 40).reshape((5, 2)), np.arange(15, 20)),
             lambda: (z[3], z[-5])),
            ((np.arange(70, 80).reshape((5, 2)), np.arange(35, 40)),
             lambda: (z[7], z[-1])),
        ]
        for expected, make_selections in cases:
            # Selections are built lazily so an indexing failure is reported
            # in the case it belongs to.
            for selection in make_selections():
                assert_tuple_equal(selection.first(), expected)
コード例 #6
0
    def test_creation_from_zipped_rdd(self):
        """Build a DictRDD from an RDD that already holds zipped pairs."""
        features = np.arange(80).reshape((40, 2))
        labels = range(40)
        features_rdd = self.sc.parallelize(features, 4)
        labels_rdd = self.sc.parallelize(labels, 4)
        zipped_rdd = features_rdd.zip(labels_rdd)

        # Expected first block: the first 10 rows and their 10 labels.
        expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

        # Default construction and construction with named columns must
        # yield the same first block.
        for options in ({}, {'columns': ('x', 'y')}):
            assert_tuple_equal(DictRDD(zipped_rdd, **options).first(),
                               expected)

        # With an explicit dtype the second column must come back as a list.
        first = DictRDD(zipped_rdd, dtype=(np.ndarray, list)).first()
        assert_tuple_equal(first, expected)
        assert_is_instance(first[1], list)
コード例 #7
0
    def test_auto_dtype(self):
        """Check the dtypes a DictRDD reports when none are specified."""
        sources = (np.arange(80).reshape((40, 2)),
                   tuple(range(40)),
                   list(range(40)))
        x_rdd, y_rdd, z_rdd = (self.sc.parallelize(src, 4) for src in sources)

        expected = (np.arange(20).reshape(10, 2), tuple(range(10)),
                    list(range(10)))
        # The test asserts tuple for both the tuple and the list column.
        dtypes = (np.ndarray, tuple, tuple)

        # Same checks with positional column keys and with named columns.
        variants = (({}, (0, 1, 2)),
                    ({'columns': ('x', 'y', 'z')}, ('x', 'y', 'z')))
        for options, keys in variants:
            rdd = DictRDD([x_rdd, y_rdd, z_rdd], **options)
            assert_tuple_equal(rdd.first(), expected)
            assert_equal(rdd.dtype, dtypes)
            assert_true(check_rdd_dtype(rdd, dict(zip(keys, dtypes))))
コード例 #8
0
ファイル: testing.py プロジェクト: ziwei-SUE/sparkit-learn
    def make_regression(self, n_targets, n_samples, blocks=-1):
        """Create a regression dataset locally and as a DictRDD.

        Parameters
        ----------
        n_targets, n_samples
            Forwarded to the dataset generator.
        blocks
            Passed to ``DictRDD`` as ``bsize`` (default ``-1``).

        Returns
        -------
        tuple
            ``(X, y, Z)`` where ``X`` and ``y`` are the generated local data
            and ``Z`` is a ``DictRDD`` with columns ``'X'`` and ``'y'``.
        """
        # Inside the method body this name resolves to the module-level
        # ``make_regression`` (presumably scikit-learn's generator), not to
        # this method.
        generator_options = dict(n_targets=n_targets,
                                 n_samples=n_samples,
                                 n_features=20,
                                 n_informative=10,
                                 random_state=42)
        X, y = make_regression(**generator_options)

        distributed = [ArrayRDD(self.sc.parallelize(X)),
                       ArrayRDD(self.sc.parallelize(y))]
        return X, y, DictRDD(distributed, columns=('X', 'y'), bsize=blocks)
コード例 #9
0
    def test_transform_with_dtype(self):
        """Verify dtype bookkeeping of ``DictRDD.transform``."""
        left = self.sc.parallelize(np.arange(400).reshape((100, 4)), 4)
        right = self.sc.parallelize(np.arange(200).reshape((100, 2)), 4)
        X = DictRDD(left.zip(right), bsize=5)

        ndarray_pair = {0: np.ndarray, 1: np.ndarray}
        mixed = {0: np.ndarray, 1: list}

        # Single column, dtype left untouched.
        result = X.transform(lambda x: x ** 2, column=0)
        assert_equal(result.dtype, (ndarray_pair[0], ndarray_pair[1]))

        # Single column, first one becomes tuple.
        result = X.transform(lambda x: tuple((x ** 2).tolist()), column=0,
                             dtype=tuple)
        assert_equal(result.dtype, (tuple, np.ndarray))
        assert_true(check_rdd_dtype(result, {0: tuple, 1: np.ndarray}))

        # Single column, second one declared as list.
        result = X.transform(lambda x: x ** 2, column=1, dtype=list)
        assert_equal(result.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(result, mixed))

        # Two columns in natural order.
        result = X.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                             column=[0, 1], dtype=(np.ndarray, list))
        assert_true(check_rdd_dtype(result, mixed))

        # Two columns in reversed order; dtype is asserted in column order.
        result = X.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                             column=[1, 0], dtype=(list, np.ndarray))
        assert_equal(result.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(result, mixed))
コード例 #10
0
    def test_auto_dtype(self):
        """DictRDD built without ``dtype`` must report the asserted types."""
        x_rdd = self.sc.parallelize(np.arange(80).reshape((40, 2)), 4)
        y_rdd = self.sc.parallelize(tuple(range(40)), 4)
        z_rdd = self.sc.parallelize(list(range(40)), 4)

        first_block = (np.arange(20).reshape(10, 2), tuple(range(10)),
                       list(range(10)))
        reported = (np.ndarray, tuple, tuple)

        # Columns addressed by position.
        positional = DictRDD([x_rdd, y_rdd, z_rdd])
        assert_tuple_equal(positional.first(), first_block)
        assert_equal(positional.dtype, reported)
        assert_true(check_rdd_dtype(positional,
                                    {0: np.ndarray, 1: tuple, 2: tuple}))

        # Columns addressed by name.
        named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
        assert_tuple_equal(named.first(), first_block)
        assert_equal(named.dtype, reported)
        by_name = {'x': np.ndarray, 'y': tuple, 'z': tuple}
        assert_true(check_rdd_dtype(named, by_name))
コード例 #11
0
    def test_creation_from_blocked_rdds(self):
        """Build a DictRDD from RDDs that are already blocked."""
        x_rdd = ArrayRDD(self.sc.parallelize(np.arange(80).reshape((40, 2)),
                                             4))
        y_rdd = ArrayRDD(self.sc.parallelize(np.arange(40), 4))
        z_rdd = BlockRDD(self.sc.parallelize(list(range(40)), 4), dtype=list)

        expected = (np.arange(20).reshape(10, 2),
                    np.arange(10),
                    list(range(10)))

        # Plain construction and construction with named columns.
        for options in ({}, {'columns': ('x', 'y', 'z')}):
            rdd = DictRDD([x_rdd, y_rdd, z_rdd], **options)
            assert_tuple_equal(rdd.first(), expected)

        # Only the third dtype is given; its block must be a list.
        first = DictRDD([x_rdd, y_rdd, z_rdd],
                        dtype=(None, None, list)).first()
        assert_tuple_equal(first, expected)
        assert_is_instance(first[2], list)
コード例 #12
0
    def test_get_single_item(self):
        """Address one column of one block with a ``(block, column)`` key."""
        features = np.arange(80).reshape((40, 2))
        labels = np.arange(40)
        zipped = self.sc.parallelize(features, 2).zip(
            self.sc.parallelize(labels, 2))
        z = DictRDD(zipped, bsize=5)

        # NOTE(review): a negative column index (z[3, -1]) was disabled in
        # the original test and is still not exercised here.
        cases = [
            ((0, 0), np.arange(0, 10).reshape((5, 2))),
            ((0, 1), np.arange(5)),
            ((3, 0), np.arange(30, 40).reshape((5, 2))),
            ((3, 1), np.arange(15, 20)),
            ((7, 0), np.arange(70, 80).reshape((5, 2))),
            ((-1, 0), np.arange(70, 80).reshape((5, 2))),
            ((7, 1), np.arange(35, 40)),
        ]
        for key, expected in cases:
            # z[(i, j)] is the same lookup as z[i, j].
            assert_array_equal(z[key].first(), expected)
コード例 #13
0
    def test_get_multiple_items(self):
        """Combine block selection with column selection on a DictRDD."""
        features = np.arange(80).reshape((40, 2))
        labels = np.arange(40)
        features_rdd = self.sc.parallelize(features, 2)
        labels_rdd = self.sc.parallelize(labels, 2)
        z = DictRDD(features_rdd.zip(labels_rdd), bsize=5)

        block0 = (np.arange(0, 10).reshape((5, 2)), np.arange(0, 5))
        block1 = (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))

        # A scalar column index is compared against bare column values ...
        assert_array_equal(z[:2, 1].collect(), [block0[1], block1[1]])
        assert_array_equal(z[[0, 1], 0].collect(), [block0[0], block1[0]])

        # ... while a list or slice of columns is compared against tuples.
        labels_only = [(block0[1], ), (block1[1], )]
        assert_multiple_tuples_equal(z[[0, 1], [1]].collect(), labels_only)
        assert_multiple_tuples_equal(z[[0, 1], -1:].collect(), labels_only)

        # Reversing both the block list and the column list.
        assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(),
                                     [block1[::-1], block0[::-1]])
コード例 #14
0
ファイル: testing.py プロジェクト: ziwei-SUE/sparkit-learn
    def make_classification(self,
                            n_classes,
                            n_samples,
                            blocks=-1,
                            nonnegative=False):
        """Create a classification dataset locally and as a DictRDD.

        Parameters
        ----------
        n_classes, n_samples
            Forwarded to the dataset generator.
        blocks
            Passed to ``DictRDD`` as ``bsize`` (default ``-1``).
        nonnegative : bool
            When true, the features are replaced by their absolute values.

        Returns
        -------
        tuple
            ``(X, y, Z)`` where ``X`` and ``y`` are the local data and ``Z``
            is a ``DictRDD`` with columns ``'X'`` and ``'y'``.
        """
        # Resolves to the module-level ``make_classification`` (presumably
        # scikit-learn's generator), not to this method.
        generator_options = dict(n_classes=n_classes,
                                 n_samples=n_samples,
                                 n_features=5,
                                 n_informative=4,
                                 n_redundant=0,
                                 n_clusters_per_class=1,
                                 random_state=42)
        X, y = make_classification(**generator_options)
        X = np.abs(X) if nonnegative else X

        # Both columns are spread over 4 partitions before zipping.
        columns = [ArrayRDD(self.sc.parallelize(X, 4)),
                   ArrayRDD(self.sc.parallelize(y, 4))]
        return X, y, DictRDD(columns, columns=('X', 'y'), bsize=blocks)
コード例 #15
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_creation_from_blocked_rdds(self):
        """Build a DictRDD from RDDs that are already blocked.

        The list-typed column and its expected value are materialised with
        ``list(range(...))``. A bare ``range`` is only a list on Python 2;
        on Python 3 it is a lazy object, so the expected value would not be
        the list that the ``dtype=list`` column is asserted to hold.
        """
        x, y = np.arange(80).reshape((40, 2)), np.arange(40)
        z = list(range(40))
        x_rdd = ArrayRDD(self.sc.parallelize(x, 4))
        y_rdd = ArrayRDD(self.sc.parallelize(y, 4))
        z_rdd = BlockRDD(self.sc.parallelize(z, 4), dtype=list)

        expected = (np.arange(20).reshape(10, 2), np.arange(10),
                    list(range(10)))
        rdd = DictRDD([x_rdd, y_rdd, z_rdd])
        assert_tuple_equal(rdd.first(), expected)
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
        assert_tuple_equal(rdd.first(), expected)
        # Only the third dtype is given; its block must be a list.
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
        first = rdd.first()
        assert_tuple_equal(first, expected)
        assert_is_instance(first[2], list)
コード例 #16
0
    def test_same_variances(self):
        """Local and distributed VarianceThreshold must agree on variances."""
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [
            ((10, 5), None),
            ((1e3, 20), None),
            ((1e3, 20), 100),
            ((1e4, 100), None),
            ((1e4, 100), 600),
        ]

        # NOTE(review): neither element of each entry is passed to the
        # fixture helpers below, so every iteration appears to run on the
        # helpers' default data -- confirm whether they were meant to be
        # forwarded.
        for _shape, _block_size in shapes:
            X_dense, X_dense_rdd = self.make_dense_rdd()
            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

            # Dense input.
            local.fit(X_dense)
            dist.fit(X_dense_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            # Sparse input.
            local.fit(X_sparse)
            dist.fit(X_sparse_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            # DictRDD input: ``local`` still holds the sparse fit, and the
            # sparse RDD is the 'X' column of Z.
            dist.fit(Z)
            assert_array_almost_equal(local.variances_, dist.variances_)
コード例 #17
0
    def test_creation_from_zipped_rdd(self):
        """DictRDD accepts a pre-zipped RDD of ``(row, label)`` pairs."""
        zipped_rdd = self.sc.parallelize(
            np.arange(80).reshape((40, 2)), 4
        ).zip(self.sc.parallelize(range(40), 4))

        first_rows = np.arange(20).reshape(10, 2)
        first_labels = tuple(range(10))
        expected = (first_rows, first_labels)

        # No options.
        plain = DictRDD(zipped_rdd)
        assert_tuple_equal(plain.first(), expected)

        # Named columns.
        named = DictRDD(zipped_rdd, columns=('x', 'y'))
        assert_tuple_equal(named.first(), expected)

        # Explicit dtypes: the label block must be delivered as a list.
        typed = DictRDD(zipped_rdd, dtype=(np.ndarray, list))
        head = typed.first()
        assert_tuple_equal(head, expected)
        assert_is_instance(head[1], list)
コード例 #18
0
    def test_initialization(self):
        """DictRDD rejects non-RDD input and is a BlockRDD for every bsize.

        The original checked ``DictRDD(rdd)`` against ``BlockRDD`` three
        times. Each ``bsize`` variant is now checked against both classes.
        """
        n_partitions = 4
        n_samples = 100

        data = [(1, 2) for _ in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        # A plain list must be rejected regardless of bsize.
        assert_raises(TypeError, DictRDD, data)
        assert_raises(TypeError, DictRDD, data, bsize=False)
        assert_raises(TypeError, DictRDD, data, bsize=10)

        # A real RDD must be accepted with the default, an explicit and a
        # None block size.
        for options in ({}, {'bsize': 10}, {'bsize': None}):
            blocked = DictRDD(rdd, **options)
            assert_is_instance(blocked, DictRDD)
            assert_is_instance(blocked, BlockRDD)
コード例 #19
0
    def test_same_transform_with_treshold(self):
        """Thresholded local and distributed transforms must agree."""
        threshold = .03
        local = VarianceThreshold(threshold)
        dist = SparkVarianceThreshold(threshold)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        # Dense input: blocks must stay ndarray.
        dense_local = local.fit_transform(X_dense)
        dense_dist = dist.fit_transform(X_dense_rdd)
        assert_true(check_rdd_dtype(dense_dist, (np.ndarray, )))
        assert_array_almost_equal(dense_local, dense_dist.toarray())

        # Sparse input: blocks must stay sparse matrices.
        sparse_local = local.fit_transform(X_sparse)
        sparse_dist = dist.fit_transform(X_sparse_rdd)
        assert_true(check_rdd_dtype(sparse_dist, (sp.spmatrix, )))
        assert_array_almost_equal(sparse_local.toarray(),
                                  sparse_dist.toarray())

        # DictRDD input: its 'X' column is the sparse RDD, so the result is
        # compared with the local sparse result from above.
        dict_dist = dist.fit_transform(Z_rdd)[:, 'X']
        assert_true(check_rdd_dtype(dict_dist, (sp.spmatrix, )))
        assert_array_almost_equal(sparse_local.toarray(),
                                  dict_dist.toarray())
コード例 #20
0
ファイル: pipeline.py プロジェクト: ziwei-SUE/sparkit-learn
    def transform(self, Z):
        """Run every transformer on the input and stack their outputs.

        Parameters
        ----------
        Z : DictRDD or other distributed input
            Input data. For a ``DictRDD`` only its ``'X'`` column is handed
            to the transformers; any other input is used as is (presumably
            an ArrayRDD-like object -- confirm against callers).

        Returns
        -------
        DictRDD or RDD
            The outputs of all transformers in ``self.transformer_list``,
            zipped record by record and stacked horizontally: ``sp.hstack``
            if the first record contains a sparse item, ``np.hstack``
            otherwise. For a ``DictRDD`` input the stacked data is wrapped
            in a new ``DictRDD`` together with the ``'y'`` column of ``Z``,
            reusing its ``columns``, ``dtype`` and ``bsize``. Otherwise the
            stacked RDD itself is returned.
        """
        is_dict_rdd = isinstance(Z, DictRDD)
        X = Z[:, 'X'] if is_dict_rdd else Z

        Zs = [
            _transform_one(trans, name, X, self.transformer_weights)
            for name, trans in self.transformer_list
        ]
        # Pair up the per-transformer results record by record, then flatten
        # the nested pairs so each record lists one item per transformer.
        X_rdd = reduce(lambda x, y: x.zip(y._rdd), Zs)
        X_rdd = X_rdd.map(flatten)

        # Only the first record is inspected to pick the stacking function.
        if any(sp.issparse(item) for item in X_rdd.first()):
            mapper = sp.hstack
        else:
            mapper = np.hstack
        X_rdd = X_rdd.map(lambda x: mapper(x))

        if is_dict_rdd:
            return DictRDD([X_rdd, Z[:, 'y']],
                           columns=Z.columns,
                           dtype=Z.dtype,
                           bsize=Z.bsize)
        return X_rdd
コード例 #21
0
    def test_creation_from_rdds(self):
        """Build a DictRDD from a list of plain (unblocked) RDDs."""
        sources = (np.arange(80).reshape((40, 2)),
                   np.arange(40),
                   list(range(40)))
        x_rdd, y_rdd, z_rdd = (self.sc.parallelize(src, 4) for src in sources)

        expected = (np.arange(20).reshape(10, 2),
                    np.arange(10),
                    list(range(10)))

        # Plain construction and construction with named columns.
        for options in ({}, {'columns': ('x', 'y', 'z')}):
            rdd = DictRDD([x_rdd, y_rdd, z_rdd], **options)
            assert_tuple_equal(rdd.first(), expected)

        # Explicit dtypes: the third block must be delivered as a list.
        typed = DictRDD([x_rdd, y_rdd, z_rdd],
                        dtype=(np.ndarray, np.ndarray, list))
        first = typed.first()
        assert_tuple_equal(first, expected)
        assert_is_instance(first[2], list)
コード例 #22
0
    def test_transform(self):
        """Compare ``DictRDD.transform`` with the same operation done locally.

        The untransformed blocks are collected once and reused for every
        local reference computation. The original re-ran the identical
        ``X.collect()`` job before each of the five comparisons.
        """
        data1 = np.arange(400).reshape((100, 4))
        data2 = np.arange(200).reshape((100, 2))
        rdd1 = self.sc.parallelize(data1, 4)
        rdd2 = self.sc.parallelize(data2, 4)

        X = DictRDD(rdd1.zip(rdd2), bsize=5)
        # X itself is never modified below, so one collect is enough.
        blocks = X.collect()

        # No column given: the function receives both columns.
        X1 = [(a, b**2) for a, b in blocks]
        X2 = X.transform(lambda a, b: (a, b**2))
        assert_multiple_tuples_equal(X1, X2.collect())

        # Only the second column.
        X1 = [(a, b**2) for a, b in blocks]
        X2 = X.transform(lambda x: x**2, column=1)
        assert_multiple_tuples_equal(X1, X2.collect())

        # Only the first column.
        X1 = [(a**2, b) for a, b in blocks]
        X2 = X.transform(lambda x: x**2, column=0)
        assert_multiple_tuples_equal(X1, X2.collect())

        # Both columns, natural order.
        X1 = [(a**2, b**0.5) for a, b in blocks]
        X2 = X.transform(lambda a, b: (a**2, b**0.5), column=[0, 1])
        assert_multiple_tuples_equal(X1, X2.collect())

        # Both columns, reversed order; the result is still expected in
        # column order.
        X1 = [(a**2, b**0.5) for a, b in blocks]
        X2 = X.transform(lambda b, a: (b**0.5, a**2), column=[1, 0])
        assert_multiple_tuples_equal(X1, X2.collect())
コード例 #23
0
ファイル: spsentiment.py プロジェクト: a23554/datamining
# Load the raw reviews; header=None gives the columns integer labels, of
# which 0 and 1 are used below.
df = pd.read_csv("review.csv", header=None, encoding='latin1')
# Binarise column 0: values <= 5 become 0, everything else becomes 1.
# NOTE(review): this runs before dropna(), so a NaN in column 0 fails the
# `<= 5` comparison and is labelled 1 instead of being dropped -- confirm
# that this is intended.
df[0] = df[0].apply(lambda death: 0 if death <= 5 else 1)
df = df.dropna()
# Column 1 is the model input, the binarised column 0 is the target.
data = df[1]
target = df[0]
# NOTE(review): shadows the builtin `list` and is not used in the visible
# part of the script.
list = []
# Hold out 25% of the rows for testing, with a fixed seed.
data_train, data_test, target_train, target_test = cross_validation.train_test_split(
    data, target, test_size=0.25, random_state=43)

# Distribute the training data and wrap both columns in one DictRDD.
train_x = sc.parallelize(data_train)
train_y = sc.parallelize(target_train)
train_x = ArrayRDD(train_x)
train_y = ArrayRDD(train_y)
Z = DictRDD((train_x, train_y),
            columns=('X', 'y'),
            dtype=[np.ndarray, np.ndarray])

# Three-step distributed pipeline: vectoriser, tf-idf, naive Bayes.
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),  # hashing vectoriser, non-negative output requested
    ('tfidf', SparkTfidfTransformer()),  # tf-idf step
    ('clf', SparkMultinomialNB(alpha=0.05))  # multinomial naive Bayes, alpha=0.05
))

# Fit on the training DictRDD; the class labels 0 and 1 are passed as
# `clf__classes` (presumably routed to the 'clf' step -- confirm).
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))

# Distribute the held-out data (not used further in the visible code).
test_x = ArrayRDD(sc.parallelize(data_test))
test_y = ArrayRDD(sc.parallelize(target_test))
コード例 #24
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_transform(self):
        """Check ``DictRDD.transform`` against locally computed references.

        ``X.collect()`` is loop-invariant here (``X`` is never changed), so
        it is evaluated once instead of once per comparison as in the
        original.
        """
        data1 = np.arange(400).reshape((100, 4))
        data2 = np.arange(200).reshape((100, 2))
        rdd1 = self.sc.parallelize(data1, 4)
        rdd2 = self.sc.parallelize(data2, 4)

        X = DictRDD(rdd1.zip(rdd2), bsize=5)
        original = X.collect()

        # Function applied to whole records (no column argument).
        X1 = [(x[0], x[1] ** 2) for x in original]
        X2 = X.transform(lambda a, b: (a, b ** 2)).collect()
        assert_multiple_tuples_equal(X1, X2)

        # Function applied to column 1 only.
        X1 = [(x[0], x[1] ** 2) for x in original]
        X2 = X.transform(lambda x: x ** 2, column=1).collect()
        assert_multiple_tuples_equal(X1, X2)

        # Function applied to column 0 only.
        X1 = [(x[0] ** 2, x[1]) for x in original]
        X2 = X.transform(lambda x: x ** 2, column=0).collect()
        assert_multiple_tuples_equal(X1, X2)

        # Function applied to both columns, natural order.
        X1 = [(x[0] ** 2, x[1] ** 0.5) for x in original]
        X2 = X.transform(lambda a, b: (a ** 2, b ** 0.5), column=[0, 1])
        assert_multiple_tuples_equal(X1, X2.collect())

        # Function applied to both columns, reversed order; the result is
        # still expected in column order.
        X1 = [(x[0] ** 2, x[1] ** 0.5) for x in original]
        X2 = X.transform(lambda b, a: (b ** 0.5, a ** 2), column=[1, 0])
        assert_multiple_tuples_equal(X1, X2.collect())